applicant-site/app/Services/PageScrapper.php

135 lines
3.5 KiB
PHP

<?php
namespace App\Services;
use DiDom\Document;
/*
class PageScrapper
{
private string $url;
private string $contentMarker;
public function __construct($url, $contentMarker = '<div class=["\']content_info["\']>')
{
$this->url = $url;
$this->contentMarker = $contentMarker;
}
public function getHTML()
{
$page = file_get_contents($this->url);
$strForPregMatch = "/" . "{$this->contentMarker}" . "(.*)<\/div>/is";
$arr = [];
$rez = preg_match_all($strForPregMatch, $page, $arr);
return $content = $arr[0][0];
}
public function normalizeURLFile($content)
{
$rez = preg_match_all('/<a href="(.*)">/isU', $content, $arr);
$arr[1] = array_unique($arr[1]);
foreach ($arr[1] as $el) {
if (!str_starts_with($el, 'https')) {
$content = str_replace($el, 'https://mkgtu.ru' . $el, $content);
}
}
$rez = preg_match_all('/src="(.*)">/isU', $content, $arr);
$arr[1] = array_unique($arr[1]);
foreach ($arr[1] as $el) {
if (!str_starts_with($el, 'https') && str_contains($el, 'upload')) {
$content = str_replace($el, 'https://mkgtu.ru' . $el, $content);
}
}
return $content;
}
public function cutHTML($content, $strForScissors)
{
$arr = [];
//<footer(.*)<\/footer>
//safdsaf sdfdasf<footer>--------------------------------fsdfdasf <\/footer> asdfdasf asdf
$rez = preg_match_all($strForScissors, $content, $arr);
//$arr[1][0] = '>--------------------------------fsdfdasf ';
$content = str_replace($arr[0], '', $content);
//safdsaf sdfdasf<footer<\/footer> asdfdasf asdf
return $content;
}
}
*/
/**
 * Loads a remote HTML page and exposes one fragment of it.
 *
 * The page is fetched once, in the constructor, through DiDom. The fragment is
 * the first element matching the selector passed as $contentMarker.
 */
class PageScrapper
{
    /** Origin prepended to site-relative links when the caller supplies none. */
    private const DEFAULT_BASE_URL = 'https://mkgtu.ru';

    private string $url;
    private string $contentMarker;
    private string $baseUrl;

    /** Parsed page; declared explicitly because dynamic properties are deprecated since PHP 8.2. */
    private Document $document;

    /**
     * @param string $url           Address of the page to load.
     * @param string $contentMarker Selector of the element that holds the content.
     * @param string $baseUrl       Origin used to make relative links absolute.
     */
    public function __construct(string $url, string $contentMarker, string $baseUrl = self::DEFAULT_BASE_URL)
    {
        $this->url = $url;
        $this->contentMarker = $contentMarker;
        // Normalised without a trailing slash so joining never yields "//".
        $this->baseUrl = rtrim($baseUrl, '/');
        // Second argument `true`: treat $url as a location to load rather than as an HTML string.
        $this->document = new Document($this->url, true);
    }

    /**
     * Returns the parsed document for the whole page (a DiDom object, not a string).
     */
    public function getFullHTML(): Document
    {
        return $this->document;
    }

    /**
     * Returns the HTML of the content element exactly as parsed, links untouched.
     *
     * @throws \RuntimeException When no element matches the content marker.
     */
    public function printHTML(): string
    {
        return $this->contentElement()->html();
    }

    /**
     * Returns the HTML of the content element with relative `a[href]` and
     * `img[src]` targets rewritten to absolute URLs on the configured origin.
     *
     * Attributes are rewritten on the DOM nodes rather than by searching the
     * serialised HTML, so a link that occurs several times is prefixed exactly
     * once and text outside the attribute is never touched. No manual
     * percent-encoding is performed; serialisation is left to DiDom.
     *
     * @throws \RuntimeException When no element matches the content marker.
     */
    public function normalizePath(): string
    {
        // Work on a detached copy so the cached page (and printHTML()) stay unmodified.
        $fragment = $this->contentElement()->toDocument();

        $targets = ['a[href]' => 'href', 'img[src]' => 'src'];
        foreach ($targets as $selector => $attribute) {
            foreach ($fragment->find($selector) as $node) {
                $absolute = $this->absoluteUrl((string) $node->attr($attribute));
                if ($absolute !== null) {
                    $node->attr($attribute, $absolute);
                }
            }
        }

        // Repair the broken "st yle" attribute name; the result must be returned,
        // str_replace() does not modify its subject in place.
        return str_replace('st yle', 'style', $fragment->html());
    }

    /**
     * Finds the content element or fails with a descriptive error.
     *
     * @return mixed The first node matching the content marker.
     *
     * @throws \RuntimeException When the selector matches nothing.
     */
    private function contentElement()
    {
        $element = $this->document->first($this->contentMarker);
        if ($element === null) {
            throw new \RuntimeException(
                sprintf('No element matches "%s" on %s', $this->contentMarker, $this->url)
            );
        }

        return $element;
    }

    /**
     * Builds the absolute form of a link, or returns null when the link must
     * be left as it is.
     *
     * Left alone: empty values, in-page anchors, protocol-relative URLs and
     * anything that already carries a scheme (https:, http:, mailto:, tel:,
     * data:, ...). Everything else is joined to the base origin; links without
     * a leading slash are treated as relative to the site root.
     */
    private function absoluteUrl(string $link): ?string
    {
        $link = trim($link);
        if ($link === '' || $link[0] === '#' || str_starts_with($link, '//')) {
            return null;
        }
        if (preg_match('/^[a-z][a-z0-9+.\-]*:/i', $link) === 1) {
            return null;
        }

        return $this->baseUrl . '/' . ltrim($link, '/');
    }
}