2024-01-22 17:08:45 +03:00
|
|
|
<?php
|
|
|
|
|
|
|
|
namespace App\Services;
|
|
|
|
|
2024-01-26 14:45:33 +03:00
|
|
|
use DiDom\Document;
|
|
|
|
|
|
|
|
/*
|
2024-01-22 17:08:45 +03:00
|
|
|
class PageScrapper
|
|
|
|
{
|
|
|
|
private string $url;
|
|
|
|
private string $contentMarker;
|
|
|
|
|
|
|
|
public function __construct($url, $contentMarker = '<div class=["\']content_info["\']>')
|
|
|
|
{
|
|
|
|
$this->url = $url;
|
|
|
|
$this->contentMarker = $contentMarker;
|
|
|
|
}
|
|
|
|
|
|
|
|
public function getHTML()
|
|
|
|
{
|
2024-01-23 15:45:52 +03:00
|
|
|
|
2024-01-22 17:08:45 +03:00
|
|
|
$page = file_get_contents($this->url);
|
2024-01-23 15:45:52 +03:00
|
|
|
$strForPregMatch = "/" . "{$this->contentMarker}" . "(.*)<\/div>/is";
|
|
|
|
|
2024-01-22 17:08:45 +03:00
|
|
|
$arr = [];
|
|
|
|
$rez = preg_match_all($strForPregMatch, $page, $arr);
|
2024-01-23 15:45:52 +03:00
|
|
|
|
2024-01-26 16:05:28 +03:00
|
|
|
return $content = $arr[0][0];
|
2024-01-22 17:08:45 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
public function normalizeURLFile($content)
|
|
|
|
{
|
2024-01-24 13:30:27 +03:00
|
|
|
|
2024-01-25 08:59:34 +03:00
|
|
|
$rez = preg_match_all('/<a href="(.*)">/isU', $content, $arr);
|
2024-01-24 13:30:27 +03:00
|
|
|
$arr[1] = array_unique($arr[1]);
|
2024-01-22 17:08:45 +03:00
|
|
|
foreach ($arr[1] as $el) {
|
2024-01-25 08:59:34 +03:00
|
|
|
if (!str_starts_with($el, 'https')) {
|
|
|
|
$content = str_replace($el, 'https://mkgtu.ru' . $el, $content);
|
2024-01-22 17:08:45 +03:00
|
|
|
}
|
|
|
|
}
|
2024-01-25 12:57:38 +03:00
|
|
|
$rez = preg_match_all('/src="(.*)">/isU', $content, $arr);
|
|
|
|
$arr[1] = array_unique($arr[1]);
|
|
|
|
foreach ($arr[1] as $el) {
|
2024-01-29 09:44:53 +03:00
|
|
|
if (!str_starts_with($el, 'https') && str_contains($el, 'upload')) {
|
2024-01-25 12:57:38 +03:00
|
|
|
$content = str_replace($el, 'https://mkgtu.ru' . $el, $content);
|
|
|
|
}
|
|
|
|
}
|
2024-01-24 15:23:23 +03:00
|
|
|
|
2024-01-22 17:08:45 +03:00
|
|
|
return $content;
|
2024-01-24 13:30:27 +03:00
|
|
|
}
|
2024-01-25 08:59:34 +03:00
|
|
|
public function cutHTML($content, $strForScissors)
|
2024-01-24 13:30:27 +03:00
|
|
|
{
|
|
|
|
|
|
|
|
|
|
|
|
$arr = [];
|
2024-01-25 14:01:05 +03:00
|
|
|
//<footer(.*)<\/footer>
|
|
|
|
//safdsaf sdfdasf<footer>--------------------------------fsdfdasf <\/footer> asdfdasf asdf
|
2024-01-24 13:30:27 +03:00
|
|
|
$rez = preg_match_all($strForScissors, $content, $arr);
|
2024-01-25 14:01:05 +03:00
|
|
|
//$arr[1][0] = '>--------------------------------fsdfdasf ';
|
|
|
|
$content = str_replace($arr[0], '', $content);
|
|
|
|
//safdsaf sdfdasf<footer<\/footer> asdfdasf asdf
|
2024-01-24 13:30:27 +03:00
|
|
|
|
|
|
|
return $content;
|
2024-01-22 17:08:45 +03:00
|
|
|
}
|
|
|
|
}
|
2024-01-26 14:45:33 +03:00
|
|
|
*/
|
|
|
|
|
|
|
|
/**
 * Loads one page into a DiDom document and exposes the fragment selected by
 * a content marker, optionally with its link and image URLs made absolute.
 *
 * The page is loaded once, in the constructor. normalizePath() modifies the
 * loaded document in place, so calls made after it observe the rewritten URLs.
 */
class PageScrapper
{
    /** Origin that non-absolute link and image URLs are prefixed with. */
    private const BASE_URL = 'https://mkgtu.ru';

    /** Location of the page, handed to DiDom unchanged. */
    private string $url;

    /** Selector passed to Document::first() to locate the content element. */
    private string $contentMarker;

    /**
     * Parsed page.
     *
     * Declared explicitly: it used to be created as a dynamic property, which
     * is deprecated as of PHP 8.2.
     */
    private Document $document;

    /**
     * @param string $url           Location of the page to load.
     * @param string $contentMarker Selector of the element that holds the content.
     */
    public function __construct($url, $contentMarker)
    {
        $this->url = $url;
        $this->contentMarker = $contentMarker;

        // The second argument tells DiDom to load $url as a file/URL instead
        // of parsing the string itself as markup.
        $this->document = new Document($this->url, true);
    }

    /**
     * Returns the whole loaded page.
     *
     * Despite the name this is the DiDom document object, not an HTML string.
     *
     * @return Document
     */
    public function getFullHTML()
    {
        return $this->document;
    }

    /**
     * Returns the HTML of the first element matching the content marker.
     *
     * @return string
     *
     * @throws \RuntimeException When the marker matches nothing on the page.
     */
    public function printHTML()
    {
        return $this->contentElement()->html();
    }

    /**
     * Makes every `a[href]` and `img[src]` inside the content element absolute
     * and returns the resulting HTML of that element.
     *
     * The attributes are rewritten on the DOM nodes themselves and the element
     * is serialized once afterwards. Each URL is therefore rewritten exactly
     * once, no matter how often it occurs or whether one URL is a substring
     * of another.
     *
     * Side effect: the loaded document is modified, so a later printHTML()
     * returns the rewritten URLs as well.
     *
     * NOTE(review): URLs are emitted exactly as the DOM serializer writes them.
     * The previous implementation URL-decoded some of them while patching an
     * HTML snapshot; confirm that no consumer depends on the decoded form.
     *
     * @return string
     *
     * @throws \RuntimeException When the marker matches nothing on the page.
     */
    public function normalizePath()
    {
        $content = $this->contentElement();

        foreach ($content->find('a[href]') as $link) {
            $this->absolutizeAttribute($link, 'href');
        }

        foreach ($content->find('img[src]') as $image) {
            $this->absolutizeAttribute($image, 'src');
        }

        return $content->html();
    }

    /**
     * Looks up the content element and fails loudly when it is missing.
     *
     * @return mixed The node returned by Document::first(); presumably a
     *               DiDom element for a selector that matches a tag.
     *
     * @throws \RuntimeException When the marker matches nothing on the page.
     */
    private function contentElement()
    {
        $element = $this->document->first($this->contentMarker);

        if ($element === null) {
            throw new \RuntimeException(
                "Content marker '{$this->contentMarker}' matched nothing in '{$this->url}'."
            );
        }

        return $element;
    }

    /**
     * Rewrites one URL attribute of an element to its absolute form.
     *
     * @param mixed  $element   Node exposing attr() and setAttribute().
     * @param string $attribute Attribute name, e.g. 'href' or 'src'.
     */
    private function absolutizeAttribute($element, string $attribute): void
    {
        $current = (string) $element->attr($attribute);
        $absolute = self::toAbsoluteUrl($current);

        // Leave the node untouched when there is nothing to change.
        if ($absolute !== $current) {
            $element->setAttribute($attribute, $absolute);
        }
    }

    /**
     * Prefixes a site-relative URL with BASE_URL.
     *
     * Returned unchanged: URLs that already carry a scheme (http:, https:,
     * mailto:, tel:, data:, ...), protocol-relative URLs (//host/...) and
     * in-page anchors (#id). Prefixing those would corrupt them.
     *
     * A path without a leading slash gets one inserted so that it cannot fuse
     * with the host name. It is resolved against the site root, not against
     * the directory of the scraped page.
     */
    private static function toAbsoluteUrl(string $url): string
    {
        $hasScheme = preg_match('/^[a-z][a-z0-9+.\-]*:/i', $url) === 1;

        if ($hasScheme || str_starts_with($url, '//') || str_starts_with($url, '#')) {
            return $url;
        }

        if (str_starts_with($url, '/')) {
            return self::BASE_URL . $url;
        }

        return self::BASE_URL . '/' . $url;
    }
}
|