From 72191cbb166fe11c92908be84a5d96eb00c3bfa6 Mon Sep 17 00:00:00 2001 From: ROMANGOLIENKO Date: Wed, 31 Jan 2024 15:42:53 +0300 Subject: [PATCH] fixing Scrapper --- app/Services/PageScrapper.php | 63 ++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 24 deletions(-) diff --git a/app/Services/PageScrapper.php b/app/Services/PageScrapper.php index 46b4880..bc479ad 100644 --- a/app/Services/PageScrapper.php +++ b/app/Services/PageScrapper.php @@ -93,41 +93,56 @@ class PageScrapper $links = $content->find('a[href]'); $srclinks = $content->find('img[src]'); - $html0 = $content->html(); + + preg_match_all('//isU', $html0, $arr); + foreach ($arr[1] as $el) { + $html0 = str_replace($el, urldecode($el), $html0); + } + preg_match_all('//isU', $html0, $arr2); + foreach ($arr2[1] as $el) { + $html0 = str_replace($el, urldecode($el), $html0); + } + foreach ($links as $k => $link) { $href = $link->attr('href'); - if (!str_contains($link->attr('href'), "https://")) { - - - $tmp = explode('/', rawurldecode($href)); - foreach ( $tmp as $k => $v) { - $tmp[$k] = rawurlencode($v); - } - $href = implode('/', $tmp); - - - $html0 = str_replace($href, 'https://mkgtu.ru' . $href, $html0); + $unchanged = $link->attr('href'); + $changed = $link->href = 'https://mkgtu.ru' . $href; + $html0 = str_replace(urldecode($unchanged), $changed, $html0); } } + + + + + + foreach ($srclinks as $k => $srclink) { $src = $srclink->attr('src'); - if (!str_contains($srclink->attr('src'), "https://")) { - - - $tmp = explode('/', rawurldecode($src)); - foreach ( $tmp as $k => $v) { - $tmp[$k] = rawurlencode($v); - } - $src = implode('/', $tmp); - - - $html0 = str_replace($src, 'https://mkgtu.ru' . $src, $html0); + $unchanged = $srclink->attr('src'); + $changed = $srclink->src = 'https://mkgtu.ru' . $src; + $html0 = str_replace(urldecode($unchanged), $changed, $html0); } } - str_replace('st yle', 'style', $html0); +// foreach ($srclinks as $k => $srclink) { +// $src = $srclink->attr('src'); +// +// if (!str_contains($srclink->attr('src'), "https://")) { +// +// +// $tmp = explode('/', rawurldecode($src)); +// foreach ( $tmp as $k => $v) { +// $tmp[$k] = rawurlencode($v); +// } +// $src = implode('/', $tmp); +// +// +// $html0 = str_replace($src, 'https://mkgtu.ru' . $src, $html0); +// } +// } +// str_replace('st yle', 'style', $html0); return $html0; }