Compare commits

...

2 Commits

Author SHA1 Message Date
aslan bb1f5916e4 Merge pull request 'fixing Scrapper' (#12) from RomanGolienko/Roman_applicant-site:main into main
Tests & Lint & Deploy to Railway / deploy (8.1) (push) Blocked by required conditions Details
Tests & Lint & Deploy to Railway / build (20.x, 8.2) (push) Has been cancelled Details
Reviewed-on: http://172.17.254.104/aslan/applicant-site/pulls/12
2024-01-31 15:44:04 +03:00
ROMANGOLIENKO 72191cbb16 fixing Scrapper
Tests & Lint & Deploy to Railway / build (20.x, 8.2) (pull_request) Failing after 1m35s Details
Tests & Lint & Deploy to Railway / deploy (8.1) (pull_request) Has been skipped Details
2024-01-31 15:42:53 +03:00
1 changed file with 39 additions and 24 deletions

View File

@ -93,41 +93,56 @@ class PageScrapper
$links = $content->find('a[href]'); $links = $content->find('a[href]');
$srclinks = $content->find('img[src]'); $srclinks = $content->find('img[src]');
$html0 = $content->html(); $html0 = $content->html();
preg_match_all('/<a href="(.*)">/isU', $html0, $arr);
foreach ($arr[1] as $el) {
$html0 = str_replace($el, urldecode($el), $html0);
}
preg_match_all('/<img src="(.*)">/isU', $html0, $arr2);
foreach ($arr2[1] as $el) {
$html0 = str_replace($el, urldecode($el), $html0);
}
foreach ($links as $k => $link) { foreach ($links as $k => $link) {
$href = $link->attr('href'); $href = $link->attr('href');
if (!str_contains($link->attr('href'), "https://")) { if (!str_contains($link->attr('href'), "https://")) {
$unchanged = $link->attr('href');
$changed = $link->href = 'https://mkgtu.ru' . $href;
$tmp = explode('/', rawurldecode($href)); $html0 = str_replace(urldecode($unchanged), $changed, $html0);
foreach ( $tmp as $k => $v) {
$tmp[$k] = rawurlencode($v);
}
$href = implode('/', $tmp);
$html0 = str_replace($href, 'https://mkgtu.ru' . $href, $html0);
} }
} }
foreach ($srclinks as $k => $srclink) { foreach ($srclinks as $k => $srclink) {
$src = $srclink->attr('src'); $src = $srclink->attr('src');
if (!str_contains($srclink->attr('src'), "https://")) { if (!str_contains($srclink->attr('src'), "https://")) {
$unchanged = $srclink->attr('src');
$changed = $srclink->src = 'https://mkgtu.ru' . $src;
$tmp = explode('/', rawurldecode($src)); $html0 = str_replace(urldecode($unchanged), $changed, $html0);
foreach ( $tmp as $k => $v) {
$tmp[$k] = rawurlencode($v);
}
$src = implode('/', $tmp);
$html0 = str_replace($src, 'https://mkgtu.ru' . $src, $html0);
} }
} }
str_replace('st yle', 'style', $html0); // foreach ($srclinks as $k => $srclink) {
// $src = $srclink->attr('src');
//
// if (!str_contains($srclink->attr('src'), "https://")) {
//
//
// $tmp = explode('/', rawurldecode($src));
// foreach ( $tmp as $k => $v) {
// $tmp[$k] = rawurlencode($v);
// }
// $src = implode('/', $tmp);
//
//
// $html0 = str_replace($src, 'https://mkgtu.ru' . $src, $html0);
// }
// }
// str_replace('st yle', 'style', $html0);
return $html0; return $html0;
} }