fixing Scrapper
This commit is contained in:
parent
7a585aa3cd
commit
72191cbb16
|
@ -93,41 +93,56 @@ class PageScrapper
|
||||||
$links = $content->find('a[href]');
|
$links = $content->find('a[href]');
|
||||||
$srclinks = $content->find('img[src]');
|
$srclinks = $content->find('img[src]');
|
||||||
|
|
||||||
|
|
||||||
$html0 = $content->html();
|
$html0 = $content->html();
|
||||||
|
|
||||||
|
preg_match_all('/<a href="(.*)">/isU', $html0, $arr);
|
||||||
|
foreach ($arr[1] as $el) {
|
||||||
|
$html0 = str_replace($el, urldecode($el), $html0);
|
||||||
|
}
|
||||||
|
preg_match_all('/<img src="(.*)">/isU', $html0, $arr2);
|
||||||
|
foreach ($arr2[1] as $el) {
|
||||||
|
$html0 = str_replace($el, urldecode($el), $html0);
|
||||||
|
}
|
||||||
|
|
||||||
foreach ($links as $k => $link) {
|
foreach ($links as $k => $link) {
|
||||||
$href = $link->attr('href');
|
$href = $link->attr('href');
|
||||||
|
|
||||||
if (!str_contains($link->attr('href'), "https://")) {
|
if (!str_contains($link->attr('href'), "https://")) {
|
||||||
|
$unchanged = $link->attr('href');
|
||||||
|
$changed = $link->href = 'https://mkgtu.ru' . $href;
|
||||||
$tmp = explode('/', rawurldecode($href));
|
$html0 = str_replace(urldecode($unchanged), $changed, $html0);
|
||||||
foreach ( $tmp as $k => $v) {
|
|
||||||
$tmp[$k] = rawurlencode($v);
|
|
||||||
}
|
|
||||||
$href = implode('/', $tmp);
|
|
||||||
|
|
||||||
|
|
||||||
$html0 = str_replace($href, 'https://mkgtu.ru' . $href, $html0);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
foreach ($srclinks as $k => $srclink) {
|
foreach ($srclinks as $k => $srclink) {
|
||||||
$src = $srclink->attr('src');
|
$src = $srclink->attr('src');
|
||||||
|
|
||||||
if (!str_contains($srclink->attr('src'), "https://")) {
|
if (!str_contains($srclink->attr('src'), "https://")) {
|
||||||
|
$unchanged = $srclink->attr('src');
|
||||||
|
$changed = $srclink->src = 'https://mkgtu.ru' . $src;
|
||||||
$tmp = explode('/', rawurldecode($src));
|
$html0 = str_replace(urldecode($unchanged), $changed, $html0);
|
||||||
foreach ( $tmp as $k => $v) {
|
|
||||||
$tmp[$k] = rawurlencode($v);
|
|
||||||
}
|
|
||||||
$src = implode('/', $tmp);
|
|
||||||
|
|
||||||
|
|
||||||
$html0 = str_replace($src, 'https://mkgtu.ru' . $src, $html0);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
str_replace('st yle', 'style', $html0);
|
// foreach ($srclinks as $k => $srclink) {
|
||||||
|
// $src = $srclink->attr('src');
|
||||||
|
//
|
||||||
|
// if (!str_contains($srclink->attr('src'), "https://")) {
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// $tmp = explode('/', rawurldecode($src));
|
||||||
|
// foreach ( $tmp as $k => $v) {
|
||||||
|
// $tmp[$k] = rawurlencode($v);
|
||||||
|
// }
|
||||||
|
// $src = implode('/', $tmp);
|
||||||
|
//
|
||||||
|
//
|
||||||
|
// $html0 = str_replace($src, 'https://mkgtu.ru' . $src, $html0);
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// str_replace('st yle', 'style', $html0);
|
||||||
|
|
||||||
return $html0;
|
return $html0;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue