Roman_applicant-site/app/Services/PageScrapper.php

72 lines
2.0 KiB
PHP
Raw Permalink Normal View History

2024-01-22 17:08:45 +03:00
<?php
namespace App\Services;
2024-01-26 14:45:33 +03:00
use DiDom\Document;
/**
 * Loads a page with DiDom and exposes the fragment selected by a content
 * marker, either verbatim or with its link and image URLs made absolute.
 */
class PageScrapper
{
    /** Site root used to absolutize URLs when the caller does not supply one. */
    private const DEFAULT_BASE_URL = 'https://mkgtu.ru';

    private string $url;
    private string $contentMarker;
    private string $baseUrl;
    private Document $document;

    /**
     * @param string $url           Location handed to DiDom for loading.
     * @param string $contentMarker Selector of the element holding the content.
     * @param string $baseUrl       Site root prepended to relative URLs.
     */
    public function __construct(
        string $url,
        string $contentMarker,
        string $baseUrl = self::DEFAULT_BASE_URL
    ) {
        $this->url = $url;
        $this->contentMarker = $contentMarker;
        // A trailing slash would otherwise be doubled when a root-relative path is appended.
        $this->baseUrl = rtrim($baseUrl, '/');
        // Second argument `true`: DiDom loads the first argument as a file/URL instead of parsing it as markup.
        $this->document = new Document($this->url, true);
    }

    /**
     * Returns the whole parsed document.
     */
    public function getFullHTML(): Document
    {
        return $this->document;
    }

    /**
     * Returns the HTML of the content element as currently held in the document.
     *
     * Despite the name, nothing is printed; the markup is returned to the caller.
     *
     * @throws \RuntimeException When no element matches the content marker.
     */
    public function printHTML(): string
    {
        return $this->contentElement()->html();
    }

    /**
     * Returns the content HTML with every `a[href]` and `img[src]` made absolute.
     *
     * The attributes are rewritten on the parsed elements themselves, so the
     * document held by this object is updated as well, and only the targeted
     * attributes change (a textual search-and-replace over the serialized
     * markup would also hit identical text elsewhere and double-prefix
     * repeated links).
     *
     * @throws \RuntimeException When no element matches the content marker.
     */
    public function normalizePath(): string
    {
        $content = $this->contentElement();

        foreach ($content->find('a[href]') as $link) {
            $link->attr('href', self::toAbsoluteUrl((string) $link->attr('href'), $this->baseUrl));
        }

        foreach ($content->find('img[src]') as $image) {
            $image->attr('src', self::toAbsoluteUrl((string) $image->attr('src'), $this->baseUrl));
        }

        return self::decodeUrlAttributes($content->html());
    }

    /**
     * Finds the element selected by the content marker.
     *
     * @return \DiDom\Element
     *
     * @throws \RuntimeException When no element matches the content marker.
     */
    private function contentElement()
    {
        $element = $this->document->first($this->contentMarker);

        if ($element === null) {
            throw new \RuntimeException(
                sprintf('No element matching "%s" was found on %s.', $this->contentMarker, $this->url)
            );
        }

        return $element;
    }

    /**
     * Turns a URL taken from an attribute into an absolute one.
     *
     * NOTE(review): document-relative paths ("images/a.png") are resolved
     * against the site root, not against the directory of the loaded page.
     */
    private static function toAbsoluteUrl(string $url, string $baseUrl): string
    {
        $url = trim($url);

        // Anything with a scheme is already absolute (http:, https:) or is not a path at all (mailto:, tel:, data:).
        if (preg_match('/^[a-z][a-z0-9+.\-]*:/i', $url) === 1) {
            return $url;
        }

        // A bare fragment points inside the current document and has no path to prefix.
        if (str_starts_with($url, '#')) {
            return $url;
        }

        // Protocol-relative URLs already name their host; they only lack a scheme.
        if (str_starts_with($url, '//')) {
            $scheme = parse_url($baseUrl, PHP_URL_SCHEME);

            return (is_string($scheme) && $scheme !== '' ? $scheme : 'https') . ':' . $url;
        }

        if (str_starts_with($url, '/')) {
            return $baseUrl . $url;
        }

        return $baseUrl . '/' . $url;
    }

    /**
     * Percent-decodes the href/src values of <a> and <img> tags in serialized HTML.
     *
     * Presumably the serializer percent-encodes non-ASCII URL characters and the
     * decoded form is what consumers of this class expect — TODO confirm.
     */
    private static function decodeUrlAttributes(string $html): string
    {
        $decoded = preg_replace_callback(
            '/(<(?:a|img)\b[^>]*?\s(?:href|src)=")([^"]*)"/i',
            static function (array $match): string {
                // rawurldecode, unlike urldecode, leaves a literal "+" in a path untouched.
                $value = rawurldecode($match[2]);

                // Keep the encoded form if decoding did not produce valid UTF-8.
                if (preg_match('//u', $value) !== 1) {
                    return $match[0];
                }

                // Characters that would terminate the attribute or the tag stay encoded.
                $value = str_replace(['"', '<', '>'], ['%22', '%3C', '%3E'], $value);

                return $match[1] . $value . '"';
            },
            $html
        );

        // preg_replace_callback yields null on a PCRE failure; fall back to the untouched markup.
        return $decoded ?? $html;
    }
}