newScrapper

This commit is contained in:
ROMANGOLIENKO 2024-01-26 14:45:33 +03:00
parent 02e6ad458c
commit d24f32623b
3 changed files with 61 additions and 15 deletions

1
.gitignore vendored
View File

@ -1,3 +1,4 @@
/public/img/icons/play-button.png
/.phpunit.cache /.phpunit.cache
/node_modules /node_modules
/public/build /public/build

View File

@ -2,6 +2,9 @@
namespace App\Services; namespace App\Services;
use DiDom\Document;
/*
class PageScrapper class PageScrapper
{ {
private string $url; private string $url;
@ -60,3 +63,39 @@ class PageScrapper
return $content; return $content;
} }
} }
*/
/**
 * Loads a remote HTML page with DiDom and exposes the fragment selected by a
 * CSS selector, optionally rewriting relative link targets to absolute URLs.
 */
class PageScrapper
{
    /** Address of the page that is downloaded in the constructor. */
    private string $url;

    /** CSS selector identifying the element whose markup printHTML() returns. */
    private string $contentMarker;

    /** Parsed page; declared explicitly to avoid a (deprecated) dynamic property. */
    private Document $document;

    /**
     * @param string $url           Page address; it is fetched immediately.
     * @param string $contentMarker CSS selector of the content container.
     */
    public function __construct(string $url, string $contentMarker)
    {
        $this->url = $url;
        $this->contentMarker = $contentMarker;
        // The second argument makes DiDom treat the first one as a file/URL to load
        // rather than as a literal HTML string.
        $this->document = new Document($this->url, true);
    }

    /**
     * Returns the parsed document object for the whole page.
     */
    public function getFullHTML(): Document
    {
        return $this->document;
    }

    /**
     * Returns the markup of the first element matching the content selector.
     *
     * @throws \RuntimeException When no element on the page matches the selector.
     */
    public function printHTML(): string
    {
        $node = $this->document->first($this->contentMarker);

        // first() yields null when nothing matches; fail with a readable message
        // instead of "Call to a member function html() on null".
        if ($node === null) {
            throw new \RuntimeException(
                "No element matching '{$this->contentMarker}' was found at {$this->url}"
            );
        }

        return $node->html();
    }

    /**
     * Rewrites the href of every anchor in the document to an absolute URL.
     *
     * The anchors are modified in place, so a later printHTML() call returns
     * the selected fragment with the rewritten links.
     *
     * @return string Markup of the whole document after the rewrite.
     */
    public function normalizePath(): string
    {
        foreach ($this->document->find('a[href]') as $anchor) {
            $href = (string) $anchor->getAttribute('href');
            $absolute = $this->toAbsoluteUrl(trim($href));

            if ($absolute !== $href) {
                $anchor->setAttribute('href', $absolute);
            }
        }

        return $this->document->html();
    }

    /**
     * Resolves a link target against the address of the scraped page.
     *
     * Only the common cases are handled: protocol-relative, root-relative and
     * plain document-relative paths. Dot segments ("../") are not collapsed.
     */
    private function toAbsoluteUrl(string $href): string
    {
        // Empty values, in-page anchors and anything that already carries a
        // scheme (https:, http:, mailto:, tel:, ...) are left untouched.
        if ($href === '' || $href[0] === '#' || preg_match('/^[a-z][a-z0-9+.\-]*:/i', $href) === 1) {
            return $href;
        }

        $parts = parse_url($this->url);
        if (!is_array($parts) || !isset($parts['host'])) {
            // Without a host there is nothing to resolve against.
            return $href;
        }

        $scheme = $parts['scheme'] ?? 'https';

        // Protocol-relative link: only the scheme is missing.
        if (str_starts_with($href, '//')) {
            return $scheme . ':' . $href;
        }

        $origin = $scheme . '://' . $parts['host'];
        if (isset($parts['port'])) {
            $origin .= ':' . $parts['port'];
        }

        // Root-relative link.
        if ($href[0] === '/') {
            return $origin . $href;
        }

        // Document-relative link: resolve against the directory of the page.
        $path = $parts['path'] ?? '/';
        $directory = substr($path, 0, (int) strrpos($path, '/') + 1);

        return $origin . ($directory === '' ? '/' : $directory) . $href;
    }
}

View File

@ -36,17 +36,23 @@
<div class="col-10"> <div class="col-10">
@php @php
use App\Services\PageScrapper;
$pageScrapper = new PageScrapper("https://mkgtu.ru/postuplenie/podat-dokumenty-onlayn/", '.content_info');
$content = $pageScrapper->printHTML();
$content = $pageScrapper->normalizePath();
echo $content;
use DiDom\Document; // use DiDom\Document;
//
$document = new Document('https://mkgtu.ru/postuplenie/podat-dokumenty-onlayn/', true); // $document = new Document('https://mkgtu.ru/postuplenie/podat-dokumenty-onlayn/', true);
//
$posts = $document->find('.content_info'); // $posts = $document->find('.content_info');
//
echo $document->first('.content_info')->html(); // echo $document->first('.content_info')->html();
/* /*
use App\Services\PageScrapper; use App\Services\PageScrapper;
$pageScrapper = new PageScrapper("https://mkgtu.ru/postuplenie/podat-dokumenty-onlayn/", '<div class=["\']content_info["\']>'); $pageScrapper = new PageScrapper("https://mkgtu.ru/postuplenie/podat-dokumenty-onlayn/", '<div class=["\']content_info["\']>');