new-scrapper #1

Open
RomanGolienko wants to merge 2 commits from new-scrapper into main
5 changed files with 71 additions and 7 deletions

2
.gitignore vendored
View File

@ -1,3 +1,4 @@
/public/img/icons/play-button.png
/.phpunit.cache /.phpunit.cache
/node_modules /node_modules
/public/build /public/build
@ -21,3 +22,4 @@ yarn-error.log
/.phpstorm.meta.php /.phpstorm.meta.php
/_ide_helper.php /_ide_helper.php
/_ide_helper_models.php /_ide_helper_models.php
/public/img/icons/play-button.png

View File

@ -2,6 +2,9 @@
namespace App\Services; namespace App\Services;
use DiDom\Document;
/*
class PageScrapper class PageScrapper
{ {
private string $url; private string $url;
@ -22,7 +25,7 @@ class PageScrapper
$arr = []; $arr = [];
$rez = preg_match_all($strForPregMatch, $page, $arr); $rez = preg_match_all($strForPregMatch, $page, $arr);
return $content = $arr[1][0]; return $content = $arr[0][0];
} }
public function normalizeURLFile($content) public function normalizeURLFile($content)
@ -60,3 +63,39 @@ class PageScrapper
return $content; return $content;
} }
} }
*/
/**
 * Loads a page through DiDom and exposes its markup.
 *
 * The document is parsed once in the constructor; every method works on that
 * parsed DOM, so link rewrites done by normalizePath() are visible to later
 * printHTML() / getFullHTML() calls.
 */
class PageScrapper
{
    /** Site root used for relative links when the caller does not pass one. */
    private const DEFAULT_BASE_URL = 'https://mkgtu.ru';

    private string $url;
    private string $contentMarker;

    /** Declared explicitly: dynamic properties are deprecated as of PHP 8.2. */
    private Document $document;

    /**
     * @param string $url           Address (or file path) handed to DiDom for loading.
     * @param string $contentMarker CSS selector of the fragment returned by printHTML().
     */
    public function __construct(string $url, string $contentMarker)
    {
        $this->url = $url;
        $this->contentMarker = $contentMarker;
        // The second argument makes DiDom load $url as a file/URL instead of parsing it as markup.
        $this->document = new Document($this->url, true);
    }

    /**
     * Returns the parsed document itself (it is string-castable, so it can be echoed).
     */
    public function getFullHTML(): Document
    {
        return $this->document;
    }

    /**
     * Returns the HTML of the first element matching the content marker.
     *
     * @return string The fragment markup, or an empty string when nothing matches
     *                (previously this case ended in a call on null).
     */
    public function printHTML(): string
    {
        $node = $this->document->first($this->contentMarker);

        return $node === null ? '' : $node->html();
    }

    /**
     * Rewrites relative anchor targets to absolute URLs inside the parsed DOM.
     *
     * The rewrite is done on the href attributes, not by string replacement on
     * the serialized page, so the document stays a Document and unrelated text
     * that happens to contain the same path is left untouched.
     *
     * @param string $baseUrl Site root to prepend; defaults to the original hard-coded host.
     *
     * @return string HTML of the whole document after the rewrite.
     */
    public function normalizePath(string $baseUrl = self::DEFAULT_BASE_URL): string
    {
        foreach ($this->document->find('a[href]') as $link) {
            $href = (string) $link->getAttribute('href');
            $absolute = self::absolutizeHref($href, $baseUrl);

            if ($absolute !== $href) {
                $link->setAttribute('href', $absolute);
            }
        }

        return $this->document->html();
    }

    /**
     * Resolves a single href against the site root.
     *
     * Left unchanged: empty values, in-page anchors (#...), query-only links
     * (?...), protocol-relative links (//host/...) and anything that already
     * carries a scheme (https:, http:, mailto:, tel:, ...).
     * Paths without a leading slash are resolved against the site root, which
     * is an approximation of true document-relative resolution.
     */
    private static function absolutizeHref(string $href, string $baseUrl): string
    {
        $candidate = trim($href);

        if (
            $candidate === ''
            || $candidate[0] === '#'
            || $candidate[0] === '?'
            || str_starts_with($candidate, '//')
            || preg_match('/^[a-z][a-z0-9+.\-]*:/i', $candidate) === 1
        ) {
            return $href;
        }

        return rtrim($baseUrl, '/') . '/' . ltrim($candidate, '/');
    }
}

View File

@ -8,8 +8,9 @@
], ],
"license": "MIT", "license": "MIT",
"require": { "require": {
"php": "^8.2", "php": "^8.1|8.2",
"guzzlehttp/guzzle": "^7.8.1", "guzzlehttp/guzzle": "^7.8.1",
"imangazaliev/didom": "^2.0",
"laravel/framework": "^10.41.0", "laravel/framework": "^10.41.0",
"laravel/sanctum": "^3.3.3", "laravel/sanctum": "^3.3.3",
"laravel/tinker": "^2.9.0", "laravel/tinker": "^2.9.0",
@ -64,6 +65,7 @@
} }
}, },
"config": { "config": {
"platform-check": false,
"optimize-autoloader": true, "optimize-autoloader": true,
"preferred-install": "dist", "preferred-install": "dist",
"sort-packages": true, "sort-packages": true,

View File

@ -1,4 +1,5 @@
@extends('layouts.applicant-layout') @extends('layouts.applicant-layout')
@section('content') @section('content')
<style> <style>
@ -36,12 +37,31 @@
<div class="col-10"> <div class="col-10">
@php @php
use App\Services\PageScrapper; use App\Services\PageScrapper;
$pageScrapper = new PageScrapper("https://mkgtu.ru/postuplenie/podat-dokumenty-onlayn/", '<div class=["\']content_info["\']>'); $pageScrapper = new PageScrapper("https://mkgtu.ru/postuplenie/podat-dokumenty-onlayn/", '.content_info');
$row = $pageScrapper->getHTML(); $content = $pageScrapper->printHTML();
$content = $pageScrapper->normalizeURLFile($row); $content = $pageScrapper->normalizePath();
$content = $pageScrapper->cutHTML($content,'/<footer(.*)<\/footer>/isU');
echo $content; echo $content;
// use DiDom\Document;
//
// $document = new Document('https://mkgtu.ru/postuplenie/podat-dokumenty-onlayn/', true);
//
// $posts = $document->find('.content_info');
//
// echo $document->first('.content_info')->html();
/*
use App\Services\PageScrapper;
$pageScrapper = new PageScrapper("https://mkgtu.ru/postuplenie/podat-dokumenty-onlayn/", '<div class=["\']content_info["\']>');
$row = $pageScrapper->getHTML();
$content = $pageScrapper->normalizeURLFile($row);
$content = $pageScrapper->cutHTML($content,'/<footer(.*)<\/footer>/isU');
echo $content;
*/
@endphp @endphp
</div> </div>

View File

@ -28,13 +28,14 @@
$row3 = $pageScrapper3->cutHTML($row3,'/<footer(.*)<\/footer>/isU'); $row3 = $pageScrapper3->cutHTML($row3,'/<footer(.*)<\/footer>/isU');
$content = $row . $row2 . $row3; $content = $row . $row2 . $row3;
//dd($content); //dd($content);
//$content = $pageScrapper->cutHTML($row,'/<footer(.*)<\/footer>/isU'); //$content = $pageScrapper->cutHTML($row,'/<footer(.*)<\/footer>/isU');
// $content = $pageScrapper->normalizeURLFile($content); // $content = $pageScrapper->normalizeURLFile($content);
echo htmlspecialchars($content); echo $content;
@endphp @endphp