forked from aslan/applicant-site
new-scrapper #1
|
@ -1,3 +1,4 @@
|
||||||
|
/public/img/icons/play-button.png
|
||||||
/.phpunit.cache
|
/.phpunit.cache
|
||||||
/node_modules
|
/node_modules
|
||||||
/public/build
|
/public/build
|
||||||
|
@ -21,3 +22,4 @@ yarn-error.log
|
||||||
/.phpstorm.meta.php
|
/.phpstorm.meta.php
|
||||||
/_ide_helper.php
|
/_ide_helper.php
|
||||||
/_ide_helper_models.php
|
/_ide_helper_models.php
|
||||||
|
/public/img/icons/play-button.png
|
||||||
|
|
|
@ -2,6 +2,9 @@
|
||||||
|
|
||||||
namespace App\Services;
|
namespace App\Services;
|
||||||
|
|
||||||
|
use DiDom\Document;
|
||||||
|
|
||||||
|
/*
|
||||||
class PageScrapper
|
class PageScrapper
|
||||||
{
|
{
|
||||||
private string $url;
|
private string $url;
|
||||||
|
@ -22,7 +25,7 @@ class PageScrapper
|
||||||
$arr = [];
|
$arr = [];
|
||||||
$rez = preg_match_all($strForPregMatch, $page, $arr);
|
$rez = preg_match_all($strForPregMatch, $page, $arr);
|
||||||
|
|
||||||
return $content = $arr[1][0];
|
return $content = $arr[0][0];
|
||||||
}
|
}
|
||||||
|
|
||||||
public function normalizeURLFile($content)
|
public function normalizeURLFile($content)
|
||||||
|
@ -60,3 +63,39 @@ class PageScrapper
|
||||||
return $content;
|
return $content;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
 * Loads a remote HTML page with DiDom and exposes helpers for extracting a
 * fragment of it and for rewriting its site-relative links to absolute URLs.
 */
class PageScrapper
{
    /** Origin used when the scraped URL cannot be split into scheme and host. */
    private const FALLBACK_ORIGIN = 'https://mkgtu.ru';

    /** Address of the page that was loaded. */
    private string $url;

    /** Selector passed to DiDom to locate the content element (e.g. ".content_info"). */
    private string $contentMarker;

    /**
     * Parsed page. Declared explicitly: assigning an undeclared property
     * creates a dynamic property, which is deprecated as of PHP 8.2.
     */
    private Document $document;

    /**
     * @param string $url           Address of the page to load.
     * @param string $contentMarker Selector of the element returned by printHTML().
     */
    public function __construct(string $url, string $contentMarker)
    {
        $this->url = $url;
        $this->contentMarker = $contentMarker;

        // The second argument makes DiDom load the first one as a file/URL
        // instead of parsing it as a markup string.
        $this->document = new Document($this->url, true);
    }

    /**
     * Returns the whole parsed page.
     *
     * @return Document The document held by this scrapper (not a copy).
     */
    public function getFullHTML(): Document
    {
        return $this->document;
    }

    /**
     * Returns the HTML of the first element matching the content marker.
     *
     * Despite the name, nothing is printed; the markup is returned.
     *
     * @return string Markup of the matched element, or an empty string when
     *                no element matches the marker.
     */
    public function printHTML(): string
    {
        // first() yields null when nothing matches; avoid calling html() on null.
        return $this->document->first($this->contentMarker)?->html() ?? '';
    }

    /**
     * Rewrites root-relative link targets (href="/path") to absolute URLs on
     * the scraped site's origin.
     *
     * The links are changed inside the held document, so a later printHTML()
     * call returns markup that already contains the rewritten links. Absolute
     * URLs, protocol-relative URLs ("//host/..."), anchors and other schemes
     * such as "mailto:" are left untouched.
     *
     * @return Document The document held by this scrapper, after rewriting.
     */
    public function normalizePath(): Document
    {
        $origin = $this->resolveOrigin();

        foreach ($this->document->find('a[href]') as $link) {
            $href = (string) $link->getAttribute('href');

            // Only "/path" style targets need the origin; "//host" is already
            // a complete protocol-relative reference.
            if (str_starts_with($href, '/') && !str_starts_with($href, '//')) {
                $link->setAttribute('href', $origin . $href);
            }
        }

        return $this->document;
    }

    /**
     * Builds "scheme://host[:port]" from the scraped URL.
     *
     * @return string Origin of the scraped URL, or FALLBACK_ORIGIN when the
     *                URL has no scheme or host.
     */
    private function resolveOrigin(): string
    {
        $parts = parse_url($this->url);

        if (!is_array($parts) || !isset($parts['scheme'], $parts['host'])) {
            return self::FALLBACK_ORIGIN;
        }

        $origin = $parts['scheme'] . '://' . $parts['host'];

        if (isset($parts['port'])) {
            $origin .= ':' . $parts['port'];
        }

        return $origin;
    }
}
|
||||||
|
|
|
@ -8,8 +8,9 @@
|
||||||
],
|
],
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"require": {
|
"require": {
|
||||||
"php": "^8.2",
|
"php": "^8.1|8.2",
|
||||||
"guzzlehttp/guzzle": "^7.8.1",
|
"guzzlehttp/guzzle": "^7.8.1",
|
||||||
|
"imangazaliev/didom": "^2.0",
|
||||||
"laravel/framework": "^10.41.0",
|
"laravel/framework": "^10.41.0",
|
||||||
"laravel/sanctum": "^3.3.3",
|
"laravel/sanctum": "^3.3.3",
|
||||||
"laravel/tinker": "^2.9.0",
|
"laravel/tinker": "^2.9.0",
|
||||||
|
@ -64,6 +65,7 @@
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"config": {
|
"config": {
|
||||||
|
"platform-check": false,
|
||||||
"optimize-autoloader": true,
|
"optimize-autoloader": true,
|
||||||
"preferred-install": "dist",
|
"preferred-install": "dist",
|
||||||
"sort-packages": true,
|
"sort-packages": true,
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
@extends('layouts.applicant-layout')
|
@extends('layouts.applicant-layout')
|
||||||
|
|
||||||
@section('content')
|
@section('content')
|
||||||
|
|
||||||
<style>
|
<style>
|
||||||
|
@ -36,12 +37,31 @@
|
||||||
<div class="col-10">
|
<div class="col-10">
|
||||||
@php
|
@php
|
||||||
use App\Services\PageScrapper;
|
use App\Services\PageScrapper;
|
||||||
$pageScrapper = new PageScrapper("https://mkgtu.ru/postuplenie/podat-dokumenty-onlayn/", '<div class=["\']content_info["\']>');
|
$pageScrapper = new PageScrapper("https://mkgtu.ru/postuplenie/podat-dokumenty-onlayn/", '.content_info');
|
||||||
$row = $pageScrapper->getHTML();
|
$content = $pageScrapper->printHTML();
|
||||||
$content = $pageScrapper->normalizeURLFile($row);
|
$content = $pageScrapper->normalizePath();
|
||||||
$content = $pageScrapper->cutHTML($content,'/<footer(.*)<\/footer>/isU');
|
|
||||||
echo $content;
|
echo $content;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// use DiDom\Document;
|
||||||
|
//
|
||||||
|
// $document = new Document('https://mkgtu.ru/postuplenie/podat-dokumenty-onlayn/', true);
|
||||||
|
//
|
||||||
|
// $posts = $document->find('.content_info');
|
||||||
|
//
|
||||||
|
// echo $document->first('.content_info')->html();
|
||||||
|
/*
|
||||||
|
use App\Services\PageScrapper;
|
||||||
|
$pageScrapper = new PageScrapper("https://mkgtu.ru/postuplenie/podat-dokumenty-onlayn/", '<div class=["\']content_info["\']>');
|
||||||
|
$row = $pageScrapper->getHTML();
|
||||||
|
$content = $pageScrapper->normalizeURLFile($row);
|
||||||
|
$content = $pageScrapper->cutHTML($content,'/<footer(.*)<\/footer>/isU');
|
||||||
|
echo $content;
|
||||||
|
*/
|
||||||
|
|
||||||
@endphp
|
@endphp
|
||||||
|
|
||||||
</div>
|
</div>
|
||||||
|
|
|
@ -28,13 +28,14 @@
|
||||||
$row3 = $pageScrapper3->cutHTML($row3,'/<footer(.*)<\/footer>/isU');
|
$row3 = $pageScrapper3->cutHTML($row3,'/<footer(.*)<\/footer>/isU');
|
||||||
|
|
||||||
$content = $row . $row2 . $row3;
|
$content = $row . $row2 . $row3;
|
||||||
|
|
||||||
//dd($content);
|
//dd($content);
|
||||||
|
|
||||||
|
|
||||||
//$content = $pageScrapper->cutHTML($row,'/<footer(.*)<\/footer>/isU');
|
//$content = $pageScrapper->cutHTML($row,'/<footer(.*)<\/footer>/isU');
|
||||||
// $content = $pageScrapper->normalizeURLFile($content);
|
// $content = $pageScrapper->normalizeURLFile($content);
|
||||||
|
|
||||||
echo htmlspecialchars($content);
|
echo $content;
|
||||||
|
|
||||||
@endphp
|
@endphp
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue