applicant-site/app/Services/PageScrapper.php

60 lines
1.6 KiB
PHP
Raw Normal View History

2024-01-22 17:08:45 +03:00
<?php
namespace App\Services;
/**
 * Loads a page and extracts a single content fragment from it.
 *
 * The fragment is located with a regular expression built from a
 * caller-supplied "content marker"; links inside an extracted fragment can
 * then be made absolute with {@see PageScrapper::normalizeURLFile()}.
 */
class PageScrapper
{
    /** Location passed verbatim to file_get_contents(). */
    private string $url;

    /**
     * Regex fragment (NOT a literal string) matching the opening tag of the
     * content container. It is embedded in a "/"-delimited pattern, so any
     * "/" it contains must already be escaped by the caller.
     */
    private string $contentMarker;

    /**
     * @param string $url           Location of the page to load.
     * @param string $contentMarker Regex fragment matching the opening tag of
     *                              the wanted container.
     */
    public function __construct($url, $contentMarker = '<div class=["\']content_info["\']>')
    {
        $this->url = $url;
        $this->contentMarker = $contentMarker;
    }

    /**
     * Returns the markup that follows the first content marker, up to the
     * first closing div, with a trailing "</div>" appended.
     *
     * The opening marker itself is not part of the result (this mirrors the
     * historical output of this method).
     *
     * @return string
     *
     * @throws \RuntimeException When the page cannot be loaded, the pattern is
     *                           invalid, or the marker is not found.
     */
    public function getHTML()
    {
        $page = file_get_contents($this->url);
        if ($page === false) {
            throw new \RuntimeException("Unable to load page: {$this->url}");
        }

        // i = case-insensitive, s = dot matches newlines, U = ungreedy, so the
        // capture stops at the FIRST "</div>" after the marker.
        $pattern = '/' . $this->contentMarker . '(.*)<\/div>/isU';

        $matches = [];
        $found = preg_match($pattern, $page, $matches);
        if ($found === false) {
            throw new \RuntimeException("Invalid content marker pattern: {$this->contentMarker}");
        }
        if ($found === 0) {
            throw new \RuntimeException("Content marker not found in page: {$this->url}");
        }

        return $matches[1] . '</div>';
    }

    /**
     * Prefixes every double-quoted href value that does not start with "http"
     * with the site origin.
     *
     * Each href attribute is rewritten exactly once and in place, so repeated
     * links are not prefixed multiple times and identical text outside href
     * attributes is left untouched. Empty href values are left as they are.
     *
     * @param string $content HTML fragment whose links should be normalized.
     * @param string $baseUrl Origin to prepend (no trailing slash).
     *
     * @return string The fragment with normalized href values; the input is
     *                returned unchanged if the regex engine reports an error.
     */
    public function normalizeURLFile($content, $baseUrl = 'https://mkgtu.ru')
    {
        $normalized = preg_replace_callback(
            '/(href=")(.*)(")/isU',
            static function (array $m) use ($baseUrl) {
                $href = $m[2];
                if ($href === '' || strpos($href, 'http') === 0) {
                    return $m[0];
                }

                return $m[1] . $baseUrl . $href . $m[3];
            },
            $content
        );

        // preg_replace_callback() yields null on a PCRE error; fall back to the
        // untouched input rather than losing the content.
        return $normalized ?? $content;
    }
}