Roman_applicant-site/app/Services/PageScrapper.php

44 lines
1019 B
PHP

<?php
namespace App\Services;
/**
 * Loads a page and extracts the markup of its content container.
 *
 * The page is read with file_get_contents(); the content is located with a
 * regular expression built from a configurable opening-tag marker. A second
 * helper rewrites site-relative anchor links into absolute URLs.
 */
class PageScrapper
{
    /** Origin prepended to site-relative links when the caller supplies none. */
    private const DEFAULT_BASE_URL = 'https://mkgtu.ru';

    private string $url;

    private string $contentMarker;

    /**
     * @param string $url           Location passed to file_get_contents().
     * @param string $contentMarker Regular-expression fragment matching the opening tag
     *                              of the content container. It is embedded into a
     *                              pattern as-is, so regex syntax is allowed.
     */
    public function __construct($url, $contentMarker = '<div class=["\']content_info["\']>')
    {
        $this->url = $url;
        $this->contentMarker = $contentMarker;
    }

    /**
     * Returns everything between the content marker and the last `</div>` of the page.
     *
     * The capture is greedy and runs in dot-all mode, so it extends to the final
     * closing `</div>` found after the marker, not to the first one.
     *
     * @return string Captured markup (may be an empty string).
     *
     * @throws \RuntimeException When the page cannot be read, the marker is not a valid
     *                           pattern fragment, or the marker is not found in the page.
     */
    public function getHTML()
    {
        $page = file_get_contents($this->url);
        if ($page === false) {
            throw new \RuntimeException("Unable to load page: {$this->url}");
        }

        // The marker is placed between "/" delimiters, so a bare slash inside it would
        // end the pattern early. Escape bare slashes; already escaped pairs are skipped.
        $marker = preg_replace('~\\\\.(*SKIP)(*FAIL)|/~s', '\\\\/', $this->contentMarker)
            ?? $this->contentMarker;

        $pattern = '/' . $marker . '(.*)<\/div>/is';

        $found = [];
        $matched = preg_match($pattern, $page, $found);

        if ($matched === false) {
            throw new \RuntimeException("Invalid content marker: {$this->contentMarker}");
        }

        if ($matched === 0) {
            throw new \RuntimeException("Content marker not found on page: {$this->url}");
        }

        return $found[1];
    }

    /**
     * Turns site-relative `<a href="...">` targets into absolute URLs.
     *
     * Each href is rewritten in place exactly once, so repeated links and links that
     * are substrings of other text are handled independently. Targets that already
     * carry a scheme (https:, http:, mailto:, ...), protocol-relative targets (//host),
     * fragment-only targets (#id) and empty targets are left untouched.
     *
     * @param string|null $content HTML fragment to process; non-string or empty input
     *                             is returned unchanged.
     * @param string      $baseUrl Origin to prepend; a trailing slash is ignored.
     *
     * @return string|null The fragment with absolute links.
     */
    public function normalizeURLFile($content, $baseUrl = self::DEFAULT_BASE_URL)
    {
        if (!is_string($content) || $content === '') {
            return $content;
        }

        $base = rtrim((string) $baseUrl, '/');

        $rewritten = preg_replace_callback(
            // 1: tag text up to the opening quote, 2: the quote character, 3: the target.
            '/(<a\b[^>]*?\shref\s*=\s*)(["\'])(.*?)\2/is',
            static function (array $parts) use ($base): string {
                [, $head, $quote, $href] = $parts;

                $target = trim($href);
                if (self::isSiteRelative($target)) {
                    $href = $base . '/' . ltrim($target, '/');
                }

                return $head . $quote . $href . $quote;
            },
            $content
        );

        // preg_replace_callback() yields null on a PCRE failure; keep the input then.
        return $rewritten ?? $content;
    }

    /**
     * Tells whether a link target lacks a scheme/host and therefore needs the base URL.
     */
    private static function isSiteRelative(string $target): bool
    {
        if ($target === '' || $target[0] === '#' || str_starts_with($target, '//')) {
            return false;
        }

        // A leading "scheme:" (https:, http:, mailto:, tel:, ...) marks an absolute target.
        return preg_match('/^[a-z][a-z0-9+.\-]*:/i', $target) !== 1;
    }
}