forked from aslan/applicant-site
56 lines
1.2 KiB
PHP
56 lines
1.2 KiB
PHP
<?php
|
|
|
|
namespace App\Services;
|
|
|
|
/**
 * Downloads a page and post-processes the HTML fragment extracted from it.
 *
 * The fragment is located with a regular expression built from a configurable
 * "content marker"; helper methods then make relative links absolute and cut
 * unwanted pieces out of the fragment.
 */
class PageScrapper
{
    /** Site root prepended to relative links when the caller supplies no base. */
    private const DEFAULT_BASE_URL = 'https://mkgtu.ru';

    /** Location passed to file_get_contents() by getHTML(). */
    private string $url;

    /**
     * Regex fragment (NOT a literal string) matching the opening tag of the
     * content container. It is embedded in a pattern delimited by "/", so any
     * literal slash inside it must be escaped by the caller.
     */
    private string $contentMarker;

    /**
     * @param string $url           Location of the page to download.
     * @param string $contentMarker Regex fragment matching the opening tag of the content container.
     */
    public function __construct(string $url, string $contentMarker = '<div class=["\']content_info["\']>')
    {
        $this->url = $url;
        $this->contentMarker = $contentMarker;
    }

    /**
     * Downloads the page and returns the markup that follows the content marker.
     *
     * The capture is greedy: it runs from the first marker match up to the LAST
     * "</div>" in the page, so nested <div> elements are kept in the result.
     *
     * @return string Captured markup (may be an empty string).
     *
     * @throws \RuntimeException When the page cannot be read, the pattern fails,
     *                           or the marker is not present in the page.
     */
    public function getHTML(): string
    {
        $page = file_get_contents($this->url);
        if ($page === false) {
            throw new \RuntimeException("Unable to download page: {$this->url}");
        }

        $pattern = '/' . $this->contentMarker . '(.*)<\/div>/is';

        // Only the first match was ever used, so preg_match() is sufficient.
        $matches = [];
        $found = preg_match($pattern, $page, $matches);

        if ($found === false) {
            throw new \RuntimeException('Content pattern failed: ' . preg_last_error_msg());
        }

        if ($found === 0 || !isset($matches[1])) {
            throw new \RuntimeException("Content marker not found in page: {$this->url}");
        }

        return $matches[1];
    }

    /**
     * Turns relative `<a href="...">` targets into absolute URLs.
     *
     * Only the href value itself is rewritten; identical text elsewhere in the
     * document is left alone. Links that already carry a scheme ("http:",
     * "https:", "mailto:", ...), protocol-relative links ("//host/..."),
     * fragment-only links ("#id") and empty hrefs are returned unchanged.
     *
     * @param string $content HTML fragment to process.
     * @param string $baseUrl Site root to prepend; a trailing slash is optional.
     *
     * @return string The fragment with relative anchors made absolute.
     *
     * @throws \RuntimeException When the regex engine reports an error.
     */
    public function normalizeURLFile(string $content, string $baseUrl = self::DEFAULT_BASE_URL): string
    {
        $base = rtrim($baseUrl, '/');

        $normalized = preg_replace_callback(
            '/(<a href=")([^"]*)(")/i',
            static function (array $match) use ($base): string {
                [$whole, $opening, $href, $closing] = $match;

                // Nothing to resolve: empty, already absolute, protocol-relative or in-page anchor.
                if ($href === '' || preg_match('~^(?:[a-z][a-z0-9+.\-]*:|//|#)~i', $href) === 1) {
                    return $whole;
                }

                // Exactly one slash between host and path, whatever the href started with.
                return $opening . $base . '/' . ltrim($href, '/') . $closing;
            },
            $content
        );

        if ($normalized === null) {
            throw new \RuntimeException('Link normalization failed: ' . preg_last_error_msg());
        }

        return $normalized;
    }

    /**
     * Removes from the content every piece of text captured by group 1 of the given pattern.
     *
     * Each captured string is deleted wherever it occurs in the content, not
     * only at the position where it matched. A pattern without a capture group
     * (or without matches) leaves the content unchanged.
     *
     * @param string $content        HTML fragment to trim.
     * @param string $strForScissors Complete PCRE pattern, delimiters and modifiers included.
     *
     * @return string The content with all captured fragments removed.
     *
     * @throws \RuntimeException When the pattern cannot be executed.
     */
    public function cutHTML(string $content, string $strForScissors): string
    {
        $matches = [];

        if (preg_match_all($strForScissors, $content, $matches) === false) {
            throw new \RuntimeException('Cut pattern failed: ' . preg_last_error_msg());
        }

        // Group 1 is absent when the pattern declares no capture group.
        $fragments = $matches[1] ?? [];

        return str_replace($fragments, '', $content);
    }
}
|