Roman_applicant-site/app/Services/PageScrapper.php

63 lines
1.8 KiB
PHP
Raw Normal View History

2024-01-22 17:08:45 +03:00
<?php
namespace App\Services;
/**
 * Downloads an HTML page, extracts its content fragment and post-processes it.
 *
 * Typical flow: getHTML() to obtain the fragment, normalizeURLFile() to turn
 * site-relative links into absolute ones, cutHTML() to strip unwanted parts.
 */
class PageScrapper
{
    /** Host that relative links are resolved against when none is supplied. */
    public const DEFAULT_BASE_URL = 'https://mkgtu.ru';

    private string $url;
    private string $contentMarker;

    /**
     * @param string $url           Address (or local path) of the page to read.
     * @param string $contentMarker Regex fragment (NOT quoted, "/" is the delimiter)
     *                              matching the opening tag of the content block.
     */
    public function __construct($url, $contentMarker = '<div class=["\']content_info["\']>')
    {
        $this->url = $url;
        $this->contentMarker = $contentMarker;
    }

    /**
     * Fetches the page and returns the content fragment.
     *
     * The fragment starts at the content marker and runs, greedily, up to the
     * LAST "</div>" of the page, marker and closing tag included.
     *
     * @return string|null The fragment, or null when the page could not be read,
     *                     the marker was not found, or the pattern is invalid.
     */
    public function getHTML(): ?string
    {
        $page = file_get_contents($this->url);
        if ($page === false) {
            return null;
        }

        $pattern = '/' . $this->contentMarker . '(.*)<\/div>/is';

        // Only the first match is ever used, so preg_match is sufficient.
        if (preg_match($pattern, $page, $matches) !== 1) {
            return null;
        }

        return $matches[0];
    }

    /**
     * Rewrites site-relative links inside an HTML fragment to absolute URLs.
     *
     * - href="..." of <a> tags: every relative URL is prefixed with the base URL.
     * - src="...": relative URLs are prefixed only when they contain "upload".
     *
     * Each attribute is rewritten in place, so a URL that is a substring of
     * another URL, or that occurs in the text, is never touched by accident.
     * URLs that already carry a scheme (http:, https:, mailto:, ...), that are
     * protocol-relative ("//host/...") or pure fragments ("#id") are left alone.
     * Only double-quoted attribute values are processed.
     *
     * @param string|null $content HTML fragment; null is treated as an empty string.
     * @param string      $baseUrl Scheme and host that relative URLs are resolved
     *                             against (always relative to the site root).
     *
     * @return string The fragment with rewritten links.
     */
    public function normalizeURLFile($content, $baseUrl = self::DEFAULT_BASE_URL): string
    {
        $content = (string) $content;
        $baseUrl = rtrim((string) $baseUrl, '/');

        // "\s" before href keeps attributes such as data-href out of the match.
        $content = preg_replace_callback(
            '/(<a\s(?:[^>]*\s)?href=")([^"]*)"/i',
            static fn (array $m): string => $m[1] . self::toAbsoluteUrl($m[2], $baseUrl) . '"',
            $content
        ) ?? $content;

        // No boundary before "src" on purpose: data-src="..." is rewritten as well.
        $content = preg_replace_callback(
            '/(src=")([^"]*)"/i',
            static fn (array $m): string => str_contains($m[2], 'upload')
                ? $m[1] . self::toAbsoluteUrl($m[2], $baseUrl) . '"'
                : $m[0],
            $content
        ) ?? $content;

        return $content;
    }

    /**
     * Removes every part of the fragment that matches the given pattern.
     *
     * Example: the pattern '/<footer(.*)<\/footer>/is' turns
     * 'text <footer>...</footer> more' into 'text  more'.
     *
     * @param string|null $content        HTML fragment; null is treated as an empty string.
     * @param string      $strForScissors Complete PCRE pattern, delimiters and flags included.
     *
     * @return string The fragment without the matched parts; the input is
     *                returned unchanged when the pattern fails to compile.
     */
    public function cutHTML($content, $strForScissors): string
    {
        $content = (string) $content;

        return preg_replace($strForScissors, '', $content) ?? $content;
    }

    /**
     * Resolves a single URL against the site root unless it is already absolute.
     */
    private static function toAbsoluteUrl(string $url, string $baseUrl): string
    {
        $isAbsoluteOrSpecial = $url === ''
            || $url[0] === '#'
            || str_starts_with($url, '//')
            || preg_match('/^[a-z][a-z0-9+.\-]*:/i', $url) === 1;

        if ($isAbsoluteOrSpecial) {
            return $url;
        }

        // Guarantee exactly one slash between host and path.
        return $baseUrl . '/' . ltrim($url, '/');
    }
}