Roman_applicant-site/app/Services/PageScrapper.php

<?php

namespace App\Services;

use DiDom\Document;

/*
class PageScrapper
{
    private string $url;
    private string $contentMarker;

    public function __construct($url, $contentMarker = '<div class=["\']content_info["\']>')
    {
        $this->url = $url;
        $this->contentMarker = $contentMarker;
    }

    public function getHTML()
    {

        $page = file_get_contents($this->url);
        $strForPregMatch = "/" . "{$this->contentMarker}" . "(.*)<\/div>/is";

        $arr = [];
        $rez = preg_match_all($strForPregMatch, $page, $arr);

        return $content = $arr[0][0];
    }

    public function normalizeURLFile($content)
    {

        $rez = preg_match_all('/<a href="(.*)">/isU', $content, $arr);
        $arr[1] = array_unique($arr[1]);
        foreach ($arr[1] as $el) {
            if (!str_starts_with($el, 'https')) {
                $content = str_replace($el, 'https://mkgtu.ru' . $el, $content);
            }
        }
        $rez = preg_match_all('/src="(.*)">/isU', $content, $arr);
        $arr[1] = array_unique($arr[1]);
        foreach ($arr[1] as $el) {
            if (!str_starts_with($el, 'https') && str_contains($el, 'upload'))  {
                $content = str_replace($el, 'https://mkgtu.ru' . $el, $content);
            }
        }

        return $content;
    }
    public function cutHTML($content, $strForScissors)
    {


        $arr = [];
        //<footer(.*)<\/footer>
        //safdsaf sdfdasf<footer>--------------------------------fsdfdasf <\/footer> asdfdasf asdf
        $rez = preg_match_all($strForScissors, $content, $arr);
        //$arr[1][0] = '>--------------------------------fsdfdasf ';
        $content = str_replace($arr[0], '', $content);
        //safdsaf sdfdasf<footer<\/footer> asdfdasf asdf

        return $content;
    }
}
*/

/**
 * Loads an HTML page with DiDom and exposes helpers to read the parsed
 * document, extract the fragment selected by the content marker, and turn
 * root-relative links into absolute ones.
 */
class PageScrapper
{
    /** Origin prepended to root-relative links found in the page. */
    private const BASE_URL = 'https://mkgtu.ru';

    private string $url;
    private string $contentMarker;

    /** Parsed page; declared explicitly to avoid a dynamic property. */
    private Document $document;

    /**
     * @param string $url           Location handed to DiDom as a file/URL source.
     * @param string $contentMarker Selector used by printHTML() to locate the content node.
     */
    public function __construct(string $url, string $contentMarker)
    {
        $this->url = $url;
        $this->contentMarker = $contentMarker;
        // Second argument "true" tells DiDom to load from a file/URL instead of
        // treating the first argument as an HTML string.
        $this->document = new Document($this->url, true);
    }

    /**
     * Returns the whole parsed document.
     */
    public function getFullHTML(): Document
    {
        return $this->document;
    }

    /**
     * Returns the HTML of the first node matching the content marker.
     *
     * @throws \RuntimeException When no node matches the content marker.
     */
    public function printHTML(): string
    {
        $node = $this->document->first($this->contentMarker);

        if ($node === null) {
            // Fail with a readable message instead of "call to html() on null".
            throw new \RuntimeException(
                "No element matches content marker '{$this->contentMarker}' in {$this->url}"
            );
        }

        return $node->html();
    }

    /**
     * Rewrites every root-relative anchor href ("/path") to an absolute URL
     * on BASE_URL, editing the parsed document in place.
     *
     * Hrefs that are already absolute, protocol-relative ("//host/..."),
     * fragments, "mailto:"/"tel:" style links and path-relative links are
     * left untouched, because blindly prefixing them would produce broken URLs.
     *
     * @return Document The same document instance, with hrefs normalised.
     */
    public function normalizePath(): Document
    {
        foreach ($this->document->find('a[href]') as $link) {
            $href = (string) $link->getAttribute('href');

            if ($this->isRootRelative($href)) {
                // Mutating the element updates the underlying DOM, so the
                // document stays a Document and remains queryable afterwards.
                $link->setAttribute('href', self::BASE_URL . $href);
            }
        }

        return $this->document;
    }

    /**
     * Tells whether $href is a root-relative path such as "/upload/file.pdf".
     */
    private function isRootRelative(string $href): bool
    {
        return str_starts_with($href, '/') && !str_starts_with($href, '//');
    }
}
adding new service PageScrapper.php 2024-01-22 17:08:45 +03:00			`<?php`

			`namespace App\Services;`

newScrapper 2024-01-26 14:45:33 +03:00			`use DiDom\Document;`

			`/*`
adding new service PageScrapper.php 2024-01-22 17:08:45 +03:00			`class PageScrapper`
			`{`
			`private string $url;`
			`private string $contentMarker;`

			`public function __construct($url, $contentMarker = '<div class=["\']content_info["\']>')`
			`{`
			`$this->url = $url;`
			`$this->contentMarker = $contentMarker;`
			`}`

			`public function getHTML()`
			`{`
adding new page 2024-01-23 15:45:52 +03:00
adding new service PageScrapper.php 2024-01-22 17:08:45 +03:00			`$page = file_get_contents($this->url);`
adding new page 2024-01-23 15:45:52 +03:00			`$strForPregMatch = "/" . "{$this->contentMarker}" . "(.*)<\/div>/is";`

adding new service PageScrapper.php 2024-01-22 17:08:45 +03:00			`$arr = [];`
			`$rez = preg_match_all($strForPregMatch, $page, $arr);`
adding new page 2024-01-23 15:45:52 +03:00
actual version, device change 2024-01-26 16:08:54 +03:00			`return $content = $arr[0][0];`
adding new service PageScrapper.php 2024-01-22 17:08:45 +03:00			`}`

			`public function normalizeURLFile($content)`
			`{`
adding new method to PageScrapper, adding new page 2024-01-24 13:30:27 +03:00
lint fix 2024-01-25 08:59:34 +03:00			`$rez = preg_match_all('/<a href="(.*)">/isU', $content, $arr);`
adding new method to PageScrapper, adding new page 2024-01-24 13:30:27 +03:00			`$arr[1] = array_unique($arr[1]);`
adding new service PageScrapper.php 2024-01-22 17:08:45 +03:00			`foreach ($arr[1] as $el) {`
lint fix 2024-01-25 08:59:34 +03:00			`if (!str_starts_with($el, 'https')) {`
			`$content = str_replace($el, 'https://mkgtu.ru' . $el, $content);`
adding new service PageScrapper.php 2024-01-22 17:08:45 +03:00			`}`
			`}`
completing all pages in categories 2024-01-25 12:57:38 +03:00			`$rez = preg_match_all('/src="(.*)">/isU', $content, $arr);`
			`$arr[1] = array_unique($arr[1]);`
			`foreach ($arr[1] as $el) {`
			`if (!str_starts_with($el, 'https') && str_contains($el, 'upload')) {`
			`$content = str_replace($el, 'https://mkgtu.ru' . $el, $content);`
			`}`
			`}`
new pages, renaming routes 2024-01-24 15:23:23 +03:00
adding new service PageScrapper.php 2024-01-22 17:08:45 +03:00			`return $content;`
adding new method to PageScrapper, adding new page 2024-01-24 13:30:27 +03:00			`}`
lint fix 2024-01-25 08:59:34 +03:00			`public function cutHTML($content, $strForScissors)`
adding new method to PageScrapper, adding new page 2024-01-24 13:30:27 +03:00			`{`


			`$arr = [];`
fixing 2024-01-25 14:01:05 +03:00			`//<footer(.*)<\/footer>`
			`//safdsaf sdfdasf<footer>--------------------------------fsdfdasf <\/footer> asdfdasf asdf`
adding new method to PageScrapper, adding new page 2024-01-24 13:30:27 +03:00			`$rez = preg_match_all($strForScissors, $content, $arr);`
fixing 2024-01-25 14:01:05 +03:00			`//$arr[1][0] = '>--------------------------------fsdfdasf ';`
			`$content = str_replace($arr[0], '', $content);`
			`//safdsaf sdfdasf<footer<\/footer> asdfdasf asdf`
adding new method to PageScrapper, adding new page 2024-01-24 13:30:27 +03:00
			`return $content;`
adding new service PageScrapper.php 2024-01-22 17:08:45 +03:00			`}`
			`}`
newScrapper 2024-01-26 14:45:33 +03:00			`*/`

			`class PageScrapper`
			`{`
			`private string $url;`
			`private string $contentMarker;`

			`public function __construct($url, $contentMarker)`
			`{`
			`$this->url = $url;`
			`$this->contentMarker = $contentMarker;`
			`$this->document = new Document($this->url, true);`
			`}`
			`public function getFullHTML()`
			`{`
			`return $this->document;`
			`}`
			`public function printHTML()`
			`{`
			`$rez = $this->document;`
			`$content = $rez->first($this->contentMarker)->html();`
			`return $content;`
			`}`
			`public function normalizePath()`
			`{`
			`$links = (string) $this->document->find('a[href]');`
			`dd($links);`
			`foreach ($links as $link) {`
			`if (!str_starts_with($link, 'https')) {`
			`$this->document = str_replace($link, 'https://mkgtu.ru' . $link, $this->document);`
			`}`
			`}`

			`return $this->document;`
			`}`
			`}`