rssmonster/rssmonster.php

100 lines
2.4 KiB
PHP

<?php
declare(strict_types = 1);
// Some dependencies haven't been updated for PHP 8 yet, so deprecation
// notices are masked below. This is probably unnecessary when running on PHP 7.3.
error_reporting(E_ALL & ~E_DEPRECATED);
require('vendor/autoload.php');
include('config/default.php');
use RSSMonster\Config as Config;
use GuzzleHttp\Client as HTTPClient;
use PHPHtmlParser\Dom as DOM;
/**
 * Minimal level-gated logger that prints timestamped lines.
 *
 * Config::logLevels is used as a map indexed by level name; a level is
 * printed only when its entry is truthy.
 */
class Log
{
    /** Level name => enabled flag, taken from Config::logLevels. */
    private static $enabled = Config::logLevels;

    /**
     * Print one log line if the given level is switched on.
     *
     * The line has the form "<ISO 8601 timestamp> [<level>] <message>".
     *
     * @param string $level   Level name looked up in the enabled map.
     * @param mixed  $message Any value that can be interpolated into a string
     *                        (left untyped because callers also pass objects
     *                        that convert to string).
     */
    private static function emit(string $level, $message): void
    {
        // empty() treats a level that is missing from the configuration as
        // disabled instead of raising an "undefined array key" warning.
        if (empty(self::$enabled[$level])) {
            return;
        }
        $now = date("c");
        print("$now [$level] $message\n");
    }

    /**
     * Log at the 'silly' level.
     *
     * @param mixed $message Value to print.
     */
    public static function silly($message): void
    {
        self::emit('silly', $message);
    }

    /**
     * Log at the 'debug' level.
     *
     * @param mixed $message Value to print.
     */
    public static function debug($message): void
    {
        self::emit('debug', $message);
    }

    /**
     * Log at the 'info' level.
     *
     * @param mixed $message Value to print.
     */
    public static function info($message): void
    {
        self::emit('info', $message);
    }
}
/**
 * Fetch a URL with an HTTP GET request and return the response body.
 *
 * The request carries the user agent from Config::userAgent. The client's
 * 'http_errors' option is turned off so the status code is checked here.
 *
 * @param string $url Address to fetch.
 * @return string The response body as a string.
 * @throws \RuntimeException When the response status is anything other than 200.
 */
function fetchUrl(string $url): string
{
    $client = new HTTPClient();
    $res = $client->request('GET', $url, [
        'http_errors' => false,
        'headers' => [
            'User-Agent' => Config::userAgent,
        ],
    ]);

    $status = $res->getStatusCode();
    if ($status !== 200) {
        // Dump the body at debug level to help diagnose the failure.
        Log::debug($res->getBody());
        // Only Throwable objects can be thrown; throwing a bare string is
        // itself a fatal error that hides the intended message.
        throw new \RuntimeException("Request for $url returned $status");
    }

    return strval($res->getBody());
}
// Crawl starting at Config::baseURL. $urls is a first-in, first-out work queue;
// $fetchedUrls records every URL that has already been retrieved.
$urls = [Config::baseURL];
$fetchedUrls = [];
while (count($urls) > 0) {
    $url = array_shift($urls);
    $res = fetchUrl($url);
    Log::info("$url: fetched " . strlen($res) . " bytes");
    Log::silly($res);
    $fetchedUrls[] = $url;

    $dom = new DOM;
    $dom->loadStr($res);

    // Iterate over the matched nodes themselves. Wrapping the result in
    // array() yields a one-element array holding the whole collection, so at
    // most one link per page was examined.
    // NOTE(review): assumes find() returns an iterable collection of nodes
    // (possibly empty) - confirm against the installed parser version.
    foreach ($dom->find(Config::nextLinkSelector) as $el) {
        // Optional testing cap: stop queueing links once enough pages were fetched.
        if (Config::TEST_stopAfter && count($fetchedUrls) >= Config::TEST_stopAfter) {
            Log::debug("stopping retrieval after " . Config::TEST_stopAfter . " fetches");
            break;
        }

        // NOTE(review): assumes href is relative to Config::baseURL - confirm.
        $nextUrl = Config::baseURL . '/' . $el->href;

        // in_array() instead of a truthiness test on array_search(): a match
        // at index 0 is returned as 0, which would read as "not found".
        if (in_array($nextUrl, $fetchedUrls, true)) {
            Log::debug("$url: already fetched next link $nextUrl");
            continue;
        }
        if (in_array($nextUrl, $urls, true)) {
            Log::debug("$url: duplicate next link $nextUrl");
            continue;
        }

        Log::info("$url: next link $nextUrl");
        $urls[] = $nextUrl;
    }

    usleep(500000); // wait half a second, to be minimally polite
}