Cleanup & result capture

This commit is contained in:
Alexis Marie Wright 2022-03-04 20:10:22 -05:00
parent 8a7de04dde
commit 3d580f4838
2 changed files with 105 additions and 61 deletions

86
lib/fetch-pages.php Normal file
View File

@ -0,0 +1,86 @@
<?php
use GuzzleHttp\Client as HTTPClient;
use PHPHtmlParser\Dom as DOM;
/**
 * Fetch a URL with an HTTP GET and return the response body as a string.
 *
 * The request carries the User-Agent from Config::userAgent. Guzzle's
 * 'http_errors' option is disabled, so a non-200 status does not raise
 * inside the client; it is detected and reported here instead.
 *
 * @param string $url URL to request.
 * @return string The response body.
 * @throws Exception When the response status is anything other than 200.
 */
function fetchUrl(string $url): string {
    $client = new HTTPClient();
    $res = $client->request(
        'GET', $url, [
            'http_errors' => false,
            'headers' => [
                'User-Agent' => Config::userAgent
            ]
        ]
    );

    $status = $res->getStatusCode();
    if ($status !== 200) {
        Log::debug($res->getBody());
        // PHP can only throw Throwable objects; throwing a bare string
        // fails with "Can only throw objects" and loses this message.
        throw new Exception("Request for $url returned $status");
    }

    return strval($res->getBody());
}
/**
 * Crawl pages starting at Config::baseURL, following "next" links.
 *
 * Every fetched page is parsed and recorded as an array with the keys:
 *   - 'url'        => the URL that was fetched
 *   - 'body'       => the parsed DOM of that page
 *   - 'pageNumber' => the digits captured from "comic<N>.php" at the end of
 *                     the URL; null for the title page until it is patched
 *                     up after the crawl
 *
 * Crawling stops when no unvisited next link remains, or once
 * Config::TEST_stopAfter pages have been fetched (when that value is truthy).
 *
 * @return array List of page records, in fetch order.
 * @throws Exception If a non-title URL has no page number, or a fetch fails.
 */
function fetchPages() {
    $urls = [Config::baseURL];
    $fetchedUrls = [];
    $content = [];

    while (count($urls) > 0) {
        $url = array_shift($urls);
        $res = fetchUrl($url);
        Log::info("$url: fetched " . strlen($res) . " bytes");
        Log::silly($res);
        $fetchedUrls[] = $url;

        $dom = new DOM;
        $dom->loadStr($res);

        // extract page number (if any) from this page's URL
        $matches = [null, null];
        // Compare against 1 rather than 0 so that a regex error (false) is
        // also treated as "no page number" instead of slipping through.
        if ($url !== Config::baseURL && 1 !== preg_match('/comic(\d+)\.php$/', $url, $matches)) {
            throw new Exception("failed to find a page number in $url");
        }

        $content[] = [
            'url' => $url,
            'body' => $dom,
            'pageNumber' => $matches[1]
        ];

        // extract next page URL from this page's content
        // Iterate the match collection itself: wrapping it in array() yields a
        // single-element array, so only the first match was ever considered
        // and a page without a next link could not end the crawl cleanly.
        foreach ($dom->find(Config::nextLinkSelector) as $el) {
            if (Config::TEST_stopAfter && count($fetchedUrls) >= Config::TEST_stopAfter) {
                Log::debug("stopping retrieval after " . Config::TEST_stopAfter . " fetches");
                break;
            }

            $nextUrl = Config::baseURL . '/' . $el->href;

            // in_array(), not array_search(): a hit at index 0 (the base URL)
            // is a falsy key and would be mistaken for "not found".
            if (in_array($nextUrl, $fetchedUrls, true)) {
                Log::debug("$url: already fetched next link $nextUrl");
                continue;
            }

            if (in_array($nextUrl, $urls, true)) {
                Log::debug("$url: duplicate next link $nextUrl");
                continue;
            }

            Log::info("$url: next link $nextUrl");
            $urls[] = $nextUrl;
        }

        usleep(500000); // wait half a second, to be minimally polite
    }

    // patch up the page number on the title page: one more than the page
    // fetched right after it. Skipped when the crawl produced only the title
    // page, in which case its pageNumber stays null.
    if (count($content) > 1) {
        $content[0]['pageNumber'] = (int) $content[1]['pageNumber'] + 1;
    }

    return $content;
}

View File

@ -7,71 +7,29 @@ error_reporting(E_ALL & ~E_DEPRECATED);
require_once('vendor/autoload.php');
use GuzzleHttp\Client as HTTPClient;
use PHPHtmlParser\Dom as DOM;
include_once('config/default.php');
include_once('lib/log.php');
include_once('lib/fetch-pages.php');
/**
 * Fetch a URL with an HTTP GET and return the response body as a string.
 *
 * The request carries the User-Agent from Config::userAgent. Guzzle's
 * 'http_errors' option is disabled, so a non-200 status does not raise
 * inside the client; it is detected and reported here instead.
 *
 * @param string $url URL to request.
 * @return string The response body.
 * @throws Exception When the response status is anything other than 200.
 */
function fetchUrl(string $url): string {
    $client = new HTTPClient();
    $res = $client->request(
        'GET', $url, [
            'http_errors' => false,
            'headers' => [
                'User-Agent' => Config::userAgent
            ]
        ]
    );

    $status = $res->getStatusCode();
    if ($status !== 200) {
        Log::debug($res->getBody());
        // PHP can only throw Throwable objects; throwing a bare string
        // fails with "Can only throw objects" and loses this message.
        throw new Exception("Request for $url returned $status");
    }

    return strval($res->getBody());
}
// NOTE(review): this span appears to come from a diff view whose +/- markers
// were stripped: lines of an inline crawl loop and lines of
// parsePageForFeedContent() are interleaved, and the braces do not balance
// as shown. Confirm against the real file before editing the code itself.
$urls = [Config::baseURL];
$fetchedUrls = [];
while (count($urls) > 0) {
$url = array_shift($urls);
// --- this'll be in a lib someday
// Builds one feed item (title, link, guid, description) for a page: the
// description is an <a> to $url wrapping an <img> of the page's image.
function parsePageForFeedContent(string $url, DOM $page) {
// NOTE(review): spelled Config::baseUrl here but Config::baseURL on the other
// lines in view; class constant names are case-sensitive — confirm which exists.
$imageUrl = Config::baseUrl . '/' . $page->find('div.page img')[0]->src;
$res = fetchUrl($url);
Log::info("$url: fetched " . strlen($res) . " bytes");
Log::silly($res);
array_push($fetchedUrls, $url);
$dom = new DOM;
$dom->loadStr($res);
// NOTE(review): array(...) wraps the find() result in a one-element array, so
// this loop body runs once with $el bound to the whole result.
foreach (array($dom->find(Config::nextLinkSelector)) as $el) {
if (Config::TEST_stopAfter && count($fetchedUrls) >= Config::TEST_stopAfter) {
Log::debug("stopping retrieval after " . Config::TEST_stopAfter . " fetches");
break;
};
$nextUrl = Config::baseURL . '/' . $el->href;
// NOTE(review): array_search() returns the matching key, so a match at
// index 0 is falsy and is treated as "not found" by these two checks.
if (array_search($nextUrl, $fetchedUrls)) {
Log::debug("$url: already fetched next link $nextUrl");
continue;
};
if (array_search($nextUrl, $urls)) {
Log::debug("$url: duplicate next link $nextUrl");
continue;
};
Log::info("$url: next link $nextUrl");
array_push($urls, $nextUrl);
};
usleep(500000); // wait half a second, to be minimally polite
// NOTE(review): the keys below are bare words rather than quoted strings, so
// PHP resolves them as constants (an Error on PHP 8 when undefined) —
// presumably 'title', 'link', 'guid' and 'description' were intended.
// The title still contains the literal placeholder "###".
return [
title => 'INHUMAN | Page ###',
link => $url,
guid => $url,
description =>
"<a style=\"text-decoration: none; border: none;\" href=\"" . $url . "\">" .
"<img src=\"" . $imageUrl . "\" />" .
"</a>"
];
};
// --- ok back to the real shit
// Crawl the site, then print one "<url> <pageNumber>" line per fetched page.
$content = fetchPages();
foreach ($content as $page) {
    echo $page['url'] . ' ' . $page['pageNumber'] . "\n";
}