Archive fetch fixes & cleanup

Alexis Marie Wright 2022-03-13 22:06:17 -04:00
parent 8d31c7ea29
commit 90f17c68ca
3 changed files with 42 additions and 14 deletions


@@ -7,17 +7,20 @@ class Config {
         'description' => '...',
         'feedUrl' => 'http://www.inhuman-comic.com/feed.xml'
     ];
     // where to write the feed file
     const feedPath = '/tmp/feed.xml';
-    // base URL of the site
+    // base URL of the site (used for constructing URLs)
     const baseUrl = "http://www.inhuman-comic.com";
-    // archive page URL
+    // archive page URL (what we actually read for comic page links)
     const archiveUrl = 'http://www.inhuman-comic.com/archives.php';
+    // how the script identifies itself to the server while spidering
+    // (this will also be used as the "Generator" value in the feed XML)
+    const userAgent = 'Lexie\'s RSS Monster (for Cial) (lexie@alexis-marie-wright.me)';
     // which log levels to emit (true) or suppress (false)
     // nothing below 'info' should be important in normal operation
     const logLevels = [
@@ -26,6 +29,4 @@ class Config {
         'debug' => true,
         'silly' => false
     ];
-    // if set, fetch only this many pages while spidering (false to fetch all)
-    const TEST_stopAfter = false;
 }
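
The logLevels map above is consumed by lib/log.php, which this commit does not touch. For orientation only, a gate along these lines would honor the map; this is a hypothetical sketch (the real Log class is not shown in this diff), defaulting unknown levels to visible:

// Hypothetical sketch; the actual lib/log.php is not part of this diff.
class Log {
    public static function debug($msg) { self::emit('debug', $msg); }
    public static function info($msg)  { self::emit('info', $msg); }
    public static function warn($msg)  { self::emit('warn', $msg); }

    private static function emit(string $level, $msg) {
        // emit only levels the config maps to true; unknown levels default to visible
        if (Config::logLevels[$level] ?? true) {
            fwrite(STDERR, "[$level] " . $msg . "\n");
        }
    }
}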


@@ -20,7 +20,10 @@ function fetchUrl(string $url) {
     if ($res->getStatusCode() !== 200) {
         Log::debug($res->getBody());
-        throw "Request for $url returned {$res->getStatusCode()}";
+        $exc = new Exception("Request for $url returned {$res->getStatusCode()}");
+        $exc->response = $res;
+        throw $exc;
     };
     $dom = new DOM;
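
One caveat on the new error path: stashing the response in an ad-hoc $exc->response property relies on dynamic properties, which PHP 8.2 deprecates. A minimal sketch of an alternative, assuming the response implements PSR-7 (the FetchException name is hypothetical, not something this commit introduces):

use Psr\Http\Message\ResponseInterface;

// Hypothetical exception type carrying the failed response explicitly.
class FetchException extends Exception {
    public ResponseInterface $response;

    public function __construct(string $message, ResponseInterface $response) {
        parent::__construct($message);
        $this->response = $response;
    }
}

// fetchUrl() would then throw:
//   throw new FetchException("Request for $url returned {$res->getStatusCode()}", $res);
// and callers could catch FetchException instead of the blanket Exception.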


@@ -21,27 +21,49 @@ include_once('lib/log.php');
 include_once('lib/fetch-url.php');
-$archivePage = fetchUrl(Config::archiveUrl);
-class ComicPageLink {
+class ComicPage {
     public string $href = '';
     public string $pageNo = '';
     public string $title = '';
+    public string $imageUrl = '';
     function __construct() {}
     function __toString() {
-        return "[{$this->href}] {$this->pageNo} // {$this->title}\n";
+        return "[{$this->href}] [{$this->imageUrl}] {$this->pageNo} // {$this->title}";
     }
 };
-$comicPageLinks = array();
+$comicPages = array();
+$archivePage = fetchUrl(Config::archiveUrl);
 foreach ($archivePage->find('div.textcontent ul') as $arc) {
     $arcHtml = $arc->innerHTML;
     $comicLinks = $arc->find('a');
     foreach ($comicLinks as $el) {
-        $link = new ComicPageLink;
+        $link = new ComicPage;
+        $link->href = Config::baseUrl . "/" . $el->href;
+        $link->pageNo = $el->innerText;
+        // we'll get the image URL from this later, but fetch it early
+        // in order to check that the page exists; if not, we'll
+        // exclude it from the feed
+        try {
+            $comicPage = fetchUrl($link->href);
+        } catch (Exception $exc) {
+            if ($exc->response->getStatusCode() === 404) {
+                Log::warn("Got HTTP 404 from {$link->href}; skipping");
+                continue;
+            };
+            // rethrow unrecognized error
+            throw $exc;
+        };
         // "why not a regex?" well buddy i don't know if you've ever
         // seen regexes but string walking is kinder to the brainmeats
         $start = strpos($arcHtml, $el->href);
         do {
             $start += 1;
@@ -60,11 +82,13 @@ foreach ($archivePage->find('div.textcontent ul') as $arc) {
         };
         $link->title = trim(substr($arcHtml, $start, $end - $start));
-        $link->href = Config::baseUrl . "/" . $el->href;
-        $link->pageNo = $el->innerText;
-        print $link;
+        $link->imageUrl = Config::baseUrl . '/' . $comicPage->find('div.page img')[0]->src;
+        array_push($comicPages, $link);
+        Log::debug($link);
+        usleep(250000); // don't be too rude
     };
 };
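
For contrast with the "why not a regex?" comment above, the title extraction could be done with a pattern like the sketch below. The archive markup is not shown in this diff, so the assumed shape (an <a> tag for the page number followed by the title text) is a guess, not a drop-in replacement:

// Hypothetical regex equivalent of the string walk; assumes the title
// text directly follows the closing </a> of the archive link.
$pattern = '#<a href="' . preg_quote($el->href, '#') . '"[^>]*>.*?</a>([^<]+)#s';
if (preg_match($pattern, $arcHtml, $m)) {
    $link->title = trim($m[1]);
}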