Archive fetch fixes & cleanup
This commit is contained in:
parent
8d31c7ea29
commit
90f17c68ca
|
@ -7,17 +7,20 @@ class Config {
|
|||
'description' => '...',
|
||||
'feedUrl' => 'http://www.inhuman-comic.com/feed.xml'
|
||||
];
|
||||
|
||||
// where to write the feed file
|
||||
const feedPath = '/tmp/feed.xml';
|
||||
|
||||
// base URL of the site
|
||||
// base URL of the site (used for constructing URLs)
|
||||
const baseUrl = "http://www.inhuman-comic.com";
|
||||
// archive page URL
|
||||
|
||||
// archive page URL (what we actually read for comic page links)
|
||||
const archiveUrl = 'http://www.inhuman-comic.com/archives.php';
|
||||
|
||||
// how the script identifies itself to the server while spidering
|
||||
// (this will also be used as the "Generator" value in the feed XML)
|
||||
const userAgent = 'Lexie\'s RSS Monster (for Cial) (lexie@alexis-marie-wright.me)';
|
||||
|
||||
// which log levels to emit (true) or suppress (false)
|
||||
// nothing below 'info' should be important in normal operation
|
||||
const logLevels = [
|
||||
|
@ -26,6 +29,4 @@ class Config {
|
|||
'debug' => true,
|
||||
'silly' => false
|
||||
];
|
||||
// if set, fetch only this many pages while spidering (false to fetch all)
|
||||
const TEST_stopAfter = false;
|
||||
}
|
||||
|
|
|
@ -20,7 +20,10 @@ function fetchUrl(string $url) {
|
|||
|
||||
if ($res->getStatusCode() !== 200) {
|
||||
Log::debug($res->getBody());
|
||||
throw "Request for $url returned {$res->getStatusCode()}";
|
||||
$exc = new Exception("Request for $url returned {$res->getStatusCode()}");
|
||||
$exc->response = $res;
|
||||
|
||||
throw $exc;
|
||||
};
|
||||
|
||||
$dom = new DOM;
|
||||
|
|
|
@ -21,27 +21,49 @@ include_once('lib/log.php');
|
|||
|
||||
|
||||
include_once('lib/fetch-url.php');
|
||||
$archivePage = fetchUrl(Config::archiveUrl);
|
||||
|
||||
class ComicPageLink {
|
||||
class ComicPage {
|
||||
public string $href = '';
|
||||
public string $pageNo = '';
|
||||
public string $title = '';
|
||||
public string $imageUrl = '';
|
||||
|
||||
function __construct() {}
|
||||
function __toString() {
|
||||
return "[{$this->href}] {$this->pageNo} // {$this->title}\n";
|
||||
return "[{$this->href}] [{$this->imageUrl}] {$this->pageNo} // {$this->title}";
|
||||
}
|
||||
};
|
||||
|
||||
$comicPageLinks = array();
|
||||
$comicPages = array();
|
||||
|
||||
$archivePage = fetchUrl(Config::archiveUrl);
|
||||
|
||||
foreach ($archivePage->find('div.textcontent ul') as $arc) {
|
||||
$arcHtml = $arc->innerHTML;
|
||||
|
||||
$comicLinks = $arc->find('a');
|
||||
foreach ($comicLinks as $el) {
|
||||
$link = new ComicPageLink;
|
||||
$link = new ComicPage;
|
||||
|
||||
$link->href = Config::baseUrl . "/" . $el->href;
|
||||
$link->pageNo = $el->innerText;
|
||||
|
||||
// we'll get the image URL from this later, but fetch it early
|
||||
// in order to check that the page exists; if not, we'll
|
||||
// exclude it from the feed
|
||||
try {
|
||||
$comicPage = fetchUrl($link->href);
|
||||
} catch (Exception $exc) {
|
||||
if ($exc->response->getStatusCode() === 404) {
|
||||
Log::warn("Got HTTP 404 from {$link->href}; skipping");
|
||||
continue;
|
||||
};
|
||||
// rethrow unrecognized error
|
||||
throw $exc;
|
||||
};
|
||||
|
||||
// "why not a regex?" well buddy i don't know if you've ever
|
||||
// seen regexes but string walking is kinder to the brainmeats
|
||||
$start = strpos($arcHtml, $el->href);
|
||||
do {
|
||||
$start += 1;
|
||||
|
@ -60,11 +82,13 @@ foreach ($archivePage->find('div.textcontent ul') as $arc) {
|
|||
};
|
||||
|
||||
$link->title = trim(substr($arcHtml, $start, $end - $start));
|
||||
|
||||
$link->href = Config::baseUrl . "/" . $el->href;
|
||||
$link->pageNo = $el->innerText;
|
||||
|
||||
print $link;
|
||||
$link->imageUrl = Config::baseUrl . '/' . $comicPage->find('div.page img')[0]->src;
|
||||
|
||||
array_push($comicPages, $link);
|
||||
Log::debug($link);
|
||||
|
||||
usleep(250000); // don't be too rude
|
||||
};
|
||||
};
|
||||
|
||||
|
|
Loading…
Reference in New Issue