Archive fetch fixes & cleanup

Alexis Marie Wright 2022-03-13 22:06:17 -04:00
parent 8d31c7ea29
commit 90f17c68ca
3 changed files with 42 additions and 14 deletions


@@ -7,17 +7,20 @@ class Config {
         'description' => '...',
         'feedUrl' => 'http://www.inhuman-comic.com/feed.xml'
     ];
     // where to write the feed file
     const feedPath = '/tmp/feed.xml';
-    // base URL of the site
+    // base URL of the site (used for constructing URLs)
     const baseUrl = "http://www.inhuman-comic.com";
-    // archive page URL
+    // archive page URL (what we actually read for comic page links)
     const archiveUrl = 'http://www.inhuman-comic.com/archives.php';
+    // how the script identifies itself to the server while spidering
+    // (this will also be used as the "Generator" value in the feed XML)
+    const userAgent = 'Lexie\'s RSS Monster (for Cial) (lexie@alexis-marie-wright.me)';
     // which log levels to emit (true) or suppress (false)
     // nothing below 'info' should be important in normal operation
     const logLevels = [
@@ -26,6 +29,4 @@ class Config {
         'debug' => true,
         'silly' => false
     ];
-    // if set, fetch only this many pages while spidering (false to fetch all)
-    const TEST_stopAfter = false;
 }
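
The logLevels map above is consumed by lib/log.php, which this commit does not touch. For orientation only, a gate along these lines would honor the map; this is a hypothetical sketch (the real Log class is not shown in this diff), defaulting unknown levels to visible:

// Hypothetical sketch; the actual lib/log.php is not part of this diff.
class Log {
    public static function debug($msg) { self::emit('debug', $msg); }
    public static function info($msg)  { self::emit('info', $msg); }
    public static function warn($msg)  { self::emit('warn', $msg); }

    private static function emit(string $level, $msg) {
        // emit only levels the config maps to true; unknown levels default to visible
        if (Config::logLevels[$level] ?? true) {
            fwrite(STDERR, "[$level] " . $msg . "\n");
        }
    }
}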


@@ -20,7 +20,10 @@ function fetchUrl(string $url) {
     if ($res->getStatusCode() !== 200) {
         Log::debug($res->getBody());
-        throw "Request for $url returned {$res->getStatusCode()}";
+        $exc = new Exception("Request for $url returned {$res->getStatusCode()}");
+        $exc->response = $res;
+        throw $exc;
     };
     $dom = new DOM;
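
One caveat on the new error path: stashing the response in an ad-hoc $exc->response property relies on dynamic properties, which PHP 8.2 deprecates. A minimal sketch of an alternative, assuming the response implements PSR-7 (the FetchException name is hypothetical, not something this commit introduces):

use Psr\Http\Message\ResponseInterface;

// Hypothetical exception type carrying the failed response explicitly.
class FetchException extends Exception {
    public ResponseInterface $response;

    public function __construct(string $message, ResponseInterface $response) {
        parent::__construct($message);
        $this->response = $response;
    }
}

// fetchUrl() would then throw:
//   throw new FetchException("Request for $url returned {$res->getStatusCode()}", $res);
// and callers could catch FetchException instead of the blanket Exception.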


@@ -21,27 +21,49 @@ include_once('lib/log.php');
 include_once('lib/fetch-url.php');
-$archivePage = fetchUrl(Config::archiveUrl);
-class ComicPageLink {
+class ComicPage {
     public string $href = '';
     public string $pageNo = '';
     public string $title = '';
+    public string $imageUrl = '';
     function __construct() {}
     function __toString() {
-        return "[{$this->href}] {$this->pageNo} // {$this->title}\n";
+        return "[{$this->href}] [{$this->imageUrl}] {$this->pageNo} // {$this->title}";
     }
 };
-$comicPageLinks = array();
+$comicPages = array();
+$archivePage = fetchUrl(Config::archiveUrl);
 foreach ($archivePage->find('div.textcontent ul') as $arc) {
     $arcHtml = $arc->innerHTML;
     $comicLinks = $arc->find('a');
     foreach ($comicLinks as $el) {
-        $link = new ComicPageLink;
+        $link = new ComicPage;
+        $link->href = Config::baseUrl . "/" . $el->href;
+        $link->pageNo = $el->innerText;
+        // we'll get the image URL from this later, but fetch it early
+        // in order to check that the page exists; if not, we'll
+        // exclude it from the feed
+        try {
+            $comicPage = fetchUrl($link->href);
+        } catch (Exception $exc) {
+            if ($exc->response->getStatusCode() === 404) {
+                Log::warn("Got HTTP 404 from {$link->href}; skipping");
+                continue;
+            };
+            // rethrow unrecognized error
+            throw $exc;
+        };
         // "why not a regex?" well buddy i don't know if you've ever
         // seen regexes but string walking is kinder to the brainmeats
         $start = strpos($arcHtml, $el->href);
         do {
             $start += 1;
@@ -60,11 +82,13 @@ foreach ($archivePage->find('div.textcontent ul') as $arc) {
         };
         $link->title = trim(substr($arcHtml, $start, $end - $start));
-        $link->href = Config::baseUrl . "/" . $el->href;
-        $link->pageNo = $el->innerText;
-        print $link;
+        $link->imageUrl = Config::baseUrl . '/' . $comicPage->find('div.page img')[0]->src;
+        array_push($comicPages, $link);
+        Log::debug($link);
+        usleep(250000); // don't be too rude
     };
 };
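
For contrast with the "why not a regex?" comment above, the title extraction could be done with a pattern like the sketch below. The archive markup is not shown in this diff, so the assumed shape (an <a> tag for the page number followed by the title text) is a guess, not a drop-in replacement:

// Hypothetical regex equivalent of the string walk; assumes the title
// text directly follows the closing </a> of the archive link.
$pattern = '#<a href="' . preg_quote($el->href, '#') . '"[^>]*>.*?</a>([^<]+)#s';
if (preg_match($pattern, $arcHtml, $m)) {
    $link->title = trim($m[1]);
}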