Write RSS feed (correctly this time lol)

2022-03-14 01:26:41 -04:00 · 2022-03-14 01:26:41 -04:00 · 32eab358a6
parent 90f17c68ca
commit 32eab358a6
8 changed files with 114 additions and 248 deletions
--- a/lib/fetch-pages.php
+++ b/lib/fetch-pages.php
@ -1,139 +0,0 @@
-<?php
-declare(strict_types = 1);
-
-include_once('config/default.php');
-
-use GuzzleHttp\Client as HTTPClient;
-use PHPHtmlParser\Dom as DOM;
-
-function fetchUrl(string $url) {
-    $client = new HTTPClient();
-
-    $res = $client->request(
-        'GET', $url, [
-            'http_errors' => false,
-            'headers' => [
-                'User-Agent' => Config::userAgent
-            ]
-        ]
-    );
-
-    if ($res->getStatusCode() !== 200) {
-        Log::debug($res->getBody());
-        throw "Request for $url returned {$res->getStatusCode()}";
-    };
-
-    return strval($res->getBody());
-};
-
-function fetchPages() {
-    $urls = [Config::baseURL];
-    $fetchedUrls = [];
-    $content = [];
-
-    while (count($urls) > 0) {
-        $url = array_shift($urls);
-    
-        $res = fetchUrl($url);
-        Log::info("$url: fetched " . strlen($res) . " bytes");
-
-        Log::silly($res);
-    
-        array_push($fetchedUrls, $url);
-
-        $dom = new DOM;
-        $dom->loadStr($res);
-
-        $pageNumber = null;
-
-        // extract page number (if any) from this page's URL
-        if ($url === Config::baseURL) {
-            Log::debug("$url is the root; will patch up page number later");
-        } else {
-            $matches = array();
-        
-            if (preg_match('/comic(\d+)\.php$/', $url, $matches)) {
-                $pageNumber = $matches[1];
-            } else {
-                Log::debug("failed to find a page number in $url, trying special cases");
-
-                if (preg_match('/arc(\d+)\-(front|back)/', $url, $matches)) {
-                    $pageNumber = "Arc " . $matches[1] . ": " . ucfirst($matches[2]) . " Cover";
-                }
-                elseif (preg_match('/cover13/', $url)) {
-                    $pageNumber = 'Arc 13: Front Cover';
-                }
-                elseif (preg_match('/back12/', $url)) {
-                    $pageNumber = 'Arc 12: Back Cover';
-                }
-                elseif (preg_match('/arc12splash/', $url)) {
-                    $pageNumber = 'Arc 12: Front Cover';
-                }
-                elseif (preg_match('/arc11/', $url)) {
-                    $pageNumber = 'Arc 11: Front Cover';
-                }
-                elseif (preg_match('/back10/', $url)) {
-                    $pageNumber = 'Arc 10: Back Cover';
-                }
-                elseif (preg_match('/splash10/', $url)) {
-                    $pageNumber = 'Arc 10: Cover';
-                }
-                elseif (preg_match('/arc9/', $url)) {
-                    $pageNumber = 'Arc 9: Cover';
-                }
-                elseif (preg_match('/arc8/', $url)) {
-                    $pageNumber = 'Arc 8: Cover';
-                };
-            };
-
-            if ($pageNumber === null) {
-                throw new Exception("failed to find a page number in $url");
-            };
-        };
-
-        array_push($content, [
-            'url' => $url,
-            'imageUrl' => Config::baseURL . '/' . $dom->find('div.page img')[0]->src,
-            'body' => $dom,
-            'pageNumber' => $pageNumber
-        ]);
-
-        // extract next page URL from this page's content
-        foreach (array($dom->find(Config::nextLinkSelector)) as $el) {
-            if (Config::TEST_stopAfter && count($fetchedUrls) >= Config::TEST_stopAfter) {
-                Log::debug("stopping retrieval after " . Config::TEST_stopAfter . " fetches");
-                break;
-            };
-        
-            $nextUrl = Config::baseURL . '/' . $el->href;
-        
-            if (array_search($nextUrl, $fetchedUrls)) {
-                Log::debug("$url: already fetched next link $nextUrl");
-                continue;
-            };
-
-            if (array_search($nextUrl, $urls)) {
-                Log::debug("$url: duplicate next link $nextUrl");
-                continue;
-            };
-        
-            Log::info("$url: next link $nextUrl");
-            array_push($urls, $nextUrl);
-        };
-
-        usleep(500000); // wait half a second, to be minimally polite
-    };
-
-    // patch up the page number on the title page by adding 1 to the
-    // first numeric page number we find, starting from the newest
-    // non-root page
-    foreach (array_slice($content, 1) as $page) {
-        if (preg_match('/^\d+$/', $page['pageNumber'])) {
-            $content[0]['pageNumber'] = $page['pageNumber'] + 1;
-            Log::info("patched up root page number to " . $content[0]['pageNumber']);
-            break;
-        };
-    };
-
-    return $content;
-}
--- a/lib/fetch-site-content.php
+++ b/lib/fetch-site-content.php
@ -0,0 +1,82 @@
+<?php
+declare(strict_types = 1);
+
+include_once('config/default.php');
+
+include_once('lib/log.php');
+include_once('lib/fetch-url.php');
+
+/**
+ * Plain record for one comic page that will become a feed item.
+ *
+ * All four fields default to the empty string; the constructor sets nothing,
+ * so whoever creates the object assigns the fields afterwards.
+ */
+class ComicPage {
+    /** URL of the comic page itself. */
+    public string $href = '';
+    /** Text of the archive link that points at the page. */
+    public string $pageNo = '';
+    /** Title text that accompanies the archive link. */
+    public string $title = '';
+    /** URL of the comic image shown on the page. */
+    public string $imageUrl = '';
+
+    public function __construct() {}
+
+    /**
+     * One-line rendering used for logging:
+     * "[href] [imageUrl] pageNo // title".
+     */
+    public function __toString() {
+        return sprintf(
+            '[%s] [%s] %s // %s',
+            $this->href,
+            $this->imageUrl,
+            $this->pageNo,
+            $this->title
+        );
+    }
+};
+
+/**
+ * Build the list of comic pages by walking the archive page.
+ *
+ * Fetches Config::archiveUrl, then for every <a> inside each
+ * "div.textcontent ul" list fetches the linked page and records its URL,
+ * link text (page number), title and image URL. Pages that answer HTTP 404
+ * are logged and left out of the result.
+ *
+ * @return ComicPage[] pages in the order the archive lists them
+ * @throws Exception when a title or comic image cannot be located, or when
+ *                   fetchUrl() fails with anything other than a 404
+ */
+function fetchSiteContent() {
+    $comicPages = array();
+
+    $archivePage = fetchUrl(Config::archiveUrl);
+
+    foreach ($archivePage->find('div.textcontent ul') as $arc) {
+        $arcHtml = $arc->innerHTML;
+
+        $comicLinks = $arc->find('a');
+        foreach ($comicLinks as $el) {
+            $link = new ComicPage;
+
+            $link->href = Config::baseUrl . "/" . $el->href;
+            $link->pageNo = $el->innerText;
+
+            // we'll get the image URL from this later, but fetch it early
+            // in order to check that the page exists; if not, we'll
+            // exclude it from the feed
+            try {
+                $comicPage = fetchUrl($link->href);
+            } catch (Exception $exc) {
+                // NOTE(review): assumes fetchUrl() failures expose a public
+                // `response` property -- confirm against lib/fetch-url.php.
+                // `??` keeps an exception without one from being replaced by
+                // an unrelated "member function on null" error.
+                $response = $exc->response ?? null;
+                if ($response !== null && $response->getStatusCode() === 404) {
+                    Log::warn("Got HTTP 404 from {$link->href}; skipping");
+                    continue;
+                };
+                // rethrow unrecognized error
+                throw $exc;
+            };
+
+            // The title is the raw text that follows this link's closing
+            // tag in the list markup: "<a href=...>NN</a> // Title<br".
+            // Find the first '</a>' after the href; fail loudly instead of
+            // scanning forever (or from the wrong offset) when either the
+            // href or the closing tag is not present in the markup.
+            $hrefAt = strpos($arcHtml, $el->href);
+            $anchorEnd = ($hrefAt === false)
+                ? false
+                : strpos($arcHtml, '</a>', $hrefAt + 1);
+            if ($anchorEnd === false) {
+                throw new Exception("Failed to find a title start point at {$el} ({$el->href})");
+            };
+
+            $start = $anchorEnd + 4; // '</a>'
+            $start += 4; // ' // '
+
+            // The title runs up to the next <br, or failing that the next
+            // closing tag. An offset past the end of the string would make
+            // strpos() throw on PHP 8, so treat that as "not found".
+            $end = false;
+            if ($start <= strlen($arcHtml)) {
+                $end = strpos($arcHtml, '<br', $start);
+                if ($end === false) {
+                    $end = strpos($arcHtml, '</', $start);
+                };
+            };
+
+            if ($end === false) {
+                throw new Exception("Failed to find a title endpoint at {$el} ({$el->href})");
+            };
+
+            $link->title = trim(substr($arcHtml, $start, $end - $start));
+
+            // Refuse to emit an item whose image could not be found rather
+            // than publishing a bare base URL as the image source.
+            $images = $comicPage->find('div.page img');
+            $image = $images[0] ?? null;
+            if ($image === null) {
+                throw new Exception("Failed to find a comic image on {$link->href}");
+            };
+            $link->imageUrl = Config::baseUrl . '/' . $image->src;
+
+            array_push($comicPages, $link);
+            Log::debug($link);
+
+            usleep(250000); // don't be too rude
+        };
+    };
+
+    Log::info("Finished fetching " . count($comicPages) . " pages");
+    return $comicPages;
+};
--- a/lib/generate-feed.php
+++ b/lib/generate-feed.php
@ -9,26 +9,34 @@ function generateFeed($content) {
        'autoescape' => false
    ]);

-    $now = date('c');
-
+    $ts = time();
+    
    $items = [];
-    foreach ($content as $result) {
+    
+    foreach ($content as $page) {
+        $ts -= 1; // artificially enforce ordering on undated historical items
+        $now = date('c', $ts);
+        
+        $content = $twig->render('item-content.html', [
+            'url' => $page->href,
+            'imageUrl' => $page->imageUrl
+        ]);
+        
        $items[] = $twig->render('item.xml', [
-            'title' => $result['pageNumber'] . " | " . Config::feed['title'],
-            'url' => $result['url'],
-            'imageUrl' => $result['imageUrl'],
-            'date' => $now
+            'title' => htmlspecialchars($page->pageNo . " // " . $page->title),
+            'date' => $now,
+            'content' => htmlspecialchars($content)
        ]);
    };

    $feed = $twig->render('feed.xml', [
        'title' => Config::feed['title'],
        'description' => Config::feed['description'],
-        'baseUrl' => Config::baseURL,
+        'baseUrl' => Config::baseUrl,
        'feedUrl' => Config::feed['feedUrl'],
        'date' => $now,
        'items' => implode("\n", $items)
    ]);

    return $feed;
-};
+};
--- a/lib/parse-page.php
+++ b/lib/parse-page.php
@ -1,2 +0,0 @@
-<?php
-declare(strict_types = 1);
--- a/rssmonster.php
+++ b/rssmonster.php
@ -10,85 +10,10 @@ require_once('vendor/autoload.php');
 include_once('config/default.php');

 include_once('lib/log.php');
-// include_once('lib/fetch-pages.php');
-// include_once('lib/generate-feed.php');
+include_once('lib/fetch-site-content.php');

-// $content = fetchPages();
-// $feedXml = generateFeed($content);
-
-// Log::info("writing " . strlen($feedXml) . " bytes to " . Config::feedPath);
-// file_put_contents(Config::feedPath, $feedXml, LOCK_EX);
-
-
-include_once('lib/fetch-url.php');
-
-class ComicPage {
-    public string $href = '';
-    public string $pageNo = '';
-    public string $title = '';
-    public string $imageUrl = '';
-    
-    function __construct() {}
-    function __toString() {
-        return "[{$this->href}] [{$this->imageUrl}] {$this->pageNo} // {$this->title}";
-    }
-};
-
-$comicPages = array();
-
-$archivePage = fetchUrl(Config::archiveUrl);
-
-foreach ($archivePage->find('div.textcontent ul') as $arc) {
-    $arcHtml = $arc->innerHTML;
-    
-    $comicLinks = $arc->find('a');
-    foreach ($comicLinks as $el) {
-        $link = new ComicPage;
-        
-        $link->href = Config::baseUrl . "/" . $el->href;
-        $link->pageNo = $el->innerText;
-
-        // we'll get the image URL from this later, but fetch it early
-        // in order to check that the page exists; if not, we'll
-        // exclude it from the feed
-        try {
-            $comicPage = fetchUrl($link->href);
-        } catch (Exception $exc) {
-            if ($exc->response->getStatusCode() === 404) {
-                Log::warn("Got HTTP 404 from {$link->href}; skipping");
-                continue;
-            };
-            // rethrow unrecognized error
-            throw $exc;
-        };
-
-        // "why not a regex?" well buddy i don't know if you've ever
-        // seen regexes but string walking is kinder to the brainmeats
-        $start = strpos($arcHtml, $el->href);
-        do {
-            $start += 1;
-        } while (substr($arcHtml, $start, 4) != '</a>');
-
-        $start += 4; // '</a>'
-        $start += 4; // ' // '
-
-        $end = strpos($arcHtml, '<br', $start);
-        if (!$end) {
-            $end = strpos($arcHtml, '</', $start);
-        };
-
-        if (!$end) {
-            throw new Exception("Failed to find a title endpoint at {$el} ({$el->href})");
-        };
-        
-        $link->title = trim(substr($arcHtml, $start, $end - $start));
-        
-        $link->imageUrl = Config::baseUrl . '/' . $comicPage->find('div.page img')[0]->src;
-
-        array_push($comicPages, $link);
-        Log::debug($link);
-
-        usleep(250000); // don't be too rude
-    };
-};
+// generateFeed() is defined in lib/generate-feed.php, which none of the
+// includes visible in this script load; pull it in before the call.
+// include_once is a no-op if the file has already been loaded.
+include_once('lib/generate-feed.php');
+
+// generateFeed() stamps each successive item one second older than the one
+// before it, so the archive list is handed over reversed: the last archive
+// entry becomes the newest feed item.
+$content = fetchSiteContent();
+$feedXml = generateFeed(array_reverse($content));

+Log::info("writing " . strlen($feedXml) . " bytes to " . Config::feedPath);
+// file_put_contents() returns false on failure; surface that instead of
+// exiting as if the feed had been written.
+if (file_put_contents(Config::feedPath, $feedXml, LOCK_EX) === false) {
+    throw new \RuntimeException("failed to write feed to " . Config::feedPath);
+};
--- a/templates/feed.xml
+++ b/templates/feed.xml
@ -1,24 +1,17 @@
 <?xml version="1.0" encoding="utf-8"?>
-<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
 {% autoescape %}
+<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
  <channel>
-    <title>
-      {{ title }}
-    </title>
-    <description>
-      {{ description }}
-    </description>
+    <title>{{ title }}</title>
+    <description>{{ description }}</description>
    <link>{{ baseUrl }}</link>
    <atom:link href="{{ feedUrl }}" rel="self" type="application/rss+xml" />
    <pubDate>{{ date }}</pubDate>
    <lastBuildDate>{{ date }}</lastBuildDate>
    <generator>{{ date }}</generator>
-    <items>
-      
 {% endautoescape %}
-      {{ items }}
-      
-    </items>
+    {{ items }}
+{% autoescape %}
  </channel>
-
 </rss>
+{% endautoescape %}
--- a/templates/item-content.html
+++ b/templates/item-content.html
@ -0,0 +1,3 @@
+<a style="border: 0; text-decoration: none;" href="{{ url }}">
+  <img src="{{ imageUrl }}" />
+</a>
--- a/templates/item.xml
+++ b/templates/item.xml
@ -1,10 +1,6 @@
 <item>
  <title>{{ title }}</title>
-  <description>
-    <a style="border: 0; text-decoration: none;" href="{{ url }}">
-      <img src="{{ imageUrl }}" />
-    </a>
-  </description>
+  <description>{{ content }}</description>
  <pubDate>{{ date }}</pubDate>
  <link>{{ url }}</link>
  <guid isPermaLink="true">{{ url }}</guid>