Read + parse archive page

This commit is contained in:
Alexis Marie Wright 2022-03-13 21:41:45 -04:00
parent d7fd21f338
commit 8d31c7ea29
3 changed files with 90 additions and 10 deletions

@@ -10,13 +10,14 @@ class Config {
 	// where to write the feed file
 	const feedPath = '/tmp/feed.xml';
-	// where to start spidering
-	const baseURL = "http://www.inhuman-comic.com";
+	// base URL of the site
+	const baseUrl = "http://www.inhuman-comic.com";
+	// archive page URL
+	const archiveUrl = 'http://www.inhuman-comic.com/archives.php';
 	// how the script identifies itself to the server while spidering
 	// (this will also be used as the "Generator" value in the feed XML)
 	const userAgent = 'Lexie\'s RSS Monster (for Cial) (lexie@alexis-marie-wright.me)';
 	// a CSS selector identifying the "next page" link to follow while spidering
 	const nextLinkSelector = 'div.body a.prev';
 	// which log levels to emit (true) or suppress (false)
 	// nothing below 'info' should be important in normal operation
 	const logLevels = [

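These are plain class constants on Config (defined in config/default.php), so the other scripts read them statically; note the baseURL -> baseUrl rename, which means any existing references to the old name need updating. A minimal sketch of the access pattern, with the variable names here purely illustrative:

<?php
include_once('config/default.php');

// Illustrative only: read the spider's endpoints and User-Agent from Config.
$archive = Config::archiveUrl;  // 'http://www.inhuman-comic.com/archives.php'
$agent   = Config::userAgent;   // sent with every request while spidering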
lib/fetch-url.php Normal file
@@ -0,0 +1,29 @@
+<?php
+declare(strict_types = 1);
+
+include_once('config/default.php');
+
+use GuzzleHttp\Client as HTTPClient;
+use PHPHtmlParser\Dom as DOM;
+
+function fetchUrl(string $url) {
+    $client = new HTTPClient();
+    $res = $client->request(
+        'GET', $url, [
+            'http_errors' => false,
+            'headers' => [
+                'User-Agent' => Config::userAgent
+            ]
+        ]
+    );
+
+    if ($res->getStatusCode() !== 200) {
+        Log::debug($res->getBody());
+        // PHP can only throw Throwables, not bare strings
+        throw new Exception("Request for $url returned {$res->getStatusCode()}");
+    };
+
+    $dom = new DOM;
+    $dom->loadStr(strval($res->getBody()));
+
+    return $dom;
+};
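For reference, fetchUrl hands back a PHPHtmlParser\Dom instance, so callers can run CSS-selector queries on the result directly. A minimal usage sketch, assuming the Composer autoloader and lib/log.php are loaded first (as the main script does):

<?php
require_once('vendor/autoload.php');
include_once('lib/log.php');
include_once('lib/fetch-url.php');

// Fetch the archive page and print the target of every link on it.
$dom = fetchUrl(Config::archiveUrl);
foreach ($dom->find('a') as $a) {
    print $a->href . "\n";
}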

@@ -10,11 +10,61 @@ require_once('vendor/autoload.php');
 include_once('config/default.php');
 include_once('lib/log.php');
-include_once('lib/fetch-pages.php');
-include_once('lib/generate-feed.php');
+// include_once('lib/fetch-pages.php');
+// include_once('lib/generate-feed.php');
 
-$content = fetchPages();
-$feedXml = generateFeed($content);
+// $content = fetchPages();
+// $feedXml = generateFeed($content);
+
+// Log::info("writing " . strlen($feedXml) . " bytes to " . Config::feedPath);
+// file_put_contents(Config::feedPath, $feedXml, LOCK_EX);
+
+include_once('lib/fetch-url.php');
+
+$archivePage = fetchUrl(Config::archiveUrl);
+
+class ComicPageLink {
+    public string $href = '';
+    public string $pageNo = '';
+    public string $title = '';
+
+    function __construct() {}
+
+    function __toString() {
+        return "[{$this->href}] {$this->pageNo} // {$this->title}\n";
+    }
+};
+
+$comicPageLinks = array();
+
+foreach ($archivePage->find('div.textcontent ul') as $arc) {
+    $arcHtml = $arc->innerHTML;
+    $comicLinks = $arc->find('a');
+
+    foreach ($comicLinks as $el) {
+        $link = new ComicPageLink;
+
+        $start = strpos($arcHtml, $el->href);
+        do {
+            $start += 1;
+        } while (substr($arcHtml, $start, 4) != '</a>');
+        $start += 4; // skip past '</a>'
+        $start += 4; // skip past ' // '
+
+        $end = strpos($arcHtml, '<br', $start);
+        if (!$end) {
+            $end = strpos($arcHtml, '</', $start);
+        };
+        if (!$end) {
+            throw new Exception("Failed to find a title endpoint at {$el} ({$el->href})");
+        };
+
+        $link->title = trim(substr($arcHtml, $start, $end - $start));
+        $link->href = Config::baseUrl . "/" . $el->href;
+        $link->pageNo = $el->innerText;
+
+        print $link;
+    };
+};
-Log::info("writing " . strlen($feedXml) . " bytes to " . Config::feedPath);
-file_put_contents(Config::feedPath, $feedXml, LOCK_EX);
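The scanning loop above leans on the archive page's markup: each list entry is expected to look roughly like <a href="...">pageNo</a> // title<br>, with the title sitting as bare text between the closing </a> (plus the ' // ' separator) and the next <br> or closing tag. A self-contained sketch of the same extraction against a hypothetical sample of that markup, using a single regex instead of the manual strpos/substr walk:

<?php
// Hypothetical archive markup in the shape the loop expects.
$arcHtml = '<ul>'
    . '<li><a href="page123.php">123</a> // The One With The Robot<br></li>'
    . '<li><a href="page124.php">124</a> // The One After That<br></li>'
    . '</ul>';

// One pattern captures href, page number, and title in a single pass.
preg_match_all(
    '#<a href="(?<href>[^"]+)">(?<pageNo>[^<]+)</a> // (?<title>[^<]+)#',
    $arcHtml,
    $matches,
    PREG_SET_ORDER
);

foreach ($matches as $m) {
    printf("[%s] %s // %s\n", $m['href'], $m['pageNo'], trim($m['title']));
}

Either approach prints lines like [page123.php] 123 // The One With The Robot. The strpos walk in the committed code tolerates extra attributes inside the <a> tag, at the cost of looping forever if '</a>' never appears after the href.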