From 3d580f4838e60459fc4ae6ddf6dd5645f9f36a3d Mon Sep 17 00:00:00 2001
From: Alexis Marie Wright
Date: Fri, 4 Mar 2022 20:10:22 -0500
Subject: [PATCH] Cleanup & result capture

---
 lib/fetch-pages.php | 86 +++++++++++++++++++++++++++++++++++++++++++++
 rssmonster.php      | 80 ++++++++++------------------------------------
 2 files changed, 105 insertions(+), 61 deletions(-)
 create mode 100644 lib/fetch-pages.php

diff --git a/lib/fetch-pages.php b/lib/fetch-pages.php
new file mode 100644
index 0000000..fb8872a
--- /dev/null
+++ b/lib/fetch-pages.php
@@ -0,0 +1,86 @@
+<?php
+
+use GuzzleHttp\Client as HTTPClient;
+use PHPHtmlParser\Dom as DOM;
+
+function fetchUrl(string $url) {
+    $client = new HTTPClient();
+
+    $res = $client->request(
+        'GET', $url, [
+            'http_errors' => false,
+            'headers' => [
+                'User-Agent' => Config::userAgent
+            ]
+        ]
+    );
+
+    if ($res->getStatusCode() !== 200) {
+        Log::debug($res->getBody());
+        throw new Exception("Request for $url returned {$res->getStatusCode()}");
+    };
+
+    return strval($res->getBody());
+};
+
+function fetchPages() {
+    $urls = [Config::baseURL];
+    $fetchedUrls = [];
+    $content = [];
+
+    while (count($urls) > 0) {
+        $url = array_shift($urls);
+
+        $res = fetchUrl($url);
+        Log::info("$url: fetched " . strlen($res) . " bytes");
+
+        Log::silly($res);
+
+        array_push($fetchedUrls, $url);
+
+        $dom = new DOM;
+        $dom->loadStr($res);
+
+        // extract page number (if any) from this page's URL
+        $matches = array(null, null);
+        if ($url !== Config::baseURL && 0 === preg_match('/comic(\d+)\.php$/', $url, $matches)) {
+            throw new Exception("failed to find a page number in $url");
+        };
+
+        array_push($content, [
+            'url' => $url,
+            'body' => $dom,
+            'pageNumber' => $matches[1]
+        ]);
+
+        // extract next page URL from this page's content
+        foreach ($dom->find(Config::nextLinkSelector) as $el) {
+            if (Config::TEST_stopAfter && count($fetchedUrls) >= Config::TEST_stopAfter) {
+                Log::debug("stopping retrieval after " . Config::TEST_stopAfter . " fetches");
+                break;
+            };
+
+            $nextUrl = Config::baseURL . '/' . $el->href;
+
+            if (in_array($nextUrl, $fetchedUrls)) {
+                Log::debug("$url: already fetched next link $nextUrl");
+                continue;
+            };
+
+            if (in_array($nextUrl, $urls)) {
+                Log::debug("$url: duplicate next link $nextUrl");
+                continue;
+            };
+
+            Log::info("$url: next link $nextUrl");
+            array_push($urls, $nextUrl);
+        };
+
+        usleep(500000); // wait half a second, to be minimally polite
+    };
+
+    // patch up the page number on the title page
+    $content[0]['pageNumber'] = $content[1]['pageNumber'] + 1;
+
+    return $content;
+}
\ No newline at end of file
diff --git a/rssmonster.php b/rssmonster.php
index bf5cd86..f0ce8f9 100644
--- a/rssmonster.php
+++ b/rssmonster.php
@@ -7,71 +7,29 @@
 error_reporting(E_ALL & ~E_DEPRECATED);
 
 require_once('vendor/autoload.php');
 
-use GuzzleHttp\Client as HTTPClient;
-use PHPHtmlParser\Dom as DOM;
-
 include_once('config/default.php');
 include_once('lib/log.php');
+include_once('lib/fetch-pages.php');
 
-function fetchUrl(string $url) {
-    $client = new HTTPClient();
-
-    $res = $client->request(
-        'GET', $url, [
-            'http_errors' => false,
-            'headers' => [
-                'User-Agent' => Config::userAgent
-            ]
-        ]
-    );
-
-    if ($res->getStatusCode() !== 200) {
-        Log::debug($res->getBody());
-        throw "Request for $url returned {$res->getStatusCode()}";
-    };
-
-    return strval($res->getBody());
-};
-
-$urls = [Config::baseURL];
-$fetchedUrls = [];
-
-while (count($urls) > 0) {
-    $url = array_shift($urls);
-
-    $res = fetchUrl($url);
-    Log::info("$url: fetched " . strlen($res) . " bytes");
-
-    Log::silly($res);
-
-    array_push($fetchedUrls, $url);
-
-    $dom = new DOM;
-    $dom->loadStr($res);
-
-    foreach (array($dom->find(Config::nextLinkSelector)) as $el) {
-        if (Config::TEST_stopAfter && count($fetchedUrls) >= Config::TEST_stopAfter) {
-            Log::debug("stopping retrieval after " . Config::TEST_stopAfter . " fetches");
-            break;
-        };
-
-        $nextUrl = Config::baseURL . '/' . $el->href;
-
-        if (array_search($nextUrl, $fetchedUrls)) {
-            Log::debug("$url: already fetched next link $nextUrl");
-            continue;
-        };
-
-        if (array_search($nextUrl, $urls)) {
-            Log::debug("$url: duplicate next link $nextUrl");
-            continue;
-        };
-
-        Log::info("$url: next link $nextUrl");
-        array_push($urls, $nextUrl);
-    };
-
-    usleep(500000); // wait half a second, to be minimally polite
-};
+// --- this'll be in a lib someday
+function parsePageForFeedContent(string $url, \PHPHtmlParser\Dom $page) {
+    $imageUrl = Config::baseURL . '/' . $page->find('div.page img')[0]->src;
+
+    return [
+        'title' => 'INHUMAN | Page ###',
+        'link' => $url,
+        'guid' => $url,
+        'description' =>
+            "<p>" .
+            "<img src=\"$imageUrl\">" .
+            "</p>"
+    ];
+};
+
+// --- ok back to the real shit
+$content = fetchPages();
+
+foreach ($content as $result) {
+    print "{$result['url']} {$result['pageNumber']}\n";
+};
\ No newline at end of file