From 8d31c7ea2972552735ecc83925885ce51b2a1646 Mon Sep 17 00:00:00 2001 From: Alexis Marie Wright Date: Sun, 13 Mar 2022 21:41:45 -0400 Subject: [PATCH] Read + parse archive page --- config/default.php | 9 ++++--- lib/fetch-url.php | 29 ++++++++++++++++++++++ rssmonster.php | 62 +++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 90 insertions(+), 10 deletions(-) create mode 100644 lib/fetch-url.php diff --git a/config/default.php b/config/default.php index 588bbb5..d95e0e8 100644 --- a/config/default.php +++ b/config/default.php @@ -10,13 +10,14 @@ class Config { // where to write the feed file const feedPath = '/tmp/feed.xml'; - // where to start spidering - const baseURL = "http://www.inhuman-comic.com"; + // base URL of the site + const baseUrl = "http://www.inhuman-comic.com"; + // archive page URL + const archiveUrl = 'http://www.inhuman-comic.com/archives.php'; + // how the script identifies itself to the server while spidering // (this will also be used as the "Generator" value in the feed XML) const userAgent = 'Lexie\'s RSS Monster (for Cial) (lexie@alexis-marie-wright.me)'; - // a CSS selector identifying the "next page" link to follow while spidering - const nextLinkSelector = 'div.body a.prev'; // which log levels to emit (true) or suppress (false) // nothing below 'info' should be important in normal operation const logLevels = [ diff --git a/lib/fetch-url.php b/lib/fetch-url.php new file mode 100644 index 0000000..1877f92 --- /dev/null +++ b/lib/fetch-url.php @@ -0,0 +1,29 @@ +request( + 'GET', $url, [ + 'http_errors' => false, + 'headers' => [ + 'User-Agent' => Config::userAgent + ] + ] + ); + + if ($res->getStatusCode() !== 200) { + Log::debug($res->getBody()); + throw "Request for $url returned {$res->getStatusCode()}"; + }; + + $dom = new DOM; + $dom->loadStr(strval($res->getBody())); + return $dom; +}; diff --git a/rssmonster.php b/rssmonster.php index 5946cde..cca578a 100644 --- a/rssmonster.php +++ b/rssmonster.php @@ -10,11 +10,61 @@ require_once('vendor/autoload.php'); include_once('config/default.php'); include_once('lib/log.php'); -include_once('lib/fetch-pages.php'); -include_once('lib/generate-feed.php'); +// include_once('lib/fetch-pages.php'); +// include_once('lib/generate-feed.php'); -$content = fetchPages(); -$feedXml = generateFeed($content); +// $content = fetchPages(); +// $feedXml = generateFeed($content); + +// Log::info("writing " . strlen($feedXml) . " bytes to " . Config::feedPath); +// file_put_contents(Config::feedPath, $feedXml, LOCK_EX); + + +include_once('lib/fetch-url.php'); +$archivePage = fetchUrl(Config::archiveUrl); + +class ComicPageLink { + public string $href = ''; + public string $pageNo = ''; + public string $title = ''; + function __construct() {} + function __toString() { + return "[{$this->href}] {$this->pageNo} // {$this->title}\n"; + } +}; + +$comicPageLinks = array(); + +foreach ($archivePage->find('div.textcontent ul') as $arc) { + $arcHtml = $arc->innerHTML; + + $comicLinks = $arc->find('a'); + foreach ($comicLinks as $el) { + $link = new ComicPageLink; + + $start = strpos($arcHtml, $el->href); + do { + $start += 1; + } while (substr($arcHtml, $start, 4) != ''); + + $start += 4; // '' + $start += 4; // ' // ' + + $end = strpos($arcHtml, 'href})"); + }; + + $link->title = trim(substr($arcHtml, $start, $end - $start)); + + $link->href = Config::baseUrl . "/" . $el->href; + $link->pageNo = $el->innerText; + + print $link; + }; +}; -Log::info("writing " . strlen($feedXml) . " bytes to " . Config::feedPath); -file_put_contents(Config::feedPath, $feedXml, LOCK_EX);