From 3d580f4838e60459fc4ae6ddf6dd5645f9f36a3d Mon Sep 17 00:00:00 2001
From: Alexis Marie Wright
Date: Fri, 4 Mar 2022 20:10:22 -0500
Subject: [PATCH] Cleanup & result capture

---
 lib/fetch-pages.php | 86 +++++++++++++++++++++++++++++++++++++++++++++
 rssmonster.php      | 80 ++++++++++------------------------------------
 2 files changed, 105 insertions(+), 61 deletions(-)
 create mode 100644 lib/fetch-pages.php

diff --git a/lib/fetch-pages.php b/lib/fetch-pages.php
new file mode 100644
index 0000000..fb8872a
--- /dev/null
+++ b/lib/fetch-pages.php
@@ -0,0 +1,86 @@
+<?php
+
+use GuzzleHttp\Client as HTTPClient;
+use PHPHtmlParser\Dom as DOM;
+
+function fetchUrl(string $url) {
+    $client = new HTTPClient();
+
+    $res = $client->request(
+        'GET', $url, [
+            'http_errors' => false,
+            'headers' => [
+                'User-Agent' => Config::userAgent
+            ]
+        ]
+    );
+
+    if ($res->getStatusCode() !== 200) {
+        Log::debug($res->getBody());
+        throw new Exception("Request for $url returned {$res->getStatusCode()}");
+    };
+
+    return strval($res->getBody());
+};
+
+function fetchPages() {
+    $urls = [Config::baseURL];
+    $fetchedUrls = [];
+    $content = [];
+
+    while (count($urls) > 0) {
+        $url = array_shift($urls);
+
+        $res = fetchUrl($url);
+        Log::info("$url: fetched " . strlen($res) . " bytes");
+
+        Log::silly($res);
+
+        array_push($fetchedUrls, $url);
+
+        $dom = new DOM;
+        $dom->loadStr($res);
+
+        // extract page number (if any) from this page's URL
+        $matches = array(null, null);
+        if ($url !== Config::baseURL && 0 === preg_match('/comic(\d+)\.php$/', $url, $matches)) {
+            throw new Exception("failed to find a page number in $url");
+        };
+
+        array_push($content, [
+            'url' => $url,
+            'body' => $dom,
+            'pageNumber' => $matches[1]
+        ]);
+
+        // extract next page URL from this page's content
+        foreach ($dom->find(Config::nextLinkSelector) as $el) {
+            if (Config::TEST_stopAfter && count($fetchedUrls) >= Config::TEST_stopAfter) {
+                Log::debug("stopping retrieval after " . Config::TEST_stopAfter . " fetches");
+                break;
+            };
+
+            $nextUrl = Config::baseURL . '/' . $el->href;
+
+            if (in_array($nextUrl, $fetchedUrls)) {
+                Log::debug("$url: already fetched next link $nextUrl");
+                continue;
+            };
+
+            if (in_array($nextUrl, $urls)) {
+                Log::debug("$url: duplicate next link $nextUrl");
+                continue;
+            };
+
+            Log::info("$url: next link $nextUrl");
+            array_push($urls, $nextUrl);
+        };
+
+        usleep(500000); // wait half a second, to be minimally polite
+    };
+
+    // patch up the page number on the title page
+    $content[0]['pageNumber'] = $content[1]['pageNumber'] + 1;
+
+    return $content;
+}
\ No newline at end of file
diff --git a/rssmonster.php b/rssmonster.php
index bf5cd86..f0ce8f9 100644
--- a/rssmonster.php
+++ b/rssmonster.php
@@ -7,71 +7,29 @@
 error_reporting(E_ALL & ~E_DEPRECATED);
 
 require_once('vendor/autoload.php');
 
-use GuzzleHttp\Client as HTTPClient;
-use PHPHtmlParser\Dom as DOM;
-
 include_once('config/default.php');
 include_once('lib/log.php');
+include_once('lib/fetch-pages.php');
 
-function fetchUrl(string $url) {
-    $client = new HTTPClient();
-
-    $res = $client->request(
-        'GET', $url, [
-            'http_errors' => false,
-            'headers' => [
-                'User-Agent' => Config::userAgent
-            ]
-        ]
-    );
-
-    if ($res->getStatusCode() !== 200) {
-        Log::debug($res->getBody());
-        throw "Request for $url returned {$res->getStatusCode()}";
-    };
-
-    return strval($res->getBody());
-};
-
-$urls = [Config::baseURL];
-$fetchedUrls = [];
-
-while (count($urls) > 0) {
-    $url = array_shift($urls);
-
-    $res = fetchUrl($url);
-    Log::info("$url: fetched " . strlen($res) . " bytes");
-
-    Log::silly($res);
-
-    array_push($fetchedUrls, $url);
-
-    $dom = new DOM;
-    $dom->loadStr($res);
-
-    foreach (array($dom->find(Config::nextLinkSelector)) as $el) {
-        if (Config::TEST_stopAfter && count($fetchedUrls) >= Config::TEST_stopAfter) {
-            Log::debug("stopping retrieval after " . Config::TEST_stopAfter . " fetches");
-            break;
-        };
-
-        $nextUrl = Config::baseURL . '/' . $el->href;
-
-        if (array_search($nextUrl, $fetchedUrls)) {
-            Log::debug("$url: already fetched next link $nextUrl");
-            continue;
-        };
-
-        if (array_search($nextUrl, $urls)) {
-            Log::debug("$url: duplicate next link $nextUrl");
-            continue;
-        };
-
-        Log::info("$url: next link $nextUrl");
-        array_push($urls, $nextUrl);
-    };
-
-    usleep(500000); // wait half a second, to be minimally polite
-};
+// --- this'll be in a lib someday
+function parsePageForFeedContent(string $url, \PHPHtmlParser\Dom $page) {
+    $imageUrl = Config::baseURL . '/' . $page->find('div.page img')[0]->src;
+
+    return [
+        'title' => 'INHUMAN | Page ###',
+        'link' => $url,
+        'guid' => $url,
+        'description' =>
+            "<p>" .
+            "<img src=\"$imageUrl\">" .
+            "</p>"
+    ];
+};
+
+// --- ok back to the real shit
+$content = fetchPages();
+
+foreach ($content as $result) {
+    print "{$result['url']} {$result['pageNumber']}\n";
+};
\ No newline at end of file