Cleanup & result capture
parent 8a7de04dde · commit 3d580f4838
lib/fetch-pages.php (new file)
@@ -0,0 +1,86 @@
+<?php
+
+use GuzzleHttp\Client as HTTPClient;
+use PHPHtmlParser\Dom as DOM;
+
+// expects config/default.php (Config) and lib/log.php (Log) to have been
+// included by the caller
+function fetchUrl(string $url) {
+    $client = new HTTPClient();
+
+    $res = $client->request(
+        'GET', $url, [
+            'http_errors' => false,
+            'headers' => [
+                'User-Agent' => Config::userAgent
+            ]
+        ]
+    );
+
+    if ($res->getStatusCode() !== 200) {
+        Log::debug($res->getBody());
+        // only objects implementing Throwable can be thrown in PHP
+        throw new Exception("Request for $url returned {$res->getStatusCode()}");
+    }
+
+    return strval($res->getBody());
+}
+
+function fetchPages() {
+    $urls = [Config::baseURL];
+    $fetchedUrls = [];
+    $content = [];
+
+    while (count($urls) > 0) {
+        $url = array_shift($urls);
+
+        $res = fetchUrl($url);
+        Log::info("$url: fetched " . strlen($res) . " bytes");
+
+        Log::silly($res);
+
+        array_push($fetchedUrls, $url);
+
+        $dom = new DOM;
+        $dom->loadStr($res);
+
+        // extract the page number (if any) from this page's URL
+        $matches = [null, null];
+        if ($url !== Config::baseURL && 0 === preg_match('/comic(\d+)\.php$/', $url, $matches)) {
+            throw new Exception("failed to find a page number in $url");
+        }
+
+        array_push($content, [
+            'url' => $url,
+            'body' => $dom,
+            'pageNumber' => $matches[1]
+        ]);
+
+        // extract the next-page URL from this page's content
+        foreach ($dom->find(Config::nextLinkSelector) as $el) {
+            if (Config::TEST_stopAfter && count($fetchedUrls) >= Config::TEST_stopAfter) {
+                Log::debug("stopping retrieval after " . Config::TEST_stopAfter . " fetches");
+                break;
+            }
+
+            $nextUrl = Config::baseURL . '/' . $el->href;
+
+            // array_search() can return index 0, which is falsy, so compare
+            // strictly against false
+            if (array_search($nextUrl, $fetchedUrls) !== false) {
+                Log::debug("$url: already fetched next link $nextUrl");
+                continue;
+            }
+
+            if (array_search($nextUrl, $urls) !== false) {
+                Log::debug("$url: duplicate next link $nextUrl");
+                continue;
+            }
+
+            Log::info("$url: next link $nextUrl");
+            array_push($urls, $nextUrl);
+        }
+
+        usleep(500000); // wait half a second, to be minimally polite
+    }
+
+    // patch up the page number on the title page (the base URL carries none)
+    $content[0]['pageNumber'] = $content[1]['pageNumber'] + 1;
+
+    return $content;
+}
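For reference, fetchPages() leans on four Config constants: userAgent, baseURL, nextLinkSelector, and TEST_stopAfter. config/default.php is not part of this commit; what follows is a minimal, hypothetical sketch of the shape the code above assumes, with placeholder values throughout:

<?php

// hypothetical sketch of config/default.php; only the constant names are
// taken from the code above, and all values here are placeholders
class Config {
    const userAgent = 'inhuman-feed/0.1';  // sent with every request
    const baseURL = 'https://example.com'; // crawl entry point (the title page)
    const nextLinkSelector = 'a.next';     // CSS selector for the next-page link
    const TEST_stopAfter = 0;              // when non-zero, stop after this many fetches
}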
@@ -7,71 +7,29 @@ error_reporting(E_ALL & ~E_DEPRECATED);
 
 require_once('vendor/autoload.php');
 
 use GuzzleHttp\Client as HTTPClient;
 use PHPHtmlParser\Dom as DOM;
 
 include_once('config/default.php');
 
 include_once('lib/log.php');
+include_once('lib/fetch-pages.php');
 
-function fetchUrl(string $url) {
-    $client = new HTTPClient();
-
-    $res = $client->request(
-        'GET', $url, [
-            'http_errors' => false,
-            'headers' => [
-                'User-Agent' => Config::userAgent
-            ]
-        ]
-    );
-
-    if ($res->getStatusCode() !== 200) {
-        Log::debug($res->getBody());
-        throw "Request for $url returned {$res->getStatusCode()}";
-    };
-
-    return strval($res->getBody());
-};
-
-$urls = [Config::baseURL];
-$fetchedUrls = [];
-
-while (count($urls) > 0) {
-    $url = array_shift($urls);
-
-    $res = fetchUrl($url);
-    Log::info("$url: fetched " . strlen($res) . " bytes");
-
-    Log::silly($res);
-
-    array_push($fetchedUrls, $url);
-
-    $dom = new DOM;
-    $dom->loadStr($res);
-
-    foreach (array($dom->find(Config::nextLinkSelector)) as $el) {
-        if (Config::TEST_stopAfter && count($fetchedUrls) >= Config::TEST_stopAfter) {
-            Log::debug("stopping retrieval after " . Config::TEST_stopAfter . " fetches");
-            break;
-        };
-
-        $nextUrl = Config::baseURL . '/' . $el->href;
-
-        if (array_search($nextUrl, $fetchedUrls)) {
-            Log::debug("$url: already fetched next link $nextUrl");
-            continue;
-        };
-
-        if (array_search($nextUrl, $urls)) {
-            Log::debug("$url: duplicate next link $nextUrl");
-            continue;
-        };
-
-        Log::info("$url: next link $nextUrl");
-        array_push($urls, $nextUrl);
-    };
-
-    usleep(500000); // wait half a second, to be minimally polite
-};
+// --- this'll be in a lib someday
+function parsePageForFeedContent(string $url, DOM $page) {
+    $imageUrl = Config::baseURL . '/' . $page->find('div.page img')[0]->src;
+
+    return [
+        'title' => 'INHUMAN | Page ###',
+        'link' => $url,
+        'guid' => $url,
+        'description' =>
+            "<a style=\"text-decoration: none; border: none;\" href=\"" . $url . "\">" .
+            "<img src=\"" . $imageUrl . "\" />" .
+            "</a>"
+    ];
+}
+// --- ok back to the real shit
+
+$content = fetchPages();
+
+foreach ($content as $result) {
+    print "{$result['url']} {$result['pageNumber']}\n";
+}
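The keys returned by parsePageForFeedContent (title, link, guid, description) map one-to-one onto RSS 2.0 item fields, so the captured results could be rendered straight into a feed. A minimal sketch of that step, assuming the items are collected into an array first; buildRss is a hypothetical helper, not part of this commit:

<?php

// hypothetical helper, not part of this commit: render feed items as RSS 2.0
function buildRss(array $items): string {
    $xml = new SimpleXMLElement('<rss version="2.0"><channel></channel></rss>');
    foreach ($items as $item) {
        $node = $xml->channel->addChild('item');
        // pre-escape values: SimpleXMLElement::addChild() does not escape ampersands
        $node->addChild('title', htmlspecialchars($item['title']));
        $node->addChild('link', htmlspecialchars($item['link']));
        $node->addChild('guid', htmlspecialchars($item['guid']));
        $node->addChild('description', htmlspecialchars($item['description']));
    }
    return $xml->asXML();
}

// usage, paired with the capture loop above:
// $items = array_map(fn ($r) => parsePageForFeedContent($r['url'], $r['body']), $content);
// file_put_contents('feed.xml', buildRss($items));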