139 lines
4.4 KiB
PHP
139 lines
4.4 KiB
PHP
<?php
|
|
declare(strict_types = 1);
|
|
|
|
include_once('config/default.php');
|
|
|
|
use GuzzleHttp\Client as HTTPClient;
|
|
use PHPHtmlParser\Dom as DOM;
|
|
|
|
/**
 * Fetch a URL with an HTTP GET request and return the response body.
 *
 * The request carries the User-Agent from Config::userAgent. Guzzle's
 * `http_errors` option is turned off, so error statuses come back as plain
 * responses and are checked here rather than raised by the client.
 *
 * @param string $url URL to request
 * @return string the response body
 * @throws Exception if the response status is anything other than 200
 */
function fetchUrl(string $url): string {
    $client = new HTTPClient();

    $res = $client->request('GET', $url, [
        'http_errors' => false,
        'headers' => [
            'User-Agent' => Config::userAgent,
        ],
    ]);

    $status = $res->getStatusCode();

    if ($status !== 200) {
        // getBody() returns a stream object; cast so the logger gets a string
        // like every other Log call in this file
        Log::debug((string) $res->getBody());

        // PHP can only throw objects: throwing a bare string is a fatal Error
        // that would mask the real failure
        throw new Exception("Request for $url returned $status");
    }

    return strval($res->getBody());
}
|
|
|
|
/**
 * Derive a page label from a page URL.
 *
 * URLs ending in `comic<digits>.php` yield the digits as a string. Anything
 * else is matched against a list of cover/splash URL patterns, each of which
 * maps to a fixed "Arc N: ... Cover" label.
 *
 * @param string $url page URL to inspect
 * @return string|null the label, or null if no pattern matches
 */
function parsePageNumber(string $url): ?string {
    $matches = [];

    if (preg_match('/comic(\d+)\.php$/', $url, $matches)) {
        return $matches[1];
    }

    Log::debug("failed to find a page number in $url, trying special cases");

    if (preg_match('/arc(\d+)\-(front|back)/', $url, $matches)) {
        return "Arc " . $matches[1] . ": " . ucfirst($matches[2]) . " Cover";
    }

    // One-off URL patterns, tried in order; the first match wins.
    $specialCases = [
        '/cover13/'     => 'Arc 13: Front Cover',
        '/back12/'      => 'Arc 12: Back Cover',
        '/arc12splash/' => 'Arc 12: Front Cover',
        '/arc11/'       => 'Arc 11: Front Cover',
        '/back10/'      => 'Arc 10: Back Cover',
        '/splash10/'    => 'Arc 10: Cover',
        '/arc9/'        => 'Arc 9: Cover',
        '/arc8/'        => 'Arc 8: Cover',
    ];

    foreach ($specialCases as $pattern => $label) {
        if (preg_match($pattern, $url)) {
            return $label;
        }
    }

    return null;
}

/**
 * Crawl pages starting at Config::baseURL, following the links matched by
 * Config::nextLinkSelector, and collect one record per fetched page.
 *
 * Each record is an array with keys:
 *   - 'url'        the page URL
 *   - 'imageUrl'   Config::baseURL . '/' . the `src` of the first
 *                  `div.page img` element
 *   - 'body'       the parsed DOM of the page
 *   - 'pageNumber' the label from parsePageNumber(); for the root page it is
 *                  filled in afterwards (see the end of this function) and
 *                  stays null if no other page has a purely numeric label
 *
 * If Config::TEST_stopAfter is truthy, no new links are queued once that many
 * pages have been fetched. The crawl sleeps half a second after every page.
 *
 * @return array list of page records, in fetch order (root first)
 * @throws Exception if a fetch fails (from fetchUrl) or a non-root URL yields
 *                   no page number
 */
function fetchPages(): array {
    $urls = [Config::baseURL];
    $fetchedUrls = [];
    $content = [];

    while (count($urls) > 0) {
        $url = array_shift($urls);

        $res = fetchUrl($url);
        Log::info("$url: fetched " . strlen($res) . " bytes");

        Log::silly($res);

        $fetchedUrls[] = $url;

        $dom = new DOM;
        $dom->loadStr($res);

        $pageNumber = null;

        // extract page number (if any) from this page's URL
        if ($url === Config::baseURL) {
            Log::debug("$url is the root; will patch up page number later");
        } else {
            $pageNumber = parsePageNumber($url);

            if ($pageNumber === null) {
                throw new Exception("failed to find a page number in $url");
            }
        }

        // NOTE(review): assumes every page contains a `div.page img` element
        // -- confirm, or decide how a page without one should be handled
        $content[] = [
            'url' => $url,
            'imageUrl' => Config::baseURL . '/' . $dom->find('div.page img')[0]->src,
            'body' => $dom,
            'pageNumber' => $pageNumber,
        ];

        // extract next page URL(s) from this page's content
        if (Config::TEST_stopAfter && count($fetchedUrls) >= Config::TEST_stopAfter) {
            Log::debug("stopping retrieval after " . Config::TEST_stopAfter . " fetches");
        } else {
            // Iterate the matched elements themselves. Wrapping the result of
            // find() in an array would loop once over the whole collection
            // instead of once per link.
            foreach ($dom->find(Config::nextLinkSelector) as $el) {
                $nextUrl = Config::baseURL . '/' . $el->href;

                // in_array(), not array_search(): the latter returns index 0
                // for a match on the first element, which is falsy
                if (in_array($nextUrl, $fetchedUrls, true)) {
                    Log::debug("$url: already fetched next link $nextUrl");
                    continue;
                }

                if (in_array($nextUrl, $urls, true)) {
                    Log::debug("$url: duplicate next link $nextUrl");
                    continue;
                }

                Log::info("$url: next link $nextUrl");
                $urls[] = $nextUrl;
            }
        }

        usleep(500000); // wait half a second, to be minimally polite
    }

    // patch up the page number on the title page by adding 1 to the
    // first numeric page number we find, starting from the newest
    // non-root page
    foreach (array_slice($content, 1) as $page) {
        if (preg_match('/^\d+$/', $page['pageNumber'])) {
            $content[0]['pageNumber'] = (int) $page['pageNumber'] + 1;
            Log::info("patched up root page number to " . $content[0]['pageNumber']);
            break;
        }
    }

    return $content;
}