Temp before using archive page

This commit is contained in:
Alexis Marie Wright 2022-03-13 21:09:19 -04:00
parent ef399a7c1c
commit d7fd21f338
3 changed files with 61 additions and 7 deletions

View File

@ -20,10 +20,11 @@ class Config {
// which log levels to emit (true) or suppress (false)
// nothing below 'info' should be important in normal operation
const logLevels = [
'warn' => true,
'info' => true,
'debug' => true,
'silly' => false
];
// if set, fetch only this many pages while spidering (false to fetch all)
const TEST_stopAfter = 2;
const TEST_stopAfter = false;
}

View File

@ -44,17 +44,58 @@ function fetchPages() {
$dom = new DOM;
$dom->loadStr($res);
$pageNumber = null;
// extract page number (if any) from this page's URL
$matches = array(null, null);
if ($url !== Config::baseURL && 0 === preg_match('/comic(\d+)\.php$/', $url, $matches)) {
throw new Exception("failed to find a page number in $url");
if ($url === Config::baseURL) {
Log::debug("$url is the root; will patch up page number later");
} else {
$matches = array();
if (preg_match('/comic(\d+)\.php$/', $url, $matches)) {
$pageNumber = $matches[1];
} else {
Log::debug("failed to find a page number in $url, trying special cases");
if (preg_match('/arc(\d+)\-(front|back)/', $url, $matches)) {
$pageNumber = "Arc " . $matches[1] . ": " . ucfirst($matches[2]) . " Cover";
}
elseif (preg_match('/cover13/', $url)) {
$pageNumber = 'Arc 13: Front Cover';
}
elseif (preg_match('/back12/', $url)) {
$pageNumber = 'Arc 12: Back Cover';
}
elseif (preg_match('/arc12splash/', $url)) {
$pageNumber = 'Arc 12: Front Cover';
}
elseif (preg_match('/arc11/', $url)) {
$pageNumber = 'Arc 11: Front Cover';
}
elseif (preg_match('/back10/', $url)) {
$pageNumber = 'Arc 10: Back Cover';
}
elseif (preg_match('/splash10/', $url)) {
$pageNumber = 'Arc 10: Cover';
}
elseif (preg_match('/arc9/', $url)) {
$pageNumber = 'Arc 9: Cover';
}
elseif (preg_match('/arc8/', $url)) {
$pageNumber = 'Arc 8: Cover';
};
};
if ($pageNumber === null) {
throw new Exception("failed to find a page number in $url");
};
};
array_push($content, [
'url' => $url,
'imageUrl' => Config::baseURL . '/' . $dom->find('div.page img')[0]->src,
'body' => $dom,
'pageNumber' => $matches[1]
'pageNumber' => $pageNumber
]);
// extract next page URL from this page's content
@ -83,8 +124,16 @@ function fetchPages() {
usleep(500000); // wait half a second, to be minimally polite
};
// patch up the page number on the title page
$content[0]['pageNumber'] = $content[1]['pageNumber'] + 1;
// patch up the page number on the title page by adding 1 to the
// first numeric page number we find, starting from the newest
// non-root page
foreach (array_slice($content, 1) as $page) {
if (preg_match('/^\d+$/', $page['pageNumber'])) {
$content[0]['pageNumber'] = $page['pageNumber'] + 1;
Log::info("patched up root page number to " . $content[0]['pageNumber']);
break;
};
};
return $content;
}

View File

@ -24,5 +24,9 @@ class Log {
/**
 * Log a message at the 'info' level.
 *
 * Thin convenience wrapper: forwards the message unchanged to the
 * class's own emit() with the level name 'info'.
 * NOTE(review): emit() is not visible in this view; presumably it decides
 * whether the level is actually written out -- confirm there.
 *
 * @param mixed $message the message to hand to emit()
 */
public static function info($message)
{
    self::emit('info', $message);
}
/**
 * Log a message at the 'warn' level.
 *
 * Thin convenience wrapper: forwards the message unchanged to the
 * class's own emit() with the level name 'warn'.
 * NOTE(review): emit() is not visible in this view; presumably it decides
 * whether the level is actually written out -- confirm there.
 *
 * @param mixed $message the message to hand to emit()
 */
public static function warn($message)
{
    self::emit('warn', $message);
}
};