106 lines
3.2 KiB
PHP
106 lines
3.2 KiB
PHP
<?php
|
|
declare(strict_types = 1);
|
|
|
|
include_once('config/default.php');
|
|
|
|
include_once('lib/log.php');
|
|
include_once('lib/fetch-url.php');
|
|
include_once('lib/comic-page.php');
|
|
|
|
/**
 * Builds the list of comic pages by walking the archive page at Config::archiveUrl.
 *
 * Every link found under `div.textcontent ul` becomes one entry. A link whose
 * absolute URL matches the `href` of an item in $knownContent reuses that item
 * as-is (nothing is fetched for it). Any other link is fetched, its title is
 * read from the archive markup and its image URL from the fetched page, and it
 * receives the next free serial, counting up from one above the highest serial
 * found in $knownContent.
 *
 * Pages that answer with HTTP 404 are logged and left out. After each newly
 * fetched page the function pauses for 250 ms.
 *
 * @param iterable|null $knownContent previously fetched items; each must expose
 *                                    `href` and `serial`
 * @return array comic pages (cached and new) in the order their links were found
 * @throws Exception when a title or an image cannot be located, or when
 *                   fetching fails with anything other than an HTTP 404
 */
function fetchSiteContent($knownContent = []): array
{
    $comicPages = [];
    $knownPageUrls = [];
    $highestSerial = 0;

    // Index the known items by URL and remember the highest serial in use.
    foreach ($knownContent ?: [] as $knownItem) {
        $knownPageUrls[$knownItem->href] = $knownItem;

        if ($knownItem->serial > $highestSerial) {
            $highestSerial = $knownItem->serial;
        }
    }

    $nextSerial = $highestSerial + 1;

    $archivePage = fetchUrl(Config::archiveUrl);

    foreach ($archivePage->find('div.textcontent ul') as $arc) {
        $arcHtml = (string) $arc->innerHTML;

        foreach ($arc->find('a') as $el) {
            $pageUrl = Config::baseUrl . "/" . $el->href;

            if (isset($knownPageUrls[$pageUrl])) {
                Log::debug("Using cached version of " . $pageUrl);
                $comicPages[] = $knownPageUrls[$pageUrl];
                continue;
            }

            $link = new ComicPage();
            $link->href = $pageUrl;
            $link->pageNo = $el->innerText;

            // The image URL is read from this page further down, but it is
            // fetched first so that pages which do not exist can be excluded
            // from the feed.
            try {
                $comicPage = fetchUrl($link->href);
            } catch (Exception $exc) {
                // Only an exception that carries a response can be a 404;
                // anything else must surface unchanged instead of failing on
                // a missing `response`.
                $response = $exc->response ?? null;

                if (is_object($response) && $response->getStatusCode() === 404) {
                    Log::warn("Got HTTP 404 from {$link->href}; skipping");
                    continue;
                }

                // rethrow unrecognized error
                throw $exc;
            }

            $link->title = extractComicTitle($arcHtml, $el);

            // Read the first matching image without tripping over an empty result.
            $images = $comicPage->find('div.page img');
            $image = $images[0] ?? null;
            $imageSrc = is_object($image) ? $image->src : null;

            if (!$imageSrc) {
                throw new Exception("Failed to find an image in {$el} ({$el->href})");
            }

            // The serial is assigned (and the counter advanced) only once it
            // is certain that this item will be added to the content list.
            $link->serial = $nextSerial++;
            $link->imageUrl = Config::baseUrl . '/' . $imageSrc;

            $comicPages[] = $link;
            Log::info("Fetched " . $link->href);
            Log::silly($link);

            usleep(250000); // don't be too rude
        }
    }

    Log::info("Finished fetching " . count($comicPages) . " pages");

    return $comicPages;
}

/**
 * Reads the title that follows a comic link inside its arc's HTML.
 *
 * The title is taken to start eight bytes after the link's closing tag begins
 * (four for `</a>`, four for the ` // ` separator) and to end at the next
 * `<br` or, when there is none, at the next closing tag.
 *
 * @param string $arcHtml inner HTML of the list that contains the link
 * @param object $el      link node; must expose `href` and be convertible to
 *                        a string (used in error messages only)
 * @return string the trimmed title
 * @throws Exception when the link, its closing tag or the end of the title
 *                   cannot be located
 */
function extractComicTitle(string $arcHtml, $el): string
{
    $hrefPos = findLinkHrefOffset($arcHtml, (string) $el->href);

    if ($hrefPos === false) {
        throw new Exception("Failed to find the link for {$el} ({$el->href})");
    }

    // First closing tag after the href; strpos() cannot run off the end of
    // the string the way a manual character walk can.
    $closePos = strpos($arcHtml, '</a>', $hrefPos + 1);

    if ($closePos === false) {
        throw new Exception("Failed to find a title start point at {$el} ({$el->href})");
    }

    $start = $closePos;
    $start += 4; // '</a>'
    $start += 4; // ' // '

    $end = false;

    // An offset past the end of the string makes strpos() throw on PHP 8, so
    // it is treated here as "no endpoint".
    if ($start <= strlen($arcHtml)) {
        $end = strpos($arcHtml, '<br', $start);

        if ($end === false) {
            $end = strpos($arcHtml, '</', $start);
        }
    }

    if ($end === false) {
        throw new Exception("Failed to find a title endpoint at {$el} ({$el->href})");
    }

    return trim(substr($arcHtml, $start, $end - $start));
}

/**
 * Finds the byte offset of a link's href inside a piece of HTML.
 *
 * A bare strpos() can land on a longer URL that merely contains the wanted
 * one ("page-1" inside "page-10"). An occurrence delimited like an attribute
 * value (a quote or "=" before it; a quote, ">" or whitespace after it) is
 * therefore preferred. When no occurrence is delimited that way, the first raw
 * occurrence is returned, which is what a bare strpos() would give.
 *
 * @param string $html HTML to search
 * @param string $href href value to look for
 * @return int|false offset of the match, or false when $href is empty or absent
 */
function findLinkHrefOffset(string $html, string $href)
{
    if ($href === '') {
        return false;
    }

    $firstMatch = false;
    $hrefLength = strlen($href);
    $htmlLength = strlen($html);
    $searchFrom = 0;

    while (($pos = strpos($html, $href, $searchFrom)) !== false) {
        if ($firstMatch === false) {
            $firstMatch = $pos;
        }

        $before = $pos > 0 ? $html[$pos - 1] : '';
        $afterPos = $pos + $hrefLength;
        $after = $afterPos < $htmlLength ? $html[$afterPos] : '';

        if (
            in_array($before, ['"', "'", '='], true)
            && in_array($after, ['"', "'", '>', ' ', "\t", "\r", "\n"], true)
        ) {
            return $pos;
        }

        $searchFrom = $pos + 1;
    }

    return $firstMatch;
}
|