rssmonster/lib/fetch-site-content.php

106 lines
3.2 KiB
PHP

<?php
declare(strict_types = 1);
include_once('config/default.php');
include_once('lib/log.php');
include_once('lib/fetch-url.php');
include_once('lib/comic-page.php');
/**
 * Scrape the comic archive page and build the list of comic pages for the feed.
 *
 * Every link found inside `div.textcontent ul` on the archive page
 * (Config::archiveUrl) becomes one candidate entry. A link whose absolute URL
 * matches the `href` of an item in $knownContent is reused as-is, without
 * refetching. Every other link is fetched: pages answering HTTP 404 are
 * skipped, and the rest receive a title (parsed from the archive markup that
 * follows the link), an image URL (first `div.page img` on the comic page) and
 * the next unused serial number.
 *
 * @param iterable|null $knownContent Previously fetched items, each exposing
 *                                    `href` and `serial`; empty or null means
 *                                    nothing is cached.
 * @return array Entries in archive order: reused known items and newly built
 *               ComicPage objects.
 * @throws Exception When a link, its title or its image cannot be located, or
 *                   when fetching a page fails with anything but an HTTP 404.
 */
function fetchSiteContent($knownContent = array()) {
    $comicPages = array();
    $knownPageUrls = array();

    // Index the known items by URL and find the highest serial handed out so far.
    $highestSerial = 0;
    if ($knownContent) {
        foreach ($knownContent as $knownItem) {
            $knownPageUrls[$knownItem->href] = $knownItem;
            if ($knownItem->serial > $highestSerial) {
                $highestSerial = $knownItem->serial;
            }
        }
    }
    $nextSerial = $highestSerial + 1;

    $archivePage = fetchUrl(Config::archiveUrl);

    foreach ($archivePage->find('div.textcontent ul') as $arc) {
        $arcHtml = $arc->innerHTML;
        $arcLength = strlen($arcHtml);

        foreach ($arc->find('a') as $el) {
            $pageUrl = Config::baseUrl . "/" . $el->href;

            // isset() rather than @-suppression: same outcome, no hidden errors.
            if (isset($knownPageUrls[$pageUrl])) {
                Log::debug("Using cached version of " . $pageUrl);
                array_push($comicPages, $knownPageUrls[$pageUrl]);
                continue;
            }

            $link = new ComicPage;
            $link->href = $pageUrl;
            $link->pageNo = $el->innerText;

            // We only need the image URL from this page later on, but it is
            // fetched up front to verify that the page exists; pages that do
            // not exist are left out of the feed.
            try {
                $comicPage = fetchUrl($link->href);
            } catch (Exception $exc) {
                // Only inspect the status code when the exception really
                // carries a response; otherwise a non-HTTP failure would be
                // masked by a "call to a member function on null" error.
                $isNotFound = isset($exc->response)
                    && is_object($exc->response)
                    && method_exists($exc->response, 'getStatusCode')
                    && $exc->response->getStatusCode() === 404;
                if ($isNotFound) {
                    Log::warn("Got HTTP 404 from {$link->href}; skipping");
                    continue;
                }
                // rethrow unrecognized error
                throw $exc;
            }

            // The title is the text that follows this link in the archive
            // markup: locate the link, skip to the end of its closing tag,
            // skip the 4-character ' // ' separator, and read up to the next
            // '<br' (or, failing that, the next closing tag).
            $linkPos = strpos($arcHtml, $el->href);
            if ($linkPos === false) {
                throw new Exception("Failed to find the link {$el->href} in the archive markup");
            }
            // Bounded search: the previous byte-by-byte walk never terminated
            // when no '</a>' followed the link.
            $closeTagPos = ($linkPos + 1 <= $arcLength)
                ? strpos($arcHtml, '</a>', $linkPos + 1)
                : false;
            if ($closeTagPos === false) {
                throw new Exception("Failed to find the closing tag of the link {$el->href}");
            }
            $titleStart = $closeTagPos
                + 4  // '</a>'
                + 4; // ' // '

            // An offset beyond the end of the string makes strpos() fail hard
            // (ValueError on PHP 8), so treat it as "no title endpoint".
            $titleEnd = false;
            if ($titleStart <= $arcLength) {
                $titleEnd = strpos($arcHtml, '<br', $titleStart);
                if ($titleEnd === false) {
                    $titleEnd = strpos($arcHtml, '</', $titleStart);
                }
            }
            if ($titleEnd === false) {
                throw new Exception("Failed to find a title endpoint at {$el} ({$el->href})");
            }
            $link->title = trim(substr($arcHtml, $titleStart, $titleEnd - $titleStart));

            // Take the first image inside div.page; a missing image is
            // reported through the exception below rather than PHP warnings.
            $images = $comicPage->find('div.page img');
            $firstImage = $images[0] ?? null;
            $link->imageUrl = ($firstImage === null) ? null : $firstImage->src;
            if (!$link->imageUrl) {
                throw new Exception("Failed to find an image in {$el} ({$el->href})");
            }

            // The serial is assigned (and nextSerial advanced) only once we
            // know this item is definitely going into the content list.
            $link->serial = $nextSerial++;
            $link->imageUrl = Config::baseUrl . '/' . $link->imageUrl;
            array_push($comicPages, $link);

            Log::info("Fetched " . $link->href);
            Log::silly($link);
            usleep(250000); // don't be too rude
        }
    }

    Log::info("Finished fetching " . count($comicPages) . " pages");
    return $comicPages;
}