Write RSS feed (correctly this time lol)

This commit is contained in:
Alexis Marie Wright 2022-03-14 01:26:41 -04:00
parent 90f17c68ca
commit 32eab358a6
8 changed files with 114 additions and 248 deletions

View File

@ -1,139 +0,0 @@
<?php
declare(strict_types = 1);
include_once('config/default.php');
use GuzzleHttp\Client as HTTPClient;
use PHPHtmlParser\Dom as DOM;
/**
 * Fetch a URL with an HTTP GET and return the response body as a string.
 *
 * The request carries the User-Agent from Config::userAgent. Guzzle's own
 * HTTP error exceptions are switched off ('http_errors' => false) so that
 * the status code can be inspected and reported here instead.
 *
 * @param string $url URL to request
 * @return string the response body
 * @throws Exception when the response status is anything other than 200
 */
function fetchUrl(string $url) {
    $client = new HTTPClient();
    $res = $client->request(
        'GET', $url, [
            'http_errors' => false,
            'headers' => [
                'User-Agent' => Config::userAgent
            ]
        ]
    );

    if ($res->getStatusCode() !== 200) {
        // log the body of the failed response to help diagnose the failure
        Log::debug($res->getBody());
        // PHP can only throw Throwable objects; throwing a bare string is
        // itself a fatal Error that hides the message we want to report
        throw new Exception("Request for $url returned {$res->getStatusCode()}");
    }

    return strval($res->getBody());
}
/**
 * Crawl the comic starting at Config::baseURL, following each page's
 * "next" link (Config::nextLinkSelector) until no unvisited links remain.
 *
 * Every fetched page is recorded with its URL, the URL of its comic image,
 * its parsed DOM, and a page label derived from the URL. The root page has
 * no number in its URL, so its label is patched up after the crawl.
 *
 * @return array list of ['url', 'imageUrl', 'body', 'pageNumber'] entries,
 *               in the order the pages were fetched
 * @throws Exception if a non-root URL yields no page label, or if fetchUrl()
 *                   fails
 */
function fetchPages() {
    // One-off cover pages whose URLs follow no pattern: URL fragment => label.
    // Checked in this order; the first fragment found in the URL wins.
    $specialCovers = [
        'cover13' => 'Arc 13: Front Cover',
        'back12' => 'Arc 12: Back Cover',
        'arc12splash' => 'Arc 12: Front Cover',
        'arc11' => 'Arc 11: Front Cover',
        'back10' => 'Arc 10: Back Cover',
        'splash10' => 'Arc 10: Cover',
        'arc9' => 'Arc 9: Cover',
        'arc8' => 'Arc 8: Cover',
    ];

    $urls = [Config::baseURL];
    $fetchedUrls = [];
    $content = [];

    while (count($urls) > 0) {
        $url = array_shift($urls);
        $res = fetchUrl($url);
        Log::info("$url: fetched " . strlen($res) . " bytes");
        Log::silly($res);
        array_push($fetchedUrls, $url);

        $dom = new DOM;
        $dom->loadStr($res);

        $pageNumber = null;
        // extract page number (if any) from this page's URL
        if ($url === Config::baseURL) {
            Log::debug("$url is the root; will patch up page number later");
        } else {
            $matches = array();
            if (preg_match('/comic(\d+)\.php$/', $url, $matches)) {
                $pageNumber = $matches[1];
            } else {
                Log::debug("failed to find a page number in $url, trying special cases");
                if (preg_match('/arc(\d+)\-(front|back)/', $url, $matches)) {
                    $pageNumber = "Arc " . $matches[1] . ": " . ucfirst($matches[2]) . " Cover";
                } else {
                    foreach ($specialCovers as $fragment => $label) {
                        if (strpos($url, $fragment) !== false) {
                            $pageNumber = $label;
                            break;
                        }
                    }
                }
            }

            if ($pageNumber === null) {
                throw new Exception("failed to find a page number in $url");
            }
        }

        array_push($content, [
            'url' => $url,
            'imageUrl' => Config::baseURL . '/' . $dom->find('div.page img')[0]->src,
            'body' => $dom,
            'pageNumber' => $pageNumber
        ]);

        // extract next page URL from this page's content
        // NOTE(review): array(...) wraps the find() result in a one-element
        // array, so this loop body runs once with the whole result as $el
        // rather than once per matched element -- confirm that is intended.
        foreach (array($dom->find(Config::nextLinkSelector)) as $el) {
            if (Config::TEST_stopAfter && count($fetchedUrls) >= Config::TEST_stopAfter) {
                Log::debug("stopping retrieval after " . Config::TEST_stopAfter . " fetches");
                break;
            }

            $nextUrl = Config::baseURL . '/' . $el->href;

            // in_array() rather than array_search(): array_search() returns
            // the matching index, and index 0 is falsy, so a match on the
            // first element used to be treated as "not found"
            if (in_array($nextUrl, $fetchedUrls, true)) {
                Log::debug("$url: already fetched next link $nextUrl");
                continue;
            }

            if (in_array($nextUrl, $urls, true)) {
                Log::debug("$url: duplicate next link $nextUrl");
                continue;
            }

            Log::info("$url: next link $nextUrl");
            array_push($urls, $nextUrl);
        }

        usleep(500000); // wait half a second, to be minimally polite
    }

    // patch up the page number on the title page by adding 1 to the
    // first numeric page number we find, starting from the newest
    // non-root page
    foreach (array_slice($content, 1) as $page) {
        if (preg_match('/^\d+$/', $page['pageNumber'])) {
            $content[0]['pageNumber'] = $page['pageNumber'] + 1;
            Log::info("patched up root page number to " . $content[0]['pageNumber']);
            break;
        }
    }

    return $content;
}

View File

@ -0,0 +1,82 @@
<?php
declare(strict_types = 1);
include_once('config/default.php');
include_once('lib/log.php');
include_once('lib/fetch-url.php');
/**
 * Plain record describing one comic page that will appear in the feed.
 * All fields default to the empty string and are filled in by the caller.
 */
class ComicPage {
    /** URL of the comic page itself */
    public string $href = '';

    /** page label, taken from the text of the archive link */
    public string $pageNo = '';

    /** page title, taken from the archive listing */
    public string $title = '';

    /** URL of the comic image shown on the page */
    public string $imageUrl = '';

    public function __construct() {}

    /**
     * One-line summary used for logging:
     * "[href] [imageUrl] pageNo // title".
     */
    public function __toString() {
        return sprintf(
            '[%s] [%s] %s // %s',
            $this->href,
            $this->imageUrl,
            $this->pageNo,
            $this->title
        );
    }
}
/**
 * Build the list of comic pages from the site's archive page.
 *
 * Walks every link in each 'div.textcontent ul' list on the archive page,
 * fetches the linked comic page to confirm it exists and to read its image
 * URL, and picks the page title out of the archive markup that follows the
 * link. Links whose page answers HTTP 404 are skipped.
 *
 * @return array list of ComicPage objects, in archive order
 * @throws Exception when a page fetch fails for a reason other than a 404,
 *                   or when a link's title cannot be located in the archive
 *                   markup
 */
function fetchSiteContent() {
    $comicPages = array();
    $archivePage = fetchUrl(Config::archiveUrl);

    foreach ($archivePage->find('div.textcontent ul') as $arc) {
        $arcHtml = $arc->innerHTML;
        $comicLinks = $arc->find('a');

        foreach ($comicLinks as $el) {
            $link = new ComicPage;
            $link->href = Config::baseUrl . "/" . $el->href;
            $link->pageNo = $el->innerText;

            // we'll get the image URL from this later, but fetch it early
            // in order to check that the page exists; if not, we'll
            // exclude it from the feed
            try {
                $comicPage = fetchUrl($link->href);
            } catch (Exception $exc) {
                // not every exception carries a response (e.g. a failure
                // before any HTTP reply); only inspect it when it is there,
                // so the original error is rethrown instead of being masked
                // by a null dereference
                $status = isset($exc->response) ? $exc->response->getStatusCode() : null;
                if ($status === 404) {
                    Log::warn("Got HTTP 404 from {$link->href}; skipping");
                    continue;
                }

                // rethrow unrecognized error
                throw $exc;
            }

            // "why not a regex?" well buddy i don't know if you've ever
            // seen regexes but string walking is kinder to the brainmeats
            $hrefAt = strpos($arcHtml, $el->href);
            if ($hrefAt === false) {
                throw new Exception("Failed to find link {$el->href} in the archive markup");
            }

            // first '</a>' after the href; searching with strpos() instead
            // of stepping one character at a time cannot loop forever when
            // the closing tag is missing
            $closeAt = strpos($arcHtml, '</a>', $hrefAt + 1);
            if ($closeAt === false) {
                throw new Exception("Failed to find the end of link {$el->href} in the archive markup");
            }

            $start = $closeAt;
            $start += 4; // '</a>'
            $start += 4; // ' // '

            // an offset beyond the end of the string makes strpos() raise
            // on PHP 8, so treat that case as "no title found" directly
            $end = false;
            if ($start <= strlen($arcHtml)) {
                $end = strpos($arcHtml, '<br', $start);
                if ($end === false) {
                    $end = strpos($arcHtml, '</', $start);
                }
            }

            if ($end === false) {
                throw new Exception("Failed to find a title endpoint at {$el} ({$el->href})");
            }

            $link->title = trim(substr($arcHtml, $start, $end - $start));
            $link->imageUrl = Config::baseUrl . '/' . $comicPage->find('div.page img')[0]->src;

            array_push($comicPages, $link);
            Log::debug($link);

            usleep(250000); // don't be too rude
        }
    }

    Log::info("Finished fetching " . count($comicPages) . " pages");
    return $comicPages;
}

View File

@ -9,26 +9,34 @@ function generateFeed($content) {
'autoescape' => false
]);
$now = date('c');
$ts = time();
$items = [];
foreach ($content as $result) {
foreach ($content as $page) {
$ts -= 1; // artificially enforce ordering on undated historical items
$now = date('c', $ts);
$content = $twig->render('item-content.html', [
'url' => $page->href,
'imageUrl' => $page->imageUrl
]);
$items[] = $twig->render('item.xml', [
'title' => $result['pageNumber'] . " | " . Config::feed['title'],
'url' => $result['url'],
'imageUrl' => $result['imageUrl'],
'date' => $now
'title' => htmlspecialchars($page->pageNo . " // " . $page->title),
'date' => $now,
'content' => htmlspecialchars($content)
]);
};
$feed = $twig->render('feed.xml', [
'title' => Config::feed['title'],
'description' => Config::feed['description'],
'baseUrl' => Config::baseURL,
'baseUrl' => Config::baseUrl,
'feedUrl' => Config::feed['feedUrl'],
'date' => $now,
'items' => implode("\n", $items)
]);
return $feed;
};
};

View File

@ -1,2 +0,0 @@
<?php
declare(strict_types = 1);

View File

@ -10,85 +10,10 @@ require_once('vendor/autoload.php');
include_once('config/default.php');
include_once('lib/log.php');
// include_once('lib/fetch-pages.php');
// include_once('lib/generate-feed.php');
include_once('lib/fetch-site-content.php');
// $content = fetchPages();
// $feedXml = generateFeed($content);
// Log::info("writing " . strlen($feedXml) . " bytes to " . Config::feedPath);
// file_put_contents(Config::feedPath, $feedXml, LOCK_EX);
include_once('lib/fetch-url.php');
/**
 * Plain record describing one comic page that will appear in the feed.
 * All fields default to the empty string and are filled in by the caller.
 */
class ComicPage {
    /** URL of the comic page itself */
    public string $href = '';

    /** page label, taken from the text of the archive link */
    public string $pageNo = '';

    /** page title, taken from the archive listing */
    public string $title = '';

    /** URL of the comic image shown on the page */
    public string $imageUrl = '';

    public function __construct() {}

    /**
     * One-line summary used for logging:
     * "[href] [imageUrl] pageNo // title".
     */
    public function __toString() {
        return sprintf(
            '[%s] [%s] %s // %s',
            $this->href,
            $this->imageUrl,
            $this->pageNo,
            $this->title
        );
    }
}
// Build the list of comic pages from the archive page: walk every link in
// each 'div.textcontent ul' list, fetch the linked page to confirm it exists
// and to read its image URL, and pick the title out of the archive markup
// that follows the link. Links whose page answers HTTP 404 are skipped.
$comicPages = array();
$archivePage = fetchUrl(Config::archiveUrl);

foreach ($archivePage->find('div.textcontent ul') as $arc) {
    $arcHtml = $arc->innerHTML;
    $comicLinks = $arc->find('a');

    foreach ($comicLinks as $el) {
        $link = new ComicPage;
        $link->href = Config::baseUrl . "/" . $el->href;
        $link->pageNo = $el->innerText;

        // we'll get the image URL from this later, but fetch it early
        // in order to check that the page exists; if not, we'll
        // exclude it from the feed
        try {
            $comicPage = fetchUrl($link->href);
        } catch (Exception $exc) {
            // not every exception carries a response (e.g. a failure
            // before any HTTP reply); only inspect it when it is there,
            // so the original error is rethrown instead of being masked
            // by a null dereference
            $status = isset($exc->response) ? $exc->response->getStatusCode() : null;
            if ($status === 404) {
                Log::warn("Got HTTP 404 from {$link->href}; skipping");
                continue;
            }

            // rethrow unrecognized error
            throw $exc;
        }

        // "why not a regex?" well buddy i don't know if you've ever
        // seen regexes but string walking is kinder to the brainmeats
        $hrefAt = strpos($arcHtml, $el->href);
        if ($hrefAt === false) {
            throw new Exception("Failed to find link {$el->href} in the archive markup");
        }

        // first '</a>' after the href; searching with strpos() instead
        // of stepping one character at a time cannot loop forever when
        // the closing tag is missing
        $closeAt = strpos($arcHtml, '</a>', $hrefAt + 1);
        if ($closeAt === false) {
            throw new Exception("Failed to find the end of link {$el->href} in the archive markup");
        }

        $start = $closeAt;
        $start += 4; // '</a>'
        $start += 4; // ' // '

        // an offset beyond the end of the string makes strpos() raise
        // on PHP 8, so treat that case as "no title found" directly
        $end = false;
        if ($start <= strlen($arcHtml)) {
            $end = strpos($arcHtml, '<br', $start);
            if ($end === false) {
                $end = strpos($arcHtml, '</', $start);
            }
        }

        if ($end === false) {
            throw new Exception("Failed to find a title endpoint at {$el} ({$el->href})");
        }

        $link->title = trim(substr($arcHtml, $start, $end - $start));
        $link->imageUrl = Config::baseUrl . '/' . $comicPage->find('div.page img')[0]->src;

        array_push($comicPages, $link);
        Log::debug($link);

        usleep(250000); // don't be too rude
    }
}
// Fetch the page list, render it as a feed in reverse of the order it was
// fetched, and write the result to Config::feedPath under an exclusive lock.
$content = fetchSiteContent();
$feedXml = generateFeed(array_reverse($content));
Log::info("writing " . strlen($feedXml) . " bytes to " . Config::feedPath);

// file_put_contents() returns false on failure; surface that instead of
// letting the run look successful with a missing or stale feed
if (file_put_contents(Config::feedPath, $feedXml, LOCK_EX) === false) {
    throw new RuntimeException("failed to write feed to " . Config::feedPath);
}

View File

@ -1,24 +1,17 @@
<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
{% autoescape %}
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
<channel>
<title>
{{ title }}
</title>
<description>
{{ description }}
</description>
<title>{{ title }}</title>
<description>{{ description }}</description>
<link>{{ baseUrl }}</link>
<atom:link href="{{ feedUrl }}" rel="self" type="application/rss+xml" />
<pubDate>{{ date }}</pubDate>
<lastBuildDate>{{ date }}</lastBuildDate>
<generator>{{ date }}</generator>
<items>
{% endautoescape %}
{{ items }}
</items>
{{ items }}
{% autoescape %}
</channel>
</rss>
{% endautoescape %}

View File

@ -0,0 +1,3 @@
<a style="border: 0; text-decoration: none;" href="{{ url }}">
<img src="{{ imageUrl }}" />
</a>

View File

@ -1,10 +1,6 @@
<item>
<title>{{ title }}</title>
<description>
<a style="border: 0; text-decoration: none;" href="{{ url }}">
<img src="{{ imageUrl }}" />
</a>
</description>
<description>{{ content }}</description>
<pubDate>{{ date }}</pubDate>
<link>{{ url }}</link>
<guid isPermaLink="true">{{ url }}</guid>