Write RSS feed (correctly this time lol)
This commit is contained in:
parent
90f17c68ca
commit
32eab358a6
|
@ -1,139 +0,0 @@
|
|||
<?php
|
||||
declare(strict_types = 1);
|
||||
|
||||
include_once('config/default.php');
|
||||
|
||||
use GuzzleHttp\Client as HTTPClient;
|
||||
use PHPHtmlParser\Dom as DOM;
|
||||
|
||||
/**
 * Fetch a URL with an HTTP GET and return the response body as a string.
 *
 * The request carries the User-Agent from Config::userAgent. Guzzle's own
 * exceptions for HTTP error statuses are switched off ('http_errors' => false)
 * so that the status code can be inspected here instead.
 *
 * @param string $url URL to request
 * @return string the response body
 * @throws Exception when the response status is anything other than 200
 */
function fetchUrl(string $url): string {
    $client = new HTTPClient();

    $res = $client->request(
        'GET', $url, [
            'http_errors' => false,
            'headers' => [
                'User-Agent' => Config::userAgent
            ]
        ]
    );

    if ($res->getStatusCode() !== 200) {
        Log::debug($res->getBody());
        // PHP can only throw Throwable objects. Throwing a bare string raises
        // "Can only throw objects" as an Error, which hides this message and
        // is not caught by callers that catch Exception.
        throw new Exception("Request for $url returned {$res->getStatusCode()}");
    }

    return strval($res->getBody());
}
|
||||
|
||||
/**
 * Crawl the comic starting at Config::baseURL, following "next" links, and
 * collect one entry per fetched page.
 *
 * Each entry is an array with the keys 'url', 'imageUrl', 'body' (the parsed
 * DOM of the page) and 'pageNumber'. For non-root pages the page number is
 * derived from the URL: either the digits in "comic<N>.php" or a label for one
 * of the known cover-page URL patterns. The root page has no number in its
 * URL, so its entry is patched afterwards to (first numeric page number found
 * among the other entries) + 1.
 *
 * @return array list of page entries, in the order they were fetched
 * @throws Exception when no page number can be derived from a non-root URL;
 *                   anything raised by fetchUrl() also propagates
 */
function fetchPages() {
    $urls = [Config::baseURL];
    $fetchedUrls = [];
    $content = [];

    while (count($urls) > 0) {
        $url = array_shift($urls);

        $res = fetchUrl($url);
        Log::info("$url: fetched " . strlen($res) . " bytes");

        Log::silly($res);

        array_push($fetchedUrls, $url);

        $dom = new DOM;
        $dom->loadStr($res);

        $pageNumber = null;

        // extract page number (if any) from this page's URL
        if ($url === Config::baseURL) {
            Log::debug("$url is the root; will patch up page number later");
        } else {
            $matches = [];

            if (preg_match('/comic(\d+)\.php$/', $url, $matches)) {
                $pageNumber = $matches[1];
            } else {
                Log::debug("failed to find a page number in $url, trying special cases");

                // Cover pages do not follow the comic<N>.php scheme; the
                // patterns below are checked in order and the first hit wins.
                if (preg_match('/arc(\d+)\-(front|back)/', $url, $matches)) {
                    $pageNumber = "Arc " . $matches[1] . ": " . ucfirst($matches[2]) . " Cover";
                }
                elseif (preg_match('/cover13/', $url)) {
                    $pageNumber = 'Arc 13: Front Cover';
                }
                elseif (preg_match('/back12/', $url)) {
                    $pageNumber = 'Arc 12: Back Cover';
                }
                elseif (preg_match('/arc12splash/', $url)) {
                    $pageNumber = 'Arc 12: Front Cover';
                }
                elseif (preg_match('/arc11/', $url)) {
                    $pageNumber = 'Arc 11: Front Cover';
                }
                elseif (preg_match('/back10/', $url)) {
                    $pageNumber = 'Arc 10: Back Cover';
                }
                elseif (preg_match('/splash10/', $url)) {
                    $pageNumber = 'Arc 10: Cover';
                }
                elseif (preg_match('/arc9/', $url)) {
                    $pageNumber = 'Arc 9: Cover';
                }
                elseif (preg_match('/arc8/', $url)) {
                    $pageNumber = 'Arc 8: Cover';
                }
            }

            if ($pageNumber === null) {
                throw new Exception("failed to find a page number in $url");
            }
        }

        array_push($content, [
            'url' => $url,
            'imageUrl' => Config::baseURL . '/' . $dom->find('div.page img')[0]->src,
            'body' => $dom,
            'pageNumber' => $pageNumber
        ]);

        // extract next page URL from this page's content
        // NOTE(review): array(...) wraps the find() result in a one-element
        // array, so this loop body runs exactly once and $el is whatever
        // find() returned rather than each matched node. Kept as-is; confirm
        // whether iterating the matches directly was intended.
        foreach (array($dom->find(Config::nextLinkSelector)) as $el) {
            if (Config::TEST_stopAfter && count($fetchedUrls) >= Config::TEST_stopAfter) {
                Log::debug("stopping retrieval after " . Config::TEST_stopAfter . " fetches");
                break;
            }

            $nextUrl = Config::baseURL . '/' . $el->href;

            // in_array() rather than array_search(): array_search() returns
            // the matching key, and key 0 is falsy, so a match on the first
            // element used to be treated as "not found".
            if (in_array($nextUrl, $fetchedUrls, true)) {
                Log::debug("$url: already fetched next link $nextUrl");
                continue;
            }

            if (in_array($nextUrl, $urls, true)) {
                Log::debug("$url: duplicate next link $nextUrl");
                continue;
            }

            Log::info("$url: next link $nextUrl");
            array_push($urls, $nextUrl);
        }

        usleep(500000); // wait half a second, to be minimally polite
    }

    // patch up the page number on the title page by adding 1 to the
    // first numeric page number we find, starting from the newest
    // non-root page
    foreach (array_slice($content, 1) as $page) {
        if (preg_match('/^\d+$/', $page['pageNumber'])) {
            $content[0]['pageNumber'] = $page['pageNumber'] + 1;
            Log::info("patched up root page number to " . $content[0]['pageNumber']);
            break;
        }
    }

    return $content;
}
|
|
@ -0,0 +1,82 @@
|
|||
<?php
|
||||
declare(strict_types = 1);
|
||||
|
||||
include_once('config/default.php');
|
||||
|
||||
include_once('lib/log.php');
|
||||
include_once('lib/fetch-url.php');
|
||||
|
||||
/**
 * Plain record describing a single comic page: where it lives, how it is
 * labelled, and which image it shows. All fields start out as empty strings
 * and are filled in by the code that builds the page list.
 */
class ComicPage {
    /** URL of the comic page itself. */
    public string $href = '';

    /** Page label (for example a page number). */
    public string $pageNo = '';

    /** Title of the page. */
    public string $title = '';

    /** URL of the comic image shown on the page. */
    public string $imageUrl = '';

    function __construct() {}

    /**
     * One-line summary used when the object is logged or cast to string:
     * "[href] [imageUrl] pageNo // title".
     */
    function __toString() {
        return sprintf(
            '[%s] [%s] %s // %s',
            $this->href,
            $this->imageUrl,
            $this->pageNo,
            $this->title
        );
    }
}
|
||||
|
||||
/**
 * Build the list of comic pages from the archive page.
 *
 * Fetches Config::archiveUrl, walks every link inside each
 * "div.textcontent ul" list, and produces one ComicPage per link whose target
 * can be fetched. The page label comes from the link text, the title from the
 * text that follows the link in the list's HTML, and the image URL from the
 * first "div.page img" on the linked page. Links whose page answers HTTP 404
 * are logged and left out.
 *
 * @return array list of ComicPage objects in archive order
 * @throws Exception when a link's title cannot be located in the archive
 *                   HTML, or when fetching fails for any reason other than
 *                   a 404 response
 */
function fetchSiteContent() {
    $comicPages = [];

    $archivePage = fetchUrl(Config::archiveUrl);

    foreach ($archivePage->find('div.textcontent ul') as $arc) {
        $arcHtml = $arc->innerHTML;

        foreach ($arc->find('a') as $el) {
            $link = new ComicPage;

            $link->href = Config::baseUrl . "/" . $el->href;
            $link->pageNo = $el->innerText;

            // we'll get the image URL from this later, but fetch it early
            // in order to check that the page exists; if not, we'll
            // exclude it from the feed
            try {
                $comicPage = fetchUrl($link->href);
            } catch (Exception $exc) {
                // Not every exception carries a response. Reading it
                // unconditionally would fail with a secondary error and
                // hide the exception we actually want to rethrow.
                $response = $exc->response ?? null;

                if ($response !== null && $response->getStatusCode() === 404) {
                    Log::warn("Got HTTP 404 from {$link->href}; skipping");
                    continue;
                }

                // rethrow unrecognized error
                throw $exc;
            }

            // "why not a regex?" well buddy i don't know if you've ever
            // seen regexes but string walking is kinder to the brainmeats
            $hrefPos = strpos($arcHtml, $el->href);
            if ($hrefPos === false) {
                // Previously a miss here silently continued from offset 1
                // and picked up the title of some other link.
                throw new Exception("Failed to find link {$el->href} in archive HTML");
            }

            // Find the '</a>' that closes this link. strpos() with an offset
            // replaces a character-by-character scan that never terminated
            // when no closing tag followed.
            $closePos = strpos($arcHtml, '</a>', $hrefPos + 1);
            if ($closePos === false) {
                throw new Exception("Failed to find closing </a> for {$el->href}");
            }

            $start = $closePos;
            $start += 4; // '</a>'
            $start += 4; // ' // '

            // The title runs up to the next line break, or failing that up
            // to the next closing tag.
            $end = strpos($arcHtml, '<br', $start);
            if ($end === false) {
                $end = strpos($arcHtml, '</', $start);
            }

            if ($end === false) {
                throw new Exception("Failed to find a title endpoint at {$el} ({$el->href})");
            }

            $link->title = trim(substr($arcHtml, $start, $end - $start));

            $link->imageUrl = Config::baseUrl . '/' . $comicPage->find('div.page img')[0]->src;

            array_push($comicPages, $link);
            Log::debug($link);

            usleep(250000); // don't be too rude
        }
    }

    Log::info("Finished fetching " . count($comicPages) . " pages");
    return $comicPages;
}
|
|
@ -9,26 +9,34 @@ function generateFeed($content) {
|
|||
'autoescape' => false
|
||||
]);
|
||||
|
||||
$now = date('c');
|
||||
|
||||
$ts = time();
|
||||
|
||||
$items = [];
|
||||
foreach ($content as $result) {
|
||||
|
||||
foreach ($content as $page) {
|
||||
$ts -= 1; // artificially enforce ordering on undated historical items
|
||||
$now = date('c', $ts);
|
||||
|
||||
$content = $twig->render('item-content.html', [
|
||||
'url' => $page->href,
|
||||
'imageUrl' => $page->imageUrl
|
||||
]);
|
||||
|
||||
$items[] = $twig->render('item.xml', [
|
||||
'title' => $result['pageNumber'] . " | " . Config::feed['title'],
|
||||
'url' => $result['url'],
|
||||
'imageUrl' => $result['imageUrl'],
|
||||
'date' => $now
|
||||
'title' => htmlspecialchars($page->pageNo . " // " . $page->title),
|
||||
'date' => $now,
|
||||
'content' => htmlspecialchars($content)
|
||||
]);
|
||||
};
|
||||
|
||||
$feed = $twig->render('feed.xml', [
|
||||
'title' => Config::feed['title'],
|
||||
'description' => Config::feed['description'],
|
||||
'baseUrl' => Config::baseURL,
|
||||
'baseUrl' => Config::baseUrl,
|
||||
'feedUrl' => Config::feed['feedUrl'],
|
||||
'date' => $now,
|
||||
'items' => implode("\n", $items)
|
||||
]);
|
||||
|
||||
return $feed;
|
||||
};
|
||||
};
|
||||
|
|
|
@ -1,2 +0,0 @@
|
|||
<?php
|
||||
declare(strict_types = 1);
|
|
@ -10,85 +10,10 @@ require_once('vendor/autoload.php');
|
|||
include_once('config/default.php');
|
||||
|
||||
include_once('lib/log.php');
|
||||
// include_once('lib/fetch-pages.php');
|
||||
// include_once('lib/generate-feed.php');
|
||||
include_once('lib/fetch-site-content.php');
|
||||
|
||||
// $content = fetchPages();
|
||||
// $feedXml = generateFeed($content);
|
||||
|
||||
// Log::info("writing " . strlen($feedXml) . " bytes to " . Config::feedPath);
|
||||
// file_put_contents(Config::feedPath, $feedXml, LOCK_EX);
|
||||
|
||||
|
||||
include_once('lib/fetch-url.php');
|
||||
|
||||
/**
 * Plain record describing a single comic page: where it lives, how it is
 * labelled, and which image it shows. All fields start out as empty strings
 * and are filled in by the code that builds the page list.
 */
class ComicPage {
    /** URL of the comic page itself. */
    public string $href = '';

    /** Page label (for example a page number). */
    public string $pageNo = '';

    /** Title of the page. */
    public string $title = '';

    /** URL of the comic image shown on the page. */
    public string $imageUrl = '';

    function __construct() {}

    /**
     * One-line summary used when the object is logged or cast to string:
     * "[href] [imageUrl] pageNo // title".
     */
    function __toString() {
        return sprintf(
            '[%s] [%s] %s // %s',
            $this->href,
            $this->imageUrl,
            $this->pageNo,
            $this->title
        );
    }
}
|
||||
|
||||
// Build the list of comic pages from the archive page: one ComicPage per
// link inside each "div.textcontent ul" list. The page label is the link
// text, the title is the text following the link in the list's HTML, and
// the image URL comes from the first "div.page img" on the linked page.
// Links whose page answers HTTP 404 are logged and skipped.
$comicPages = [];

$archivePage = fetchUrl(Config::archiveUrl);

foreach ($archivePage->find('div.textcontent ul') as $arc) {
    $arcHtml = $arc->innerHTML;

    foreach ($arc->find('a') as $el) {
        $link = new ComicPage;

        $link->href = Config::baseUrl . "/" . $el->href;
        $link->pageNo = $el->innerText;

        // we'll get the image URL from this later, but fetch it early
        // in order to check that the page exists; if not, we'll
        // exclude it from the feed
        try {
            $comicPage = fetchUrl($link->href);
        } catch (Exception $exc) {
            // Not every exception carries a response. Reading it
            // unconditionally would fail with a secondary error and
            // hide the exception we actually want to rethrow.
            $response = $exc->response ?? null;

            if ($response !== null && $response->getStatusCode() === 404) {
                Log::warn("Got HTTP 404 from {$link->href}; skipping");
                continue;
            }

            // rethrow unrecognized error
            throw $exc;
        }

        // "why not a regex?" well buddy i don't know if you've ever
        // seen regexes but string walking is kinder to the brainmeats
        $hrefPos = strpos($arcHtml, $el->href);
        if ($hrefPos === false) {
            // Previously a miss here silently continued from offset 1
            // and picked up the title of some other link.
            throw new Exception("Failed to find link {$el->href} in archive HTML");
        }

        // Find the '</a>' that closes this link. strpos() with an offset
        // replaces a character-by-character scan that never terminated
        // when no closing tag followed.
        $closePos = strpos($arcHtml, '</a>', $hrefPos + 1);
        if ($closePos === false) {
            throw new Exception("Failed to find closing </a> for {$el->href}");
        }

        $start = $closePos;
        $start += 4; // '</a>'
        $start += 4; // ' // '

        // The title runs up to the next line break, or failing that up
        // to the next closing tag.
        $end = strpos($arcHtml, '<br', $start);
        if ($end === false) {
            $end = strpos($arcHtml, '</', $start);
        }

        if ($end === false) {
            throw new Exception("Failed to find a title endpoint at {$el} ({$el->href})");
        }

        $link->title = trim(substr($arcHtml, $start, $end - $start));

        $link->imageUrl = Config::baseUrl . '/' . $comicPage->find('div.page img')[0]->src;

        array_push($comicPages, $link);
        Log::debug($link);

        usleep(250000); // don't be too rude
    }
}
|
||||
// Fetch the page list, render it as feed XML, and write the result to
// Config::feedPath.
// NOTE(review): the list is reversed before rendering, presumably so the
// most recent page comes first in the feed; confirm against the order of
// the archive page.
$content = fetchSiteContent();
$feedXml = generateFeed(array_reverse($content));

Log::info("writing " . strlen($feedXml) . " bytes to " . Config::feedPath);

// file_put_contents() returns false on failure; without this check a failed
// write would leave a stale or missing feed with no error reported.
if (file_put_contents(Config::feedPath, $feedXml, LOCK_EX) === false) {
    throw new Exception("failed to write feed to " . Config::feedPath);
}
|
||||
|
|
|
@ -1,24 +1,17 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
|
||||
{% autoescape %}
|
||||
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
|
||||
<channel>
|
||||
<title>
|
||||
{{ title }}
|
||||
</title>
|
||||
<description>
|
||||
{{ description }}
|
||||
</description>
|
||||
<title>{{ title }}</title>
|
||||
<description>{{ description }}</description>
|
||||
<link>{{ baseUrl }}</link>
|
||||
<atom:link href="{{ feedUrl }}" rel="self" type="application/rss+xml" />
|
||||
<pubDate>{{ date }}</pubDate>
|
||||
<lastBuildDate>{{ date }}</lastBuildDate>
|
||||
<generator>{{ date }}</generator>
|
||||
<items>
|
||||
|
||||
{% endautoescape %}
|
||||
{{ items }}
|
||||
|
||||
</items>
|
||||
{{ items }}
|
||||
{% autoescape %}
|
||||
</channel>
|
||||
|
||||
</rss>
|
||||
{% endautoescape %}
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
<a style="border: 0; text-decoration: none;" href="{{ url }}">
|
||||
<img src="{{ imageUrl }}" />
|
||||
</a>
|
|
@ -1,10 +1,6 @@
|
|||
<item>
|
||||
<title>{{ title }}</title>
|
||||
<description>
|
||||
<a style="border: 0; text-decoration: none;" href="{{ url }}">
|
||||
<img src="{{ imageUrl }}" />
|
||||
</a>
|
||||
</description>
|
||||
<description>{{ content }}</description>
|
||||
<pubDate>{{ date }}</pubDate>
|
||||
<link>{{ url }}</link>
|
||||
<guid isPermaLink="true">{{ url }}</guid>
|
||||
|
|
Loading…
Reference in New Issue