diff --git a/lib/fetch-pages.php b/lib/fetch-pages.php
deleted file mode 100644
index 6527c9d..0000000
--- a/lib/fetch-pages.php
+++ /dev/null
@@ -1,139 +0,0 @@
-request(
- 'GET', $url, [
- 'http_errors' => false,
- 'headers' => [
- 'User-Agent' => Config::userAgent
- ]
- ]
- );
-
- if ($res->getStatusCode() !== 200) {
- Log::debug($res->getBody());
- throw "Request for $url returned {$res->getStatusCode()}";
- };
-
- return strval($res->getBody());
-};
-
-function fetchPages() {
- $urls = [Config::baseURL];
- $fetchedUrls = [];
- $content = [];
-
- while (count($urls) > 0) {
- $url = array_shift($urls);
-
- $res = fetchUrl($url);
- Log::info("$url: fetched " . strlen($res) . " bytes");
-
- Log::silly($res);
-
- array_push($fetchedUrls, $url);
-
- $dom = new DOM;
- $dom->loadStr($res);
-
- $pageNumber = null;
-
- // extract page number (if any) from this page's URL
- if ($url === Config::baseURL) {
- Log::debug("$url is the root; will patch up page number later");
- } else {
- $matches = array();
-
- if (preg_match('/comic(\d+)\.php$/', $url, $matches)) {
- $pageNumber = $matches[1];
- } else {
- Log::debug("failed to find a page number in $url, trying special cases");
-
- if (preg_match('/arc(\d+)\-(front|back)/', $url, $matches)) {
- $pageNumber = "Arc " . $matches[1] . ": " . ucfirst($matches[2]) . " Cover";
- }
- elseif (preg_match('/cover13/', $url)) {
- $pageNumber = 'Arc 13: Front Cover';
- }
- elseif (preg_match('/back12/', $url)) {
- $pageNumber = 'Arc 12: Back Cover';
- }
- elseif (preg_match('/arc12splash/', $url)) {
- $pageNumber = 'Arc 12: Front Cover';
- }
- elseif (preg_match('/arc11/', $url)) {
- $pageNumber = 'Arc 11: Front Cover';
- }
- elseif (preg_match('/back10/', $url)) {
- $pageNumber = 'Arc 10: Back Cover';
- }
- elseif (preg_match('/splash10/', $url)) {
- $pageNumber = 'Arc 10: Cover';
- }
- elseif (preg_match('/arc9/', $url)) {
- $pageNumber = 'Arc 9: Cover';
- }
- elseif (preg_match('/arc8/', $url)) {
- $pageNumber = 'Arc 8: Cover';
- };
- };
-
- if ($pageNumber === null) {
- throw new Exception("failed to find a page number in $url");
- };
- };
-
- array_push($content, [
- 'url' => $url,
- 'imageUrl' => Config::baseURL . '/' . $dom->find('div.page img')[0]->src,
- 'body' => $dom,
- 'pageNumber' => $pageNumber
- ]);
-
- // extract next page URL from this page's content
- foreach (array($dom->find(Config::nextLinkSelector)) as $el) {
- if (Config::TEST_stopAfter && count($fetchedUrls) >= Config::TEST_stopAfter) {
- Log::debug("stopping retrieval after " . Config::TEST_stopAfter . " fetches");
- break;
- };
-
- $nextUrl = Config::baseURL . '/' . $el->href;
-
- if (array_search($nextUrl, $fetchedUrls)) {
- Log::debug("$url: already fetched next link $nextUrl");
- continue;
- };
-
- if (array_search($nextUrl, $urls)) {
- Log::debug("$url: duplicate next link $nextUrl");
- continue;
- };
-
- Log::info("$url: next link $nextUrl");
- array_push($urls, $nextUrl);
- };
-
- usleep(500000); // wait half a second, to be minimally polite
- };
-
- // patch up the page number on the title page by adding 1 to the
- // first numeric page number we find, starting from the newest
- // non-root page
- foreach (array_slice($content, 1) as $page) {
- if (preg_match('/^\d+$/', $page['pageNumber'])) {
- $content[0]['pageNumber'] = $page['pageNumber'] + 1;
- Log::info("patched up root page number to " . $content[0]['pageNumber']);
- break;
- };
- };
-
- return $content;
-}
\ No newline at end of file
diff --git a/lib/fetch-site-content.php b/lib/fetch-site-content.php
new file mode 100644
index 0000000..42434cb
--- /dev/null
+++ b/lib/fetch-site-content.php
@@ -0,0 +1,82 @@
+href}] [{$this->imageUrl}] {$this->pageNo} // {$this->title}";
+ }
+};
+
+function fetchSiteContent() {
+ $comicPages = array();
+
+ $archivePage = fetchUrl(Config::archiveUrl);
+
+ foreach ($archivePage->find('div.textcontent ul') as $arc) {
+ $arcHtml = $arc->innerHTML;
+
+ $comicLinks = $arc->find('a');
+ foreach ($comicLinks as $el) {
+ $link = new ComicPage;
+
+ $link->href = Config::baseUrl . "/" . $el->href;
+ $link->pageNo = $el->innerText;
+
+ // we'll get the image URL from this later, but fetch it early
+ // in order to check that the page exists; if not, we'll
+ // exclude it from the feed
+ try {
+ $comicPage = fetchUrl($link->href);
+ } catch (Exception $exc) {
+ if ($exc->response->getStatusCode() === 404) {
+ Log::warn("Got HTTP 404 from {$link->href}; skipping");
+ continue;
+ };
+ // rethrow unrecognized error
+ throw $exc;
+ };
+
+ // "why not a regex?" well buddy i don't know if you've ever
+ // seen regexes but string walking is kinder to the brainmeats
+ $start = strpos($arcHtml, $el->href);
+ do {
+ $start += 1;
+ } while (substr($arcHtml, $start, 4) != '');
+
+ $start += 4; // ''
+ $start += 4; // ' // '
+
+ $end = strpos($arcHtml, '
href})");
+ };
+
+ $link->title = trim(substr($arcHtml, $start, $end - $start));
+
+ $link->imageUrl = Config::baseUrl . '/' . $comicPage->find('div.page img')[0]->src;
+
+ array_push($comicPages, $link);
+ Log::debug($link);
+
+ usleep(250000); // don't be too rude
+ };
+ };
+
+ Log::info("Finished fetching " . count($comicPages) . " pages");
+ return $comicPages;
+};
\ No newline at end of file
diff --git a/lib/generate-feed.php b/lib/generate-feed.php
index 763b061..04b49d1 100644
--- a/lib/generate-feed.php
+++ b/lib/generate-feed.php
@@ -9,26 +9,40 @@ function generateFeed($content) {
'autoescape' => false
]);
- $now = date('c');
-
+ // Channel-level timestamp: the moment the feed is generated. Computed
+ // before the loop so it is defined even when there are no pages.
+ $ts = time();
+ $now = date('c', $ts);
+
$items = [];
- foreach ($content as $result) {
+
+ foreach ($content as $page) {
+ $ts -= 1; // artificially enforce ordering on undated historical items
+ $itemDate = date('c', $ts);
+
+ // Kept in its own variable so the $content parameter is not overwritten.
+ $itemHtml = $twig->render('item-content.html', [
+ 'url' => $page->href,
+ 'imageUrl' => $page->imageUrl
+ ]);
+
$items[] = $twig->render('item.xml', [
- 'title' => $result['pageNumber'] . " | " . Config::feed['title'],
- 'url' => $result['url'],
- 'imageUrl' => $result['imageUrl'],
- 'date' => $now
+ 'title' => htmlspecialchars($page->pageNo . " // " . $page->title),
+ // item.xml still renders {{ url }}, so it has to be supplied here
+ 'url' => $page->href,
+ 'date' => $itemDate,
+ 'content' => htmlspecialchars($itemHtml)
]);
};
$feed = $twig->render('feed.xml', [
'title' => Config::feed['title'],
'description' => Config::feed['description'],
- 'baseUrl' => Config::baseURL,
+ 'baseUrl' => Config::baseUrl,
'feedUrl' => Config::feed['feedUrl'],
'date' => $now,
'items' => implode("\n", $items)
]);
return $feed;
-};
\ No newline at end of file
+};
diff --git a/lib/parse-page.php b/lib/parse-page.php
deleted file mode 100644
index 1ce6fae..0000000
--- a/lib/parse-page.php
+++ /dev/null
@@ -1,2 +0,0 @@
-href}] [{$this->imageUrl}] {$this->pageNo} // {$this->title}";
- }
-};
-
-$comicPages = array();
-
-$archivePage = fetchUrl(Config::archiveUrl);
-
-foreach ($archivePage->find('div.textcontent ul') as $arc) {
- $arcHtml = $arc->innerHTML;
-
- $comicLinks = $arc->find('a');
- foreach ($comicLinks as $el) {
- $link = new ComicPage;
-
- $link->href = Config::baseUrl . "/" . $el->href;
- $link->pageNo = $el->innerText;
-
- // we'll get the image URL from this later, but fetch it early
- // in order to check that the page exists; if not, we'll
- // exclude it from the feed
- try {
- $comicPage = fetchUrl($link->href);
- } catch (Exception $exc) {
- if ($exc->response->getStatusCode() === 404) {
- Log::warn("Got HTTP 404 from {$link->href}; skipping");
- continue;
- };
- // rethrow unrecognized error
- throw $exc;
- };
-
- // "why not a regex?" well buddy i don't know if you've ever
- // seen regexes but string walking is kinder to the brainmeats
- $start = strpos($arcHtml, $el->href);
- do {
- $start += 1;
- } while (substr($arcHtml, $start, 4) != '');
-
- $start += 4; // ''
- $start += 4; // ' // '
-
- $end = strpos($arcHtml, '
href})");
- };
-
- $link->title = trim(substr($arcHtml, $start, $end - $start));
-
- $link->imageUrl = Config::baseUrl . '/' . $comicPage->find('div.page img')[0]->src;
-
- array_push($comicPages, $link);
- Log::debug($link);
-
- usleep(250000); // don't be too rude
- };
-};
+$content = fetchSiteContent();
+// generateFeed() stamps items with descending timestamps, so reversing here
+// makes the last page fetched the newest entry in the feed.
+$feedXml = generateFeed(array_reverse($content));
+Log::info("writing " . strlen($feedXml) . " bytes to " . Config::feedPath);
+// file_put_contents() returns false on failure; surface that instead of
+// leaving a stale or missing feed behind unnoticed.
+if (file_put_contents(Config::feedPath, $feedXml, LOCK_EX) === false) {
+ throw new Exception("failed to write feed to " . Config::feedPath);
+};
diff --git a/templates/feed.xml b/templates/feed.xml
index a46ce95..8ab585d 100644
--- a/templates/feed.xml
+++ b/templates/feed.xml
@@ -1,24 +1,17 @@
-
{% autoescape %}
+
-
- {{ title }}
-
-
- {{ description }}
-
+ {{ title }}
+ {{ description }}
{{ baseUrl }}
{{ date }}
{{ date }}
{{ date }}
-
-
{% endautoescape %}
- {{ items }}
-
-
+ {{ items }}
+{% autoescape %}
-
+{% endautoescape %}
diff --git a/templates/item-content.html b/templates/item-content.html
new file mode 100644
index 0000000..75c0e6e
--- /dev/null
+++ b/templates/item-content.html
@@ -0,0 +1,3 @@
+
+
+
diff --git a/templates/item.xml b/templates/item.xml
index ba6ba79..b3dc586 100644
--- a/templates/item.xml
+++ b/templates/item.xml
@@ -1,10 +1,6 @@
-
{{ title }}
-
-
-
-
-
+ {{ content }}
{{ date }}
{{ url }}
{{ url }}