diff --git a/lib/fetch-pages.php b/lib/fetch-pages.php deleted file mode 100644 index 6527c9d..0000000 --- a/lib/fetch-pages.php +++ /dev/null @@ -1,139 +0,0 @@ -request( - 'GET', $url, [ - 'http_errors' => false, - 'headers' => [ - 'User-Agent' => Config::userAgent - ] - ] - ); - - if ($res->getStatusCode() !== 200) { - Log::debug($res->getBody()); - throw "Request for $url returned {$res->getStatusCode()}"; - }; - - return strval($res->getBody()); -}; - -function fetchPages() { - $urls = [Config::baseURL]; - $fetchedUrls = []; - $content = []; - - while (count($urls) > 0) { - $url = array_shift($urls); - - $res = fetchUrl($url); - Log::info("$url: fetched " . strlen($res) . " bytes"); - - Log::silly($res); - - array_push($fetchedUrls, $url); - - $dom = new DOM; - $dom->loadStr($res); - - $pageNumber = null; - - // extract page number (if any) from this page's URL - if ($url === Config::baseURL) { - Log::debug("$url is the root; will patch up page number later"); - } else { - $matches = array(); - - if (preg_match('/comic(\d+)\.php$/', $url, $matches)) { - $pageNumber = $matches[1]; - } else { - Log::debug("failed to find a page number in $url, trying special cases"); - - if (preg_match('/arc(\d+)\-(front|back)/', $url, $matches)) { - $pageNumber = "Arc " . $matches[1] . ": " . ucfirst($matches[2]) . " Cover"; - } - elseif (preg_match('/cover13/', $url)) { - $pageNumber = 'Arc 13: Front Cover'; - } - elseif (preg_match('/back12/', $url)) { - $pageNumber = 'Arc 12: Back Cover'; - } - elseif (preg_match('/arc12splash/', $url)) { - $pageNumber = 'Arc 12: Front Cover'; - } - elseif (preg_match('/arc11/', $url)) { - $pageNumber = 'Arc 11: Front Cover'; - } - elseif (preg_match('/back10/', $url)) { - $pageNumber = 'Arc 10: Back Cover'; - } - elseif (preg_match('/splash10/', $url)) { - $pageNumber = 'Arc 10: Cover'; - } - elseif (preg_match('/arc9/', $url)) { - $pageNumber = 'Arc 9: Cover'; - } - elseif (preg_match('/arc8/', $url)) { - $pageNumber = 'Arc 8: Cover'; - }; - }; - - if ($pageNumber === null) { - throw new Exception("failed to find a page number in $url"); - }; - }; - - array_push($content, [ - 'url' => $url, - 'imageUrl' => Config::baseURL . '/' . $dom->find('div.page img')[0]->src, - 'body' => $dom, - 'pageNumber' => $pageNumber - ]); - - // extract next page URL from this page's content - foreach (array($dom->find(Config::nextLinkSelector)) as $el) { - if (Config::TEST_stopAfter && count($fetchedUrls) >= Config::TEST_stopAfter) { - Log::debug("stopping retrieval after " . Config::TEST_stopAfter . " fetches"); - break; - }; - - $nextUrl = Config::baseURL . '/' . $el->href; - - if (array_search($nextUrl, $fetchedUrls)) { - Log::debug("$url: already fetched next link $nextUrl"); - continue; - }; - - if (array_search($nextUrl, $urls)) { - Log::debug("$url: duplicate next link $nextUrl"); - continue; - }; - - Log::info("$url: next link $nextUrl"); - array_push($urls, $nextUrl); - }; - - usleep(500000); // wait half a second, to be minimally polite - }; - - // patch up the page number on the title page by adding 1 to the - // first numeric page number we find, starting from the newest - // non-root page - foreach (array_slice($content, 1) as $page) { - if (preg_match('/^\d+$/', $page['pageNumber'])) { - $content[0]['pageNumber'] = $page['pageNumber'] + 1; - Log::info("patched up root page number to " . $content[0]['pageNumber']); - break; - }; - }; - - return $content; -} \ No newline at end of file diff --git a/lib/fetch-site-content.php b/lib/fetch-site-content.php new file mode 100644 index 0000000..42434cb --- /dev/null +++ b/lib/fetch-site-content.php @@ -0,0 +1,82 @@ +href}] [{$this->imageUrl}] {$this->pageNo} // {$this->title}"; + } +}; + +function fetchSiteContent() { + $comicPages = array(); + + $archivePage = fetchUrl(Config::archiveUrl); + + foreach ($archivePage->find('div.textcontent ul') as $arc) { + $arcHtml = $arc->innerHTML; + + $comicLinks = $arc->find('a'); + foreach ($comicLinks as $el) { + $link = new ComicPage; + + $link->href = Config::baseUrl . "/" . $el->href; + $link->pageNo = $el->innerText; + + // we'll get the image URL from this later, but fetch it early + // in order to check that the page exists; if not, we'll + // exclude it from the feed + try { + $comicPage = fetchUrl($link->href); + } catch (Exception $exc) { + if ($exc->response->getStatusCode() === 404) { + Log::warn("Got HTTP 404 from {$link->href}; skipping"); + continue; + }; + // rethrow unrecognized error + throw $exc; + }; + + // "why not a regex?" well buddy i don't know if you've ever + // seen regexes but string walking is kinder to the brainmeats + $start = strpos($arcHtml, $el->href); + do { + $start += 1; + } while (substr($arcHtml, $start, 4) != ''); + + $start += 4; // '' + $start += 4; // ' // ' + + $end = strpos($arcHtml, 'href})"); + }; + + $link->title = trim(substr($arcHtml, $start, $end - $start)); + + $link->imageUrl = Config::baseUrl . '/' . $comicPage->find('div.page img')[0]->src; + + array_push($comicPages, $link); + Log::debug($link); + + usleep(250000); // don't be too rude + }; + }; + + Log::info("Finished fetching " . count($comicPages) . " pages"); + return $comicPages; +}; \ No newline at end of file diff --git a/lib/generate-feed.php b/lib/generate-feed.php index 763b061..04b49d1 100644 --- a/lib/generate-feed.php +++ b/lib/generate-feed.php @@ -9,26 +9,34 @@ function generateFeed($content) { 'autoescape' => false ]); - $now = date('c'); - + $ts = time(); + $items = []; - foreach ($content as $result) { + + foreach ($content as $page) { + $ts -= 1; // artificially enforce ordering on undated historical items + $now = date('c', $ts); + + $content = $twig->render('item-content.html', [ + 'url' => $page->href, + 'imageUrl' => $page->imageUrl + ]); + $items[] = $twig->render('item.xml', [ - 'title' => $result['pageNumber'] . " | " . Config::feed['title'], - 'url' => $result['url'], - 'imageUrl' => $result['imageUrl'], - 'date' => $now + 'title' => htmlspecialchars($page->pageNo . " // " . $page->title), + 'date' => $now, + 'content' => htmlspecialchars($content) ]); }; $feed = $twig->render('feed.xml', [ 'title' => Config::feed['title'], 'description' => Config::feed['description'], - 'baseUrl' => Config::baseURL, + 'baseUrl' => Config::baseUrl, 'feedUrl' => Config::feed['feedUrl'], 'date' => $now, 'items' => implode("\n", $items) ]); return $feed; -}; \ No newline at end of file +}; diff --git a/lib/parse-page.php b/lib/parse-page.php deleted file mode 100644 index 1ce6fae..0000000 --- a/lib/parse-page.php +++ /dev/null @@ -1,2 +0,0 @@ -href}] [{$this->imageUrl}] {$this->pageNo} // {$this->title}"; - } -}; - -$comicPages = array(); - -$archivePage = fetchUrl(Config::archiveUrl); - -foreach ($archivePage->find('div.textcontent ul') as $arc) { - $arcHtml = $arc->innerHTML; - - $comicLinks = $arc->find('a'); - foreach ($comicLinks as $el) { - $link = new ComicPage; - - $link->href = Config::baseUrl . "/" . $el->href; - $link->pageNo = $el->innerText; - - // we'll get the image URL from this later, but fetch it early - // in order to check that the page exists; if not, we'll - // exclude it from the feed - try { - $comicPage = fetchUrl($link->href); - } catch (Exception $exc) { - if ($exc->response->getStatusCode() === 404) { - Log::warn("Got HTTP 404 from {$link->href}; skipping"); - continue; - }; - // rethrow unrecognized error - throw $exc; - }; - - // "why not a regex?" well buddy i don't know if you've ever - // seen regexes but string walking is kinder to the brainmeats - $start = strpos($arcHtml, $el->href); - do { - $start += 1; - } while (substr($arcHtml, $start, 4) != ''); - - $start += 4; // '' - $start += 4; // ' // ' - - $end = strpos($arcHtml, 'href})"); - }; - - $link->title = trim(substr($arcHtml, $start, $end - $start)); - - $link->imageUrl = Config::baseUrl . '/' . $comicPage->find('div.page img')[0]->src; - - array_push($comicPages, $link); - Log::debug($link); - - usleep(250000); // don't be too rude - }; -}; +$content = fetchSiteContent(); +$feedXml = generateFeed(array_reverse($content)); +Log::info("writing " . strlen($feedXml) . " bytes to " . Config::feedPath); +file_put_contents(Config::feedPath, $feedXml, LOCK_EX); diff --git a/templates/feed.xml b/templates/feed.xml index a46ce95..8ab585d 100644 --- a/templates/feed.xml +++ b/templates/feed.xml @@ -1,24 +1,17 @@ - {% autoescape %} + - - {{ title }} - - - {{ description }} - + {{ title }} + {{ description }} {{ baseUrl }} {{ date }} {{ date }} {{ date }} - - {% endautoescape %} - {{ items }} - - + {{ items }} +{% autoescape %} - +{% endautoescape %} diff --git a/templates/item-content.html b/templates/item-content.html new file mode 100644 index 0000000..75c0e6e --- /dev/null +++ b/templates/item-content.html @@ -0,0 +1,3 @@ + + + diff --git a/templates/item.xml b/templates/item.xml index ba6ba79..b3dc586 100644 --- a/templates/item.xml +++ b/templates/item.xml @@ -1,10 +1,6 @@ {{ title }} - - - - - + {{ content }} {{ date }} {{ url }} {{ url }}