Read + parse archive page

This commit is contained in:
Alexis Marie Wright 2022-03-13 21:41:45 -04:00
parent d7fd21f338
commit 8d31c7ea29
3 changed files with 90 additions and 10 deletions

@@ -10,13 +10,14 @@ class Config {
 	// where to write the feed file
 	const feedPath = '/tmp/feed.xml';
-	// where to start spidering
-	const baseURL = "http://www.inhuman-comic.com";
+	// base URL of the site
+	const baseUrl = "http://www.inhuman-comic.com";
+	// archive page URL
+	const archiveUrl = 'http://www.inhuman-comic.com/archives.php';
 	// how the script identifies itself to the server while spidering
 	// (this will also be used as the "Generator" value in the feed XML)
 	const userAgent = 'Lexie\'s RSS Monster (for Cial) (lexie@alexis-marie-wright.me)';
 	// a CSS selector identifying the "next page" link to follow while spidering
 	const nextLinkSelector = 'div.body a.prev';
 	// which log levels to emit (true) or suppress (false)
 	// nothing below 'info' should be important in normal operation
 	const logLevels = [

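These are plain class constants on Config (defined in config/default.php), so the other scripts read them statically; note the baseURL -> baseUrl rename, which means any existing references to the old name need updating. A minimal sketch of the access pattern, with the variable names here purely illustrative:

<?php
include_once('config/default.php');

// Illustrative only: read the spider's endpoints and User-Agent from Config.
$archive = Config::archiveUrl;  // 'http://www.inhuman-comic.com/archives.php'
$agent   = Config::userAgent;   // sent with every request while spidering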
lib/fetch-url.php Normal file
@@ -0,0 +1,29 @@
+<?php
+declare(strict_types = 1);
+
+include_once('config/default.php');
+
+use GuzzleHttp\Client as HTTPClient;
+use PHPHtmlParser\Dom as DOM;
+
+function fetchUrl(string $url) {
+    $client = new HTTPClient();
+    $res = $client->request(
+        'GET', $url, [
+            'http_errors' => false,
+            'headers' => [
+                'User-Agent' => Config::userAgent
+            ]
+        ]
+    );
+
+    if ($res->getStatusCode() !== 200) {
+        Log::debug($res->getBody());
+        // PHP can only throw Throwables, not bare strings
+        throw new Exception("Request for $url returned {$res->getStatusCode()}");
+    };
+
+    $dom = new DOM;
+    $dom->loadStr(strval($res->getBody()));
+
+    return $dom;
+};
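For reference, fetchUrl hands back a PHPHtmlParser\Dom instance, so callers can run CSS-selector queries on the result directly. A minimal usage sketch, assuming the Composer autoloader and lib/log.php are loaded first (as the main script does):

<?php
require_once('vendor/autoload.php');
include_once('lib/log.php');
include_once('lib/fetch-url.php');

// Fetch the archive page and print the target of every link on it.
$dom = fetchUrl(Config::archiveUrl);
foreach ($dom->find('a') as $a) {
    print $a->href . "\n";
}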

@@ -10,11 +10,61 @@ require_once('vendor/autoload.php');
 include_once('config/default.php');
 include_once('lib/log.php');
-include_once('lib/fetch-pages.php');
-include_once('lib/generate-feed.php');
+// include_once('lib/fetch-pages.php');
+// include_once('lib/generate-feed.php');
 
-$content = fetchPages();
-$feedXml = generateFeed($content);
+// $content = fetchPages();
+// $feedXml = generateFeed($content);
+
+// Log::info("writing " . strlen($feedXml) . " bytes to " . Config::feedPath);
+// file_put_contents(Config::feedPath, $feedXml, LOCK_EX);
+
+include_once('lib/fetch-url.php');
+
+$archivePage = fetchUrl(Config::archiveUrl);
+
+class ComicPageLink {
+    public string $href = '';
+    public string $pageNo = '';
+    public string $title = '';
+
+    function __construct() {}
+
+    function __toString() {
+        return "[{$this->href}] {$this->pageNo} // {$this->title}\n";
+    }
+};
+
+$comicPageLinks = array();
+
+foreach ($archivePage->find('div.textcontent ul') as $arc) {
+    $arcHtml = $arc->innerHTML;
+    $comicLinks = $arc->find('a');
+
+    foreach ($comicLinks as $el) {
+        $link = new ComicPageLink;
+
+        $start = strpos($arcHtml, $el->href);
+        do {
+            $start += 1;
+        } while (substr($arcHtml, $start, 4) != '</a>');
+        $start += 4; // skip past '</a>'
+        $start += 4; // skip past ' // '
+
+        $end = strpos($arcHtml, '<br', $start);
+        if (!$end) {
+            $end = strpos($arcHtml, '</', $start);
+        };
+        if (!$end) {
+            throw new Exception("Failed to find a title endpoint at {$el} ({$el->href})");
+        };
+
+        $link->title = trim(substr($arcHtml, $start, $end - $start));
+        $link->href = Config::baseUrl . "/" . $el->href;
+        $link->pageNo = $el->innerText;
+
+        print $link;
+    };
+};
-Log::info("writing " . strlen($feedXml) . " bytes to " . Config::feedPath);
-file_put_contents(Config::feedPath, $feedXml, LOCK_EX);
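The scanning loop above leans on the archive page's markup: each list entry is expected to look roughly like <a href="...">pageNo</a> // title<br>, with the title sitting as bare text between the closing </a> (plus the ' // ' separator) and the next <br> or closing tag. A self-contained sketch of the same extraction against a hypothetical sample of that markup, using a single regex instead of the manual strpos/substr walk:

<?php
// Hypothetical archive markup in the shape the loop expects.
$arcHtml = '<ul>'
    . '<li><a href="page123.php">123</a> // The One With The Robot<br></li>'
    . '<li><a href="page124.php">124</a> // The One After That<br></li>'
    . '</ul>';

// One pattern captures href, page number, and title in a single pass.
preg_match_all(
    '#<a href="(?<href>[^"]+)">(?<pageNo>[^<]+)</a> // (?<title>[^<]+)#',
    $arcHtml,
    $matches,
    PREG_SET_ORDER
);

foreach ($matches as $m) {
    printf("[%s] %s // %s\n", $m['href'], $m['pageNo'], trim($m['title']));
}

Either approach prints lines like [page123.php] 123 // The One With The Robot. The strpos walk in the committed code tolerates extra attributes inside the <a> tag, at the cost of looping forever if '</a>' never appears after the href.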