Read + parse archive page
This commit is contained in:
parent
d7fd21f338
commit
8d31c7ea29
|
@ -10,13 +10,14 @@ class Config {
|
|||
// where to write the feed file
|
||||
const feedPath = '/tmp/feed.xml';
|
||||
|
||||
// where to start spidering
|
||||
const baseURL = "http://www.inhuman-comic.com";
|
||||
// base URL of the site
|
||||
const baseUrl = "http://www.inhuman-comic.com";
|
||||
// archive page URL
|
||||
const archiveUrl = 'http://www.inhuman-comic.com/archives.php';
|
||||
|
||||
// how the script identifies itself to the server while spidering
|
||||
// (this will also be used as the "Generator" value in the feed XML)
|
||||
const userAgent = 'Lexie\'s RSS Monster (for Cial) (lexie@alexis-marie-wright.me)';
|
||||
// a CSS selector identifying the "next page" link to follow while spidering
|
||||
const nextLinkSelector = 'div.body a.prev';
|
||||
// which log levels to emit (true) or suppress (false)
|
||||
// nothing below 'info' should be important in normal operation
|
||||
const logLevels = [
|
||||
|
|
|
@ -0,0 +1,29 @@
|
|||
<?php
|
||||
declare(strict_types = 1);
|
||||
|
||||
include_once('config/default.php');
|
||||
|
||||
use GuzzleHttp\Client as HTTPClient;
|
||||
use PHPHtmlParser\Dom as DOM;
|
||||
|
||||
// Fetch $url and return it parsed as a DOM.
//
// Identifies itself to the server with Config::userAgent (which the
// config notes is also used as the feed's "Generator" value).
//
// Throws Exception on any non-200 response; the response body is sent
// to the debug log first to aid diagnosis.
function fetchUrl(string $url): DOM {
    $client = new HTTPClient();

    $res = $client->request(
        'GET', $url, [
            // Guzzle would throw on 4xx/5xx by default; we want to
            // inspect the status ourselves below.
            'http_errors' => false,
            'headers' => [
                'User-Agent' => Config::userAgent
            ]
        ]
    );

    if ($res->getStatusCode() !== 200) {
        Log::debug($res->getBody());
        // BUG FIX: PHP can only throw Throwable instances, never a bare
        // string — `throw "..."` is a fatal error at runtime.
        throw new Exception("Request for $url returned {$res->getStatusCode()}");
    }

    $dom = new DOM;
    $dom->loadStr(strval($res->getBody()));
    return $dom;
}
|
|
@ -10,11 +10,61 @@ require_once('vendor/autoload.php');
|
|||
include_once('config/default.php');
|
||||
|
||||
include_once('lib/log.php');
|
||||
include_once('lib/fetch-pages.php');
|
||||
include_once('lib/generate-feed.php');
|
||||
// include_once('lib/fetch-pages.php');
|
||||
// include_once('lib/generate-feed.php');
|
||||
|
||||
$content = fetchPages();
|
||||
$feedXml = generateFeed($content);
|
||||
// $content = fetchPages();
|
||||
// $feedXml = generateFeed($content);
|
||||
|
||||
// Log::info("writing " . strlen($feedXml) . " bytes to " . Config::feedPath);
|
||||
// file_put_contents(Config::feedPath, $feedXml, LOCK_EX);
|
||||
|
||||
|
||||
include_once('lib/fetch-url.php');
|
||||
$archivePage = fetchUrl(Config::archiveUrl);
|
||||
|
||||
// One entry scraped from the comic's archive listing: the page's link
// target, its page number (the link text), and its title (the plain
// text that follows the link in the archive HTML).
class ComicPageLink {
    public string $href = '';
    public string $pageNo = '';
    public string $title = '';

    function __construct() {}

    // Debug-friendly rendering: "[href] pageNo // title" plus a
    // trailing newline, so instances can be print-ed directly.
    function __toString() {
        return sprintf("[%s] %s // %s\n", $this->href, $this->pageNo, $this->title);
    }
};
|
||||
|
||||
// Walk each archive list (<ul>) on the archive page and extract every
// comic link's URL, page number, and title.
$comicPageLinks = [];

foreach ($archivePage->find('div.textcontent ul') as $arc) {
    $arcHtml = $arc->innerHTML;

    $comicLinks = $arc->find('a');
    foreach ($comicLinks as $el) {
        $link = new ComicPageLink;

        // The title is not inside the <a>; it follows the closing tag
        // as plain text ("...</a> // Title<br>"), so scan the raw HTML.
        // BUG FIX: strpos() returns false on failure; the original left
        // this unchecked, and a missing href would send the do/while
        // below scanning past the end of the string forever (substr()
        // past EOF returns '', which never equals '</a>').
        $start = strpos($arcHtml, $el->href);
        if ($start === false) {
            throw new Exception("Failed to locate {$el->href} in archive HTML");
        }

        // Advance to the closing </a> tag, bounded so malformed markup
        // cannot loop indefinitely.
        $limit = strlen($arcHtml) - 4;
        do {
            $start += 1;
        } while ($start < $limit && substr($arcHtml, $start, 4) != '</a>');

        $start += 4; // skip '</a>'
        $start += 4; // skip ' // '

        // The title runs until the next <br>, or failing that the next
        // closing tag. BUG FIX: use === false — `!$end` conflates a
        // failed search with a match at position 0.
        $end = strpos($arcHtml, '<br', $start);
        if ($end === false) {
            $end = strpos($arcHtml, '</', $start);
        }

        if ($end === false) {
            throw new Exception("Failed to find a title endpoint at {$el} ({$el->href})");
        }

        $link->title = trim(substr($arcHtml, $start, $end - $start));

        $link->href = Config::baseUrl . "/" . $el->href;
        $link->pageNo = $el->innerText;

        // FIX: the original declared $comicPageLinks but never appended
        // to it; collect each link as well as printing it.
        $comicPageLinks[] = $link;
        print $link;
    }
}

// NOTE(review): $feedXml does not appear to be assigned anywhere in
// this version of the script (the generateFeed() call earlier in the
// file looks commented out), so this write would fail at runtime —
// confirm whether feed generation should be re-enabled here.
Log::info("writing " . strlen($feedXml) . " bytes to " . Config::feedPath);
file_put_contents(Config::feedPath, $feedXml, LOCK_EX);
|
||||
|
|
Loading…
Reference in New Issue