Pull out config
This commit is contained in:
parent
eb9e9adf96
commit
778f54a283
|
@ -0,0 +1,21 @@
|
|||
<?php
|
||||
|
||||
namespace RSSMonster;
|
||||
|
||||
/**
 * Static settings for the RSS Monster spider, exposed as class constants.
 */
class Config
{
    /** Where to start spidering. */
    const baseURL = "http://www.inhuman-comic.com";

    /** How the script identifies itself to the server while spidering. */
    const userAgent = 'Lexie\'s RSS Monster (for Cial) (lexie@alexis-marie-wright.me)';

    /** A CSS selector identifying the "next page" link to follow while spidering. */
    const nextLinkSelector = 'div.body a.prev';

    /**
     * Which log levels to emit (true) or suppress (false).
     *
     * Nothing below 'info' should be important in normal operation.
     */
    const logLevels = [
        'info' => true,
        'debug' => true,
        'silly' => false,
    ];

    /** If set, fetch only this many pages while spidering (false to fetch all). */
    const TEST_stopAfter = 10;
}
|
|
@ -6,16 +6,14 @@ declare(strict_types = 1);
|
|||
error_reporting(E_ALL & ~E_DEPRECATED);
|
||||
|
||||
require('vendor/autoload.php');
|
||||
include('config/default.php');
|
||||
|
||||
use RSSMonster\Config as Config;
|
||||
use GuzzleHttp\Client as HTTPClient;
|
||||
use PHPHtmlParser\Dom as DOM;
|
||||
|
||||
class Log {
|
||||
private static $enabled = [
|
||||
'silly' => false,
|
||||
'debug' => true,
|
||||
'info' => true
|
||||
];
|
||||
private static $enabled = Config::logLevels;
|
||||
|
||||
private static function emit(string $level, $message) {
|
||||
if (!Log::$enabled[$level]) {
|
||||
|
@ -45,7 +43,7 @@ function fetchUrl(string $url) {
|
|||
'GET', $url, [
|
||||
'http_errors' => false,
|
||||
'headers' => [
|
||||
'User-Agent' => 'Lexie\'s RSS Monster (for Cial) (lexie@alexis-marie-wright.me)'
|
||||
'User-Agent' => Config::userAgent
|
||||
]
|
||||
]
|
||||
);
|
||||
|
@ -58,10 +56,7 @@ function fetchUrl(string $url) {
|
|||
return strval($res->getBody());
|
||||
};
|
||||
|
||||
$baseUrl = 'http://inhuman-comic.com';
|
||||
|
||||
$nextLinkSelector = 'div.body a.prev';
|
||||
$urls = [$baseUrl];
|
||||
$urls = [Config::baseURL];
|
||||
$fetchedUrls = [];
|
||||
|
||||
while (count($urls) > 0) {
|
||||
|
@ -77,8 +72,13 @@ while (count($urls) > 0) {
|
|||
$dom = new DOM;
|
||||
$dom->loadStr($res);
|
||||
|
||||
foreach (array($dom->find($nextLinkSelector)) as $el) {
|
||||
$nextUrl = $baseUrl . '/' . $el->href;
|
||||
foreach (array($dom->find(Config::nextLinkSelector)) as $el) {
|
||||
if (Config::TEST_stopAfter && count($fetchedUrls) >= Config::TEST_stopAfter) {
|
||||
Log::debug("stopping retrieval after " . Config::TEST_stopAfter . " fetches");
|
||||
break;
|
||||
};
|
||||
|
||||
$nextUrl = Config::baseURL . '/' . $el->href;
|
||||
|
||||
if (array_search($nextUrl, $fetchedUrls)) {
|
||||
Log::debug("$url: already fetched next link $nextUrl");
|
||||
|
@ -96,4 +96,4 @@ while (count($urls) > 0) {
|
|||
|
||||
usleep(500000); // wait half a second, to be minimally polite
|
||||
};
|
||||
Log::info("Out of next links!");
|
||||
|
||||
|
|
Loading…
Reference in New Issue