fix WHYY, switch to BeautifulSoup

requests_html fails to parse some valid HTML (e.g. a <p> without a
closing </p>), and BeautifulSoup seems easier to use anyway.  Switching
to it for WHYY now, and maybe the rest down the road.
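As a minimal sketch of the behavior relied on here (the markup is made up;
it uses the same lxml parser the new handler code passes to BeautifulSoup):

from bs4 import BeautifulSoup

# A <p> with no closing </p> is legal HTML; lxml closes it implicitly,
# so both paragraphs come back from find_all().
html = BeautifulSoup("<article><p>first<p>second</article>", features="lxml")
print([p.get_text() for p in html.find_all("p")])  # ['first', 'second']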
epiii2 2022-09-06 20:01:21 +00:00
parent 86d25eeec3
commit 9e4ca52721
2 changed files with 233 additions and 7 deletions

View File

@@ -2,6 +2,8 @@ import re
import sys
from pathlib import Path
from requests_html import HTMLSession
from bs4 import BeautifulSoup
from requests import Session
from requests.exceptions import ReadTimeout, ConnectionError
PREFIX = """Gemini proxies of a few local news sources:"""
@@ -41,7 +43,8 @@ def slug(txt):
    """Replace all non-alphanumeric characters with dashes."""
    return re.sub("[^A-Za-z0-9]", "-", txt)
class Handler:
class HandlerOld:
    """The older Handler implementation, based on requests-html."""
    def __init__(self, path):
        self.session = HTMLSession()
@@ -94,7 +97,7 @@ class Handler:
        return link.replace(self.root_html, self.root_gmni)
class HandlerDefault(Handler):
class HandlerDefault(HandlerOld):
    def parse(self):
        self.output += PREFIX + "\r\n\r\n"
@@ -104,7 +107,7 @@ class HandlerDefault(Handler):
        self.output += SUFFIX + "\r\n"
class HandlerCitizen(Handler):
class HandlerCitizen(HandlerOld):
    def __init__(self, path):
        super().__init__(path)
@@ -257,7 +260,7 @@ class HandlerCitizen(Handler):
# Though I just found: https://billypenn.com/wp-json/ !
# and now also https://thephiladelphiacitizen.org/wp-json !!
class HandlerBillyPenn(Handler):
class HandlerBillyPenn(HandlerOld):
    def __init__(self, path):
        # I don't know why my use of the status code 10 seems to do .../?query
@@ -419,7 +422,7 @@ class HandlerBillyPenn(Handler):
        return output
class HandlerWHYYNews(Handler):
class HandlerWHYYNewsOld(HandlerOld):
    def __init__(self, path):
        super().__init__(path)
@@ -526,6 +529,201 @@ class HandlerWHYYNews(Handler):
        return output


class Handler:
    """Page handler built on requests and BeautifulSoup instead of requests-html."""

    def __init__(self, path):
        self.session = Session()
        self.path_base = re.sub("/.*$", "", path)
        self.path = re.sub("^[^/?]+", "", path)
        self.root_html = ""
        self.root_gmni = ""
        self.status_code = "20"
        self.ctype = "text/gemini"
        self.output = ""

    def get(self):
        try:
            response = self.session.get(self.root_html + self.path, timeout=TIMEOUT)
        except (ReadTimeout, ConnectionError) as error:
            self.output = "Network error accessing:\r\n\r\n"
            self.output += f"=> {self.root_gmni}{self.path}\r\n"
            response = None
        return response

    def parse(self):
        self.output = f"No handler defined for {self.path_base}\r\n"
        self.output += "=> . Go back to the index\r\n"

    def make_proxy_snippet(self):
        links = [
            (self.root_html + self.path, "HTML Original"),
            (self.root_gmni, "Proxy Home Page")]
        output = "\r\n".join([f"=> {lnk[0]} {lnk[1]}" for lnk in links])
        output += "\r\n\r\n"
        return output

    def render(self):
        sys.stdout.buffer.write(f"{self.status_code} {self.ctype}\r\n".encode())
        if self.status_code == "10" or self.ctype.startswith("text/"):
            sys.stdout.buffer.write(self.output.encode())
        else:
            sys.stdout.buffer.write(self.output)

    def href(self, anchor, attrib="href"):
        try:
            link = anchor.attrs.get(attrib, "")
        except AttributeError:
            link = anchor
        if not link or not self.root_html or not self.root_gmni:
            return link
        if link.startswith("/"):
            return self.root_gmni + link
        else:
            return link.replace(self.root_html, self.root_gmni)
class HandlerWHYYNews(Handler):
    def __init__(self, path):
        super().__init__(path)
        self.root_html = "https://whyy.org/"
        self.root_gmni = ROOT + "whyy.org/"

    def parse(self):
        r = self.get()
        if not r:
            return
        if r.headers["Content-Type"] in ["image/png", "image/jpeg"]:
            self.ctype = r.headers["Content-Type"]
            self.output = r.content
            return
        output = ""
        html = BeautifulSoup(r.content, features="lxml")
        output += self.parse_main_title(html)
        output += self.make_proxy_snippet()
        output += self.parse_secondary_title(html)
        actual_articles = html.find_all("article", class_="article")
        if actual_articles:
            output += self.parse_article(html)
        elif html.find_all("article", class_="npr_story_post"):
            output += self.parse_article_npr(html)
        else:
            output += self.parse_article_jumble(html)
        output += self.parse_menu(html)
        self.output = output
    def parse_main_title(self, html):
        output = ""
        for header in html.find_all("header"):
            for logo_div in header.find_all("div", class_="site-logo"):
                for title in logo_div.find_all("a"):
                    title = title.get_text()
                    if title:
                        output = f"# {title}\r\n\r\n"
        return output

    def parse_secondary_title(self, html):
        output = ""
        for main in html.find_all("main"):
            for header in main.find_all("header"):
                for h1 in header.find_all("h1"):
                    h1 = h1.get_text()
                    output = f"## {h1}\r\n\r\n"
        return output

    def parse_article_jumble(self, html):
        output = ""
        for teaser in html.find_all("article", class_="content-mode--teaser"):
            for h2_hack in teaser.find_all("p", class_="h2-hack"):
                anchor = h2_hack.find_all("a")
                if len(anchor) == 1:
                    url = self.href(anchor[0])
                    title = anchor[0].get_text().strip()
                    output += f"=> {url} {title}\r\n"
        return output

    def matching_classes(self, elem, classes):
        """True if any given classes are present for elem, False otherwise."""
        try:
            classes_elem = elem["class"]
        except (TypeError, KeyError):
            return False
        return any(reject in classes_elem for reject in classes)
    def parse_article_meta(self, article):
        # Fall back to empty strings when a metadata element is missing.
        author = date = imgsrc = caption = ""
        header = article.find_all("header")[0]
        for entry_meta in header.find_all("div", class_="entry-meta"):
            for author in entry_meta.find_all("li"):
                author = author.get_text()
            for date in entry_meta.find_all("span", class_="byline-date"):
                date = date.get_text()
        for figure in header.find_all("figure"):
            # They're doing some screwy javascript thing instead of just giving
            # a dang img src
            for img in figure.find_all("img", class_="b-lazy"):
                imgsrc = self.href(img, "data-src")
            for figcaption in figure.find_all("figcaption"):
                caption = figcaption.get_text()
        return (author, date, imgsrc, caption)
    def parse_article(self, html):
        output = ""
        article = html.find_all("article", class_="article")[0]
        author, date, imgsrc, caption = self.parse_article_meta(article)
        article_chunks = []
        rejects = ["block--related-content", "block--doubleclick", "side-ad", "promo-block"]
        for detail_content in article.find_all("div", class_="detail-content"):
            for flex_content in detail_content.find_all("div", class_="flexible-content-container"):
                if self.matching_classes(flex_content, rejects):
                    continue
                for child in flex_content.children:
                    if self.matching_classes(child, rejects):
                        continue
                    try:
                        text = child.get_text().replace("\n", " ")
                    except AttributeError:
                        continue
                    if child.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
                        text = "### " + text
                    article_chunks.append(text)
        output += f"By {author} - {date}\r\n\r\n"
        output += f"=> {imgsrc} Figure: {caption}\r\n\r\n"
        for chunk in article_chunks:
            output += chunk + "\r\n\r\n"
        return output

    def parse_article_npr(self, html):
        output = ""
        article = html.find_all("article", class_="npr_story_post")[0]
        author, date, imgsrc, caption = self.parse_article_meta(article)
        output += f"By {author} - {date}\r\n\r\n"
        output += f"=> {imgsrc} Figure: {caption}\r\n\r\n"
        for content in article.find_all("div", class_="npr-content"):
            output += content.get_text().replace("\n", "\r\n\r\n")
        return output

    def parse_menu(self, html):
        output = ""
        menus = html.find_all("ul", id="menu-header-menu-main")
        if not menus:
            return output
        anchors = menus[0].find_all("a")
        if anchors:
            output = "\r\n## Site Menu\r\n\r\n"
        for anchor in anchors:
            url = self.href(anchor)
            title = anchor.get_text()
            output += f"=> {url} {title}\r\n"
        return output
HANDLERS = {
    "": HandlerDefault,
    "thephiladelphiacitizen.org": HandlerCitizen,

View File

@@ -6,9 +6,32 @@ from pathlib import Path
from unittest.mock import (Mock, create_autospec, DEFAULT)
from requests_html import HTML
class SessionMock(pgnp.handlers.Session):
    """Session with a fake get() method.

    Meant to mock requests.Session.
    Subclass this and set PATH to a local HTML file path.
    """

    PATH = Path("/")

    def get(self, url, **kwargs):
        """Fake a GET request from local disk, with mock response object."""
        url_suffix = re.sub("http[s]?://[^/]+/*", "", url)
        response = Mock()
        response.headers = {"Content-Type": "text/html"}
        path = self.__class__.PATH
        if path.exists():
            with open(path, "rb") as f_in:
                response.content = f_in.read()
        else:
            response.content = b"404"
        return response
class HTMLSessionMock(pgnp.handlers.HTMLSession):
    """HTMLSession with a fake get() method.

    Meant to mock requests_html.HTMLSession.
    Subclass this and set PATH to a local HTML file path.
    """
@@ -58,12 +81,17 @@ class TestBase(unittest.TestCase):
        self.tear_down_mock()

    def setup_mock(self):
        class mocker(HTMLSessionMock):
        class mocker(SessionMock):
            PATH = self.path / "input.html"

        class mocker_html(HTMLSessionMock):
            PATH = self.path / "input.html"

        hnd = sys.modules["pgnp"].handlers
        hnd.HTMLSessionOrig = hnd.HTMLSession
        hnd.HTMLSession = mocker
        hnd.HTMLSession = mocker_html
        hnd.SessionOrig = hnd.Session
        hnd.Session = mocker

    def tear_down_mock(self):
        hnd = sys.modules["pgnp"].handlers
        hnd.HTMLSession = hnd.HTMLSessionOrig
        hnd.Session = hnd.SessionOrig