fix WHYY, switch to BeautifulSoup
requests_html fails to parse some valid HTML (e.g. a <p> element without a closing </p>), and BeautifulSoup seems easier to use anyway. Switching the WHYY handler to BeautifulSoup, and maybe the rest down the road.
This commit is contained in:
parent
86d25eeec3
commit
9e4ca52721
208
pgnp/handlers.py
208
pgnp/handlers.py
|
@ -2,6 +2,8 @@ import re
|
|||
import sys
|
||||
from pathlib import Path
|
||||
from requests_html import HTMLSession
|
||||
from bs4 import BeautifulSoup
|
||||
from requests import Session
|
||||
from requests.exceptions import ReadTimeout, ConnectionError
|
||||
|
||||
PREFIX = """Gemini proxies of a few local news sources:"""
|
||||
|
@ -41,7 +43,8 @@ def slug(txt):
|
|||
"""Replace all non-alphanumeric characters with dashes."""
|
||||
return re.sub("[^A-Za-z0-9]", "-", txt)
|
||||
|
||||
class Handler:
|
||||
class HandlerOld:
|
||||
"""The older Handler implementation, based on requests-html."""
|
||||
|
||||
def __init__(self, path):
|
||||
self.session = HTMLSession()
|
||||
|
@ -94,7 +97,7 @@ class Handler:
|
|||
return link.replace(self.root_html, self.root_gmni)
|
||||
|
||||
|
||||
class HandlerDefault(Handler):
|
||||
class HandlerDefault(HandlerOld):
|
||||
|
||||
def parse(self):
|
||||
self.output += PREFIX + "\r\n\r\n"
|
||||
|
@ -104,7 +107,7 @@ class HandlerDefault(Handler):
|
|||
self.output += SUFFIX + "\r\n"
|
||||
|
||||
|
||||
class HandlerCitizen(Handler):
|
||||
class HandlerCitizen(HandlerOld):
|
||||
|
||||
def __init__(self, path):
|
||||
super().__init__(path)
|
||||
|
@ -257,7 +260,7 @@ class HandlerCitizen(Handler):
|
|||
|
||||
# Though I just found: https://billypenn.com/wp-json/ !
|
||||
# and now also https://thephiladelphiacitizen.org/wp-json !!
|
||||
class HandlerBillyPenn(Handler):
|
||||
class HandlerBillyPenn(HandlerOld):
|
||||
|
||||
def __init__(self, path):
|
||||
# I don't know why my use of the status code 10 seems to do .../?query
|
||||
|
@ -419,7 +422,7 @@ class HandlerBillyPenn(Handler):
|
|||
return output
|
||||
|
||||
|
||||
class HandlerWHYYNews(Handler):
|
||||
class HandlerWHYYNewsOld(HandlerOld):
|
||||
|
||||
def __init__(self, path):
|
||||
super().__init__(path)
|
||||
|
@ -526,6 +529,201 @@ class HandlerWHYYNews(Handler):
|
|||
return output
|
||||
|
||||
|
||||
class Handler:
    """Page handler built on requests and BeautifulSoup instead of requests-html."""

    def __init__(self, path):
        # Split "host/rest-of-path" into the bare host and everything after it.
        self.path_base = re.sub("/.*$", "", path)
        self.path = re.sub("^[^/?]+", "", path)
        self.session = Session()
        # Subclasses point these at the proxied HTML site and its gemini mirror.
        self.root_html = ""
        self.root_gmni = ""
        # Gemini response defaults: success status, gemtext body.
        self.status_code = "20"
        self.ctype = "text/gemini"
        self.output = ""

    def get(self):
        """Fetch the proxied page; on network failure return None and leave an
        explanatory gemtext message in self.output."""
        try:
            return self.session.get(self.root_html + self.path, timeout=TIMEOUT)
        except (ReadTimeout, ConnectionError):
            self.output = "Network error accessing:\r\n\r\n"
            self.output += f"=> {self.root_gmni}{self.path}\r\n"
            return None

    def parse(self):
        """Fallback parser: report that this host has no dedicated handler."""
        self.output = f"No handler defined for {self.path_base}\r\n"
        self.output += "=> . Go back to the index\r\n"

    def make_proxy_snippet(self):
        """Gemtext links back to the HTML original and the proxy home page."""
        pairs = (
            (self.root_html + self.path, "HTML Original"),
            (self.root_gmni, "Proxy Home Page"),
        )
        snippet = "\r\n".join(f"=> {url} {label}" for url, label in pairs)
        return snippet + "\r\n\r\n"

    def render(self):
        """Write the Gemini response (status/meta header, then body) to stdout."""
        out = sys.stdout.buffer
        out.write(f"{self.status_code} {self.ctype}\r\n".encode())
        # Text bodies (and "10" input prompts) are str; anything else (images)
        # is already raw bytes.
        if self.status_code == "10" or self.ctype.startswith("text/"):
            out.write(self.output.encode())
        else:
            out.write(self.output)

    def href(self, anchor, attrib="href"):
        """Map a link from the HTML site into proxy (gemini) space.

        Accepts either a BeautifulSoup Tag (reads its attrib) or a bare
        string link.
        """
        try:
            link = anchor.attrs.get(attrib, "")
        except AttributeError:
            # Not a Tag: treat the argument itself as the link text.
            link = anchor
        if not link or not self.root_html or not self.root_gmni:
            return link
        if link.startswith("/"):
            # Site-absolute path: anchor it at the proxy root.
            return self.root_gmni + link
        return link.replace(self.root_html, self.root_gmni)
|
||||
|
||||
|
||||
class HandlerWHYYNews(Handler):
    """BeautifulSoup-based handler for https://whyy.org/ pages."""

    def __init__(self, path):
        super().__init__(path)
        self.root_html = "https://whyy.org/"
        self.root_gmni = ROOT + "whyy.org/"

    def parse(self):
        """Fetch self.path and render it as gemtext (images pass through raw)."""
        r = self.get()
        if not r:
            # get() already filled self.output with a network-error message.
            return

        if r.headers["Content-Type"] in ["image/png", "image/jpeg"]:
            # Fix: previously this always set "image/png", mislabeling jpegs.
            # Serve the image with its actual content type.
            self.ctype = r.headers["Content-Type"]
            self.output = r.content
            return

        output = ""

        html = BeautifulSoup(r.content, features="lxml")
        output += self.parse_main_title(html)
        output += self.make_proxy_snippet()
        output += self.parse_secondary_title(html)
        # Three page shapes: a regular article, an NPR-syndicated story,
        # or a listing page full of teasers.
        actual_articles = html.find_all("article", class_="article")
        if actual_articles:
            output += self.parse_article(html)
        elif html.find_all("article", class_="npr_story_post"):
            output += self.parse_article_npr(html)
        else:
            output += self.parse_article_jumble(html)

        output += self.parse_menu(html)
        self.output = output

    def parse_main_title(self, html):
        """Return the site title (header logo link text) as a gemtext h1,
        or "" when not found."""
        output = ""
        for header in html.find_all("header"):
            for logo_div in header.find_all("div", class_="site-logo"):
                for title in logo_div.find_all("a"):
                    title = title.get_text()
                    if title:
                        output = f"# {title}\r\n\r\n"
        return output

    def parse_secondary_title(self, html):
        """Return the page's own <h1> (inside <main><header>) as a gemtext h2,
        or "" when not found."""
        output = ""
        for main in html.find_all("main"):
            for header in main.find_all("header"):
                for h1 in header.find_all("h1"):
                    h1 = h1.get_text()
                    output = f"## {h1}\r\n\r\n"
        return output

    def parse_article_jumble(self, html):
        """Render a listing page: one gemtext link per article teaser."""
        output = ""
        for teaser in html.find_all("article", class_="content-mode--teaser"):
            for h2_hack in teaser.find_all("p", class_="h2-hack"):
                anchor = h2_hack.find_all("a")
                # Only trust teasers with exactly one link.
                if len(anchor) == 1:
                    url = self.href(anchor[0])
                    title = anchor[0].get_text().strip()
                    output += f"=> {url} {title}\r\n"
        return output

    def matching_classes(self, elem, classes):
        """True if any given classes are present for elem, False otherwise."""
        try:
            classes_elem = elem["class"]
        except (TypeError, KeyError):
            # Non-Tag nodes raise TypeError; class-less tags raise KeyError.
            return False
        return any(reject in classes_elem for reject in classes)

    def parse_article_meta(self, article):
        """Scrape (author, date, imgsrc, caption) from the article header.

        Any piece that is missing from the page comes back as "".
        """
        # Fix: initialize everything up front -- previously a page missing a
        # byline or figure left these names unbound and raised NameError on
        # the return statement.
        author = date = imgsrc = caption = ""
        header = article.find_all("header")[0]
        for entry_meta in header.find_all("div", class_="entry-meta"):
            # Loops keep the last match found, mirroring the page order.
            for author_item in entry_meta.find_all("li"):
                author = author_item.get_text()
            for date_span in entry_meta.find_all("span", class_="byline-date"):
                date = date_span.get_text()
        for figure in header.find_all("figure"):
            # They're doing some screwy javascript thing instead of just giving
            # a dang img src
            for img in figure.find_all("img", class_="b-lazy"):
                imgsrc = self.href(img, "data-src")
            for figcaption in figure.find_all("figcaption"):
                caption = figcaption.get_text()
        return (author, date, imgsrc, caption)

    def parse_article(self, html):
        """Render a standard article page as gemtext."""
        output = ""
        article = html.find_all("article", class_="article")[0]
        author, date, imgsrc, caption = self.parse_article_meta(article)
        article_chunks = []
        # Ad blocks and "related content" cruft to skip.
        rejects = ["block--related-content", "block--doubleclick", "side-ad", "promo-block"]
        for detail_content in article.find_all("div", class_="detail-content"):
            for flex_content in detail_content.find_all("div", class_="flexible-content-container"):
                if self.matching_classes(flex_content, rejects):
                    continue
                for child in flex_content.children:
                    if self.matching_classes(child, rejects):
                        continue
                    try:
                        text = child.get_text().replace("\n", " ")
                    except AttributeError:
                        # Child node with no get_text(); nothing to render.
                        continue
                    if child.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
                        # Flatten every HTML heading level to a gemtext h3.
                        text = "### " + text
                    article_chunks.append(text)

        output += f"By {author} - {date}\r\n\r\n"
        output += f"=> {imgsrc} Figure: {caption}\r\n\r\n"
        for chunk in article_chunks:
            output += chunk + "\r\n\r\n"
        return output

    def parse_article_npr(self, html):
        """Render an NPR-syndicated story page as gemtext."""
        output = ""
        article = html.find_all("article", class_="npr_story_post")[0]
        author, date, imgsrc, caption = self.parse_article_meta(article)
        output += f"By {author} - {date}\r\n\r\n"
        output += f"=> {imgsrc} Figure: {caption}\r\n\r\n"
        for content in article.find_all("div", class_="npr-content"):
            output += content.get_text().replace("\n", "\r\n\r\n")
        return output

    def parse_menu(self, html):
        """Render the site's main navigation menu as a gemtext link list."""
        output = ""
        menus = html.find_all("ul", id="menu-header-menu-main")
        if not menus:
            return output
        anchors = menus[0].find_all("a")
        if anchors:
            output = "\r\n## Site Menu\r\n\r\n"
        for anchor in anchors:
            url = self.href(anchor)
            title = anchor.get_text()
            output += f"=> {url} {title}\r\n"
        return output
|
||||
|
||||
|
||||
HANDLERS = {
|
||||
"": HandlerDefault,
|
||||
"thephiladelphiacitizen.org": HandlerCitizen,
|
||||
|
|
|
@ -6,9 +6,32 @@ from pathlib import Path
|
|||
from unittest.mock import (Mock, create_autospec, DEFAULT)
|
||||
from requests_html import HTML
|
||||
|
||||
class SessionMock(pgnp.handlers.Session):
    """Session with a fake get() method.

    Meant to mock requests.Session.
    Subclass this and set PATH to a local HTML file path.
    """

    PATH = Path("/")

    def get(self, url, **kwargs):
        """Fake a GET request from local disk, with mock response object."""
        # Fix: the pattern was "http[s]://", which *requires* the "s" and so
        # never matched plain http:// URLs.
        # NOTE(review): url_suffix is currently unused -- presumably intended
        # to select a fixture per request path; confirm or remove.
        url_suffix = re.sub("https?://[^/]+/*", "", url)
        response = Mock()
        response.headers = {"Content-Type": "text/html"}
        path = self.__class__.PATH
        if path.exists():
            with open(path, "rb") as f_in:
                response.content = f_in.read()
        else:
            # Keep .content bytes in both branches, matching the "rb" read
            # above (handlers may write it straight to a binary stream).
            response.content = b"404"
        return response
|
||||
|
||||
class HTMLSessionMock(pgnp.handlers.HTMLSession):
|
||||
"""HTMLSession with a fake get() method.
|
||||
|
||||
Meant to mock requests_html.HTMLSession.
|
||||
Subclass this and set PATH to a local HTML file path.
|
||||
"""
|
||||
|
||||
|
@ -58,12 +81,17 @@ class TestBase(unittest.TestCase):
|
|||
self.tear_down_mock()
|
||||
|
||||
def setup_mock(self):
|
||||
class mocker(HTMLSessionMock):
|
||||
class mocker(SessionMock):
|
||||
PATH = self.path / "input.html"
|
||||
class mocker_html(HTMLSessionMock):
|
||||
PATH = self.path / "input.html"
|
||||
hnd = sys.modules["pgnp"].handlers
|
||||
hnd.HTMLSessionOrig = hnd.HTMLSession
|
||||
hnd.HTMLSession = mocker
|
||||
hnd.HTMLSession = mocker_html
|
||||
hnd.SessionOrig = hnd.Session
|
||||
hnd.Session = mocker
|
||||
|
||||
def tear_down_mock(self):
|
||||
hnd = sys.modules["pgnp"].handlers
|
||||
hnd.HTMLSession = hnd.HTMLSessionOrig
|
||||
hnd.Session = hnd.SessionOrig
|
||||
|
|
Loading…
Reference in New Issue