fix WHYY, switch to BeautifulSoup

requests_html fails to parse some valid HTML (e.g. a <p> without a
closing </p>), and BeautifulSoup seems easier to use anyway.  Switching
to it for WHYY now, and maybe the rest down the road.
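As a minimal sketch of the behavior relied on here (the markup is made up;
it uses the same lxml parser the new handler code passes to BeautifulSoup):

from bs4 import BeautifulSoup

# A <p> with no closing </p> is legal HTML; lxml closes it implicitly,
# so both paragraphs come back from find_all().
html = BeautifulSoup("<article><p>first<p>second</article>", features="lxml")
print([p.get_text() for p in html.find_all("p")])  # ['first', 'second']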
epiii2 2022-09-06 20:01:21 +00:00
parent 86d25eeec3
commit 9e4ca52721
2 changed files with 233 additions and 7 deletions

View File

@@ -2,6 +2,8 @@ import re
import sys
from pathlib import Path
from requests_html import HTMLSession
from bs4 import BeautifulSoup
from requests import Session
from requests.exceptions import ReadTimeout, ConnectionError
PREFIX = """Gemini proxies of a few local news sources:"""
@@ -41,7 +43,8 @@ def slug(txt):
    """Replace all non-alphanumeric characters with dashes."""
    return re.sub("[^A-Za-z0-9]", "-", txt)
class Handler:
class HandlerOld:
    """The older Handler implementation, based on requests-html."""
    def __init__(self, path):
        self.session = HTMLSession()
@@ -94,7 +97,7 @@ class Handler:
        return link.replace(self.root_html, self.root_gmni)
class HandlerDefault(Handler):
class HandlerDefault(HandlerOld):
    def parse(self):
        self.output += PREFIX + "\r\n\r\n"
@@ -104,7 +107,7 @@ class HandlerDefault(Handler):
        self.output += SUFFIX + "\r\n"
class HandlerCitizen(Handler):
class HandlerCitizen(HandlerOld):
    def __init__(self, path):
        super().__init__(path)
@@ -257,7 +260,7 @@ class HandlerCitizen(Handler):
# Though I just found: https://billypenn.com/wp-json/ !
# and now also https://thephiladelphiacitizen.org/wp-json !!
class HandlerBillyPenn(Handler):
class HandlerBillyPenn(HandlerOld):
    def __init__(self, path):
        # I don't know why my use of the status code 10 seems to do .../?query
@@ -419,7 +422,7 @@ class HandlerBillyPenn(Handler):
        return output
class HandlerWHYYNews(Handler):
class HandlerWHYYNewsOld(HandlerOld):
    def __init__(self, path):
        super().__init__(path)
@@ -526,6 +529,201 @@ class HandlerWHYYNews(Handler):
        return output


class Handler:
    """Page handler built on requests and BeautifulSoup instead of requests-html."""

    def __init__(self, path):
        self.session = Session()
        self.path_base = re.sub("/.*$", "", path)
        self.path = re.sub("^[^/?]+", "", path)
        self.root_html = ""
        self.root_gmni = ""
        self.status_code = "20"
        self.ctype = "text/gemini"
        self.output = ""

    def get(self):
        try:
            response = self.session.get(self.root_html + self.path, timeout=TIMEOUT)
        except (ReadTimeout, ConnectionError) as error:
            self.output = "Network error accessing:\r\n\r\n"
            self.output += f"=> {self.root_gmni}{self.path}\r\n"
            response = None
        return response

    def parse(self):
        self.output = f"No handler defined for {self.path_base}\r\n"
        self.output += "=> . Go back to the index\r\n"

    def make_proxy_snippet(self):
        links = [
            (self.root_html + self.path, "HTML Original"),
            (self.root_gmni, "Proxy Home Page")]
        output = "\r\n".join([f"=> {lnk[0]} {lnk[1]}" for lnk in links])
        output += "\r\n\r\n"
        return output

    def render(self):
        sys.stdout.buffer.write(f"{self.status_code} {self.ctype}\r\n".encode())
        if self.status_code == "10" or self.ctype.startswith("text/"):
            sys.stdout.buffer.write(self.output.encode())
        else:
            sys.stdout.buffer.write(self.output)

    def href(self, anchor, attrib="href"):
        try:
            link = anchor.attrs.get(attrib, "")
        except AttributeError:
            link = anchor
        if not link or not self.root_html or not self.root_gmni:
            return link
        if link.startswith("/"):
            return self.root_gmni + link
        else:
            return link.replace(self.root_html, self.root_gmni)
class HandlerWHYYNews(Handler):
    def __init__(self, path):
        super().__init__(path)
        self.root_html = "https://whyy.org/"
        self.root_gmni = ROOT + "whyy.org/"

    def parse(self):
        r = self.get()
        if not r:
            return
        if r.headers["Content-Type"] in ["image/png", "image/jpeg"]:
            self.ctype = r.headers["Content-Type"]
            self.output = r.content
            return
        output = ""
        html = BeautifulSoup(r.content, features="lxml")
        output += self.parse_main_title(html)
        output += self.make_proxy_snippet()
        output += self.parse_secondary_title(html)
        actual_articles = html.find_all("article", class_="article")
        if actual_articles:
            output += self.parse_article(html)
        elif html.find_all("article", class_="npr_story_post"):
            output += self.parse_article_npr(html)
        else:
            output += self.parse_article_jumble(html)
        output += self.parse_menu(html)
        self.output = output
    def parse_main_title(self, html):
        output = ""
        for header in html.find_all("header"):
            for logo_div in header.find_all("div", class_="site-logo"):
                for title in logo_div.find_all("a"):
                    title = title.get_text()
                    if title:
                        output = f"# {title}\r\n\r\n"
        return output

    def parse_secondary_title(self, html):
        output = ""
        for main in html.find_all("main"):
            for header in main.find_all("header"):
                for h1 in header.find_all("h1"):
                    h1 = h1.get_text()
                    output = f"## {h1}\r\n\r\n"
        return output

    def parse_article_jumble(self, html):
        output = ""
        for teaser in html.find_all("article", class_="content-mode--teaser"):
            for h2_hack in teaser.find_all("p", class_="h2-hack"):
                anchor = h2_hack.find_all("a")
                if len(anchor) == 1:
                    url = self.href(anchor[0])
                    title = anchor[0].get_text().strip()
                    output += f"=> {url} {title}\r\n"
        return output

    def matching_classes(self, elem, classes):
        """True if any given classes are present for elem, False otherwise."""
        try:
            classes_elem = elem["class"]
        except (TypeError, KeyError):
            return False
        return any(reject in classes_elem for reject in classes)
    def parse_article_meta(self, article):
        # Fall back to empty strings when a metadata element is missing.
        author = date = imgsrc = caption = ""
        header = article.find_all("header")[0]
        for entry_meta in header.find_all("div", class_="entry-meta"):
            for author in entry_meta.find_all("li"):
                author = author.get_text()
            for date in entry_meta.find_all("span", class_="byline-date"):
                date = date.get_text()
        for figure in header.find_all("figure"):
            # They're doing some screwy javascript thing instead of just giving
            # a dang img src
            for img in figure.find_all("img", class_="b-lazy"):
                imgsrc = self.href(img, "data-src")
            for figcaption in figure.find_all("figcaption"):
                caption = figcaption.get_text()
        return (author, date, imgsrc, caption)
    def parse_article(self, html):
        output = ""
        article = html.find_all("article", class_="article")[0]
        author, date, imgsrc, caption = self.parse_article_meta(article)
        article_chunks = []
        rejects = ["block--related-content", "block--doubleclick", "side-ad", "promo-block"]
        for detail_content in article.find_all("div", class_="detail-content"):
            for flex_content in detail_content.find_all("div", class_="flexible-content-container"):
                if self.matching_classes(flex_content, rejects):
                    continue
                for child in flex_content.children:
                    if self.matching_classes(child, rejects):
                        continue
                    try:
                        text = child.get_text().replace("\n", " ")
                    except AttributeError:
                        continue
                    if child.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
                        text = "### " + text
                    article_chunks.append(text)
        output += f"By {author} - {date}\r\n\r\n"
        output += f"=> {imgsrc} Figure: {caption}\r\n\r\n"
        for chunk in article_chunks:
            output += chunk + "\r\n\r\n"
        return output

    def parse_article_npr(self, html):
        output = ""
        article = html.find_all("article", class_="npr_story_post")[0]
        author, date, imgsrc, caption = self.parse_article_meta(article)
        output += f"By {author} - {date}\r\n\r\n"
        output += f"=> {imgsrc} Figure: {caption}\r\n\r\n"
        for content in article.find_all("div", class_="npr-content"):
            output += content.get_text().replace("\n", "\r\n\r\n")
        return output

    def parse_menu(self, html):
        output = ""
        menus = html.find_all("ul", id="menu-header-menu-main")
        if not menus:
            return output
        anchors = menus[0].find_all("a")
        if anchors:
            output = "\r\n## Site Menu\r\n\r\n"
        for anchor in anchors:
            url = self.href(anchor)
            title = anchor.get_text()
            output += f"=> {url} {title}\r\n"
        return output
HANDLERS = {
    "": HandlerDefault,
    "thephiladelphiacitizen.org": HandlerCitizen,

View File

@@ -6,9 +6,32 @@ from pathlib import Path
from unittest.mock import (Mock, create_autospec, DEFAULT)
from requests_html import HTML
class SessionMock(pgnp.handlers.Session):
    """Session with a fake get() method.

    Meant to mock requests.Session.
    Subclass this and set PATH to a local HTML file path.
    """

    PATH = Path("/")

    def get(self, url, **kwargs):
        """Fake a GET request from local disk, with mock response object."""
        url_suffix = re.sub("http[s]?://[^/]+/*", "", url)
        response = Mock()
        response.headers = {"Content-Type": "text/html"}
        path = self.__class__.PATH
        if path.exists():
            with open(path, "rb") as f_in:
                response.content = f_in.read()
        else:
            response.content = b"404"
        return response
class HTMLSessionMock(pgnp.handlers.HTMLSession):
    """HTMLSession with a fake get() method.

    Meant to mock requests_html.HTMLSession.
    Subclass this and set PATH to a local HTML file path.
    """
@@ -58,12 +81,17 @@ class TestBase(unittest.TestCase):
        self.tear_down_mock()

    def setup_mock(self):
        class mocker(HTMLSessionMock):
        class mocker(SessionMock):
            PATH = self.path / "input.html"

        class mocker_html(HTMLSessionMock):
            PATH = self.path / "input.html"

        hnd = sys.modules["pgnp"].handlers
        hnd.HTMLSessionOrig = hnd.HTMLSession
        hnd.HTMLSession = mocker
        hnd.HTMLSession = mocker_html
        hnd.SessionOrig = hnd.Session
        hnd.Session = mocker

    def tear_down_mock(self):
        hnd = sys.modules["pgnp"].handlers
        hnd.HTMLSession = hnd.HTMLSessionOrig
        hnd.Session = hnd.SessionOrig