snarfbot/snarfbot/linkview.py


from bs4 import BeautifulSoup
import urllib.request as urllib2  # urllib.request aliased to the familiar py2 name
import re

DEFAULT_USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0"


def linkview(url, user_agent=DEFAULT_USER_AGENT):
    """Return every absolute http/https link found on the page at url."""
    req = urllib2.Request(url, data=None, headers={'User-Agent': user_agent})
    html_page = urllib2.urlopen(req)
    soup = BeautifulSoup(html_page, features="lxml")
    links = []
    for link in soup.find_all('a', attrs={'href': re.compile("^http[s]?://")}):
        links.append(link.get('href'))
    return links
# Some websites employ far too much copy protection for my reader to deal
# with; for those sites we use an equally brute-force method: strip the
# markup and return whatever visible text is left.
def textview(url, user_agent=DEFAULT_USER_AGENT):
    """Return the visible text of the page at url as a single string."""
    req = urllib2.Request(url, data=None, headers={'User-Agent': user_agent})
    html_page = urllib2.urlopen(req)
    soup = BeautifulSoup(html_page, features="lxml")
    text = soup.find_all(text=True)
    output = ''
    # Tags whose text content is never human-readable page copy.
    blacklist = [
        '[document]',
        'noscript',
        'header',
        'html',
        'meta',
        'head',
        'input',
        'script',
        'style',
    ]
    for t in text:
        if t.parent.name not in blacklist:
            output += '{} '.format(t)
    return output
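

# Minimal usage sketch, assuming network access and an installed lxml parser;
# https://example.com is only a placeholder URL for illustration.
if __name__ == '__main__':
    demo_url = 'https://example.com'
    print('Links on {}:'.format(demo_url))
    for href in linkview(demo_url):
        print('  ' + href)
    print()
    print(textview(demo_url))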