snarfbot/snarfbot/linkview.py


from bs4 import BeautifulSoup
import urllib.request as urllib2  # urllib.request aliased to the familiar py2 name
import re

DEFAULT_USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0"


def linkview(url, user_agent=DEFAULT_USER_AGENT):
    """Return every absolute http/https link found on the page at url."""
    req = urllib2.Request(url, data=None, headers={'User-Agent': user_agent})
    html_page = urllib2.urlopen(req)
    soup = BeautifulSoup(html_page, features="lxml")
    links = []
    for link in soup.find_all('a', attrs={'href': re.compile("^http[s]?://")}):
        links.append(link.get('href'))
    return links
# Some websites employ far too much copy protection for my reader to deal
# with; for those sites we use an equally brute-force method: strip the
# markup and return whatever visible text is left.
def textview(url, user_agent=DEFAULT_USER_AGENT):
    """Return the visible text of the page at url as a single string."""
    req = urllib2.Request(url, data=None, headers={'User-Agent': user_agent})
    html_page = urllib2.urlopen(req)
    soup = BeautifulSoup(html_page, features="lxml")
    text = soup.find_all(text=True)
    output = ''
    # Tags whose text content is never human-readable page copy.
    blacklist = [
        '[document]',
        'noscript',
        'header',
        'html',
        'meta',
        'head',
        'input',
        'script',
        'style',
    ]
    for t in text:
        if t.parent.name not in blacklist:
            output += '{} '.format(t)
    return output
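

# Minimal usage sketch, assuming network access and an installed lxml parser;
# https://example.com is only a placeholder URL for illustration.
if __name__ == '__main__':
    demo_url = 'https://example.com'
    print('Links on {}:'.format(demo_url))
    for href in linkview(demo_url):
        print('  ' + href)
    print()
    print(textview(demo_url))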