44 lines
1.2 KiB
Python
44 lines
1.2 KiB
Python
from bs4 import BeautifulSoup
|
|
import urllib.request as urllib2
|
|
import re
|
|
# Browser-like User-Agent sent with every request; many sites reject or alter
# responses for the default urllib UA string.
DEFAULT_USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0"
|
|
|
|
|
|
def linkview(url, user_agent=DEFAULT_USER_AGENT):
    """Fetch *url* and return a list of absolute http/https hrefs found in its <a> tags.

    Parameters:
        url: page to fetch.
        user_agent: User-Agent header value (defaults to a browser-like string).

    Returns:
        list[str] of href values starting with http:// or https://
        (relative links are deliberately excluded by the regex filter).

    Raises:
        urllib.error.URLError / HTTPError on network or HTTP failures.
    """
    req = urllib2.Request(url, data=None, headers={'User-Agent': user_agent})
    # Close the response promptly instead of leaking the socket.
    with urllib2.urlopen(req) as html_page:
        soup = BeautifulSoup(html_page, features="lxml")
    links = []
    # find_all is the modern bs4 name; findAll is a deprecated alias.
    for link in soup.find_all('a', attrs={'href': re.compile("^http[s]?://")}):
        links.append(link.get('href'))
    return links
|
|
|
|
# Some websites employ way too much copy protection for my reader to deal with;
# for these sites we employ an equally brute-force method: strip all visible
# text out of the page, ignoring markup entirely.
|
|
def textview(url, user_agent=DEFAULT_USER_AGENT):
    """Fetch *url* and return its visible text content as a single string.

    Text nodes whose parent tag is in the blacklist (scripts, styles,
    head/meta boilerplate, etc.) are skipped; the rest are concatenated
    with a trailing space after each fragment.

    Parameters:
        url: page to fetch.
        user_agent: User-Agent header value (defaults to a browser-like string).

    Returns:
        str of the page's visible text.

    Raises:
        urllib.error.URLError / HTTPError on network or HTTP failures.
    """
    req = urllib2.Request(url, data=None, headers={'User-Agent': user_agent})
    # Close the response promptly instead of leaking the socket.
    with urllib2.urlopen(req) as html_page:
        soup = BeautifulSoup(html_page, features="lxml")
    # text=True yields every NavigableString in the document.
    text = soup.find_all(text=True)
    output = ''
    # Parents whose text is markup/boilerplate rather than page content.
    blacklist = [
        '[document]',
        'noscript',
        'header',
        'html',
        'meta',
        'head',
        'input',
        'script',
        'style',
    ]
    for t in text:
        if t.parent.name not in blacklist:
            output += '{} '.format(t)
    # BUG FIX: the original returned `text` (the raw node list), which made
    # the blacklist-filtering loop above dead code. Return the built string.
    return output
|