Add uglydump subcommand to web2text.

Matt Arnold 2021-04-03 23:38:41 -04:00
parent f0caf13470
commit 4f5efa818b
3 changed files with 61 additions and 4 deletions

View File

@@ -2,3 +2,11 @@
This will eventually be a web crawler that saves websites in plaintext files.
For now please enjoy a few cli tools, written as POC. Comments, compliments, complaints, and pull requests accepted.
## web2text
Command line tool that does exactly what it says on the tin: extract the content of a web document to plain text, with a choice of two scraping engines.
The scrape command attempts scraping with Newspaper3k, which produces a pretty text file and tries to filter out things like comment sections, page navigation links, and so forth. However, it may truncate long pages, has trouble with some JavaScript navigation elements, and uses a fairly obvious user agent that may be blocked or limited by some sites.
The uglydump command dumps the contents of a page with minimal filtering, using a spoofed user agent by default. You may get JavaScript source and style information in your output, but minimal filtering was chosen in order not to lose potentially important data. The default user agent is a current-ish version of Firefox on Ubuntu (X11).
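
Example invocations, assuming the tool is run as `web2text`; the URL and the custom user agent are placeholders, while `--save` and `--user-agent` are the options defined by the CLI:

```
web2text scrape https://example.com --save
web2text uglydump https://example.com
web2text uglydump https://example.com --user-agent "SomeOtherAgent/1.0"
```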

View File

@@ -1,9 +1,11 @@
from bs4 import BeautifulSoup
import urllib.request as urllib2
import re

# Spoof a stock Firefox 86 on Ubuntu/X11 (previously identified itself as SnarfBot running Firefox 84).
DEFAULT_USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0"

def linkview(url, user_agent=DEFAULT_USER_AGENT):
    req = urllib2.Request(url, data=None, headers={'User-Agent': user_agent})
    html_page = urllib2.urlopen(req)
    soup = BeautifulSoup(html_page, features="lxml")
    links = []
@@ -11,4 +13,31 @@ def linkview(url, user_agent=DEFAULT_USER_AGENT):
    for link in soup.findAll('a', attrs={'href': re.compile("^http[s]?://")}):
        links.append(link.get('href'))
    return links
# Some websites employ way too much copy protection for my reader to deal with.
# For these sites we employ an equally brute-force method of dealing with it:
# grab every text node and keep anything whose parent tag is not blacklisted.
def textview(url, user_agent=DEFAULT_USER_AGENT):
    req = urllib2.Request(url, data=None, headers={'User-Agent': user_agent})
    html_page = urllib2.urlopen(req)
    soup = BeautifulSoup(html_page, features="lxml")
    text = soup.find_all(text=True)
    output = ''
    # Text nodes under these tags are markup, scripts, styles, or other non-content; drop them.
    blacklist = [
        '[document]',
        'noscript',
        'header',
        'html',
        'meta',
        'head',
        'input',
        'script',
        'style',
    ]
    for t in text:
        if t.parent.name not in blacklist:
            output += '{} '.format(t)
    return output
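
A minimal usage sketch of these helpers, assuming the package is importable as `snarfbot.linkview`; the URL is a placeholder:

```python
from snarfbot.linkview import linkview, textview, DEFAULT_USER_AGENT

url = "https://example.com"  # placeholder URL for illustration
links = linkview(url)        # absolute http(s) links found on the page
text = textview(url, user_agent=DEFAULT_USER_AGENT)  # visible text, minimally filtered

print(len(links), "links found")
print(text[:200])            # first 200 characters of the dump
```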

View File

@@ -7,6 +7,8 @@
import codecs
import click
from snarfbot.snarf3k import snarf, slugify
from snarfbot.linkview import textview, DEFAULT_USER_AGENT
from io import StringIO
@click.group()
def cli():
@@ -17,7 +19,11 @@ def cli():
@click.argument('uri')
@click.option('--save', is_flag=True)
def scrape(uri, save):
    """
    Attempt an elegant scraping of the page.
    Save the results to a file if desired.
    """
    pack = snarf(uri)
    if save:
        svsname = slugify(pack[0]) + '.txt'
@@ -27,5 +33,19 @@ def scrape(uri, save):
    else:
        click.echo_via_pager(pack[1])

@cli.command()
@click.argument('uri')
@click.option('--user-agent', default=DEFAULT_USER_AGENT, show_default=True)
def uglydump(uri, user_agent):
    """
    Less elegant, but more functional: dump the page text, minimally filtered, to stdout.
    """
    dump = StringIO()
    dump.write(textview(uri, user_agent=user_agent))
    click.echo(dump.getvalue())
if __name__ == '__main__':
    cli()
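
For a quick check of the new subcommand without installing a console entry point, click's test runner can invoke the group directly. This is a sketch; the import path is an assumption, so adjust it to wherever the `cli` group above actually lives:

```python
from click.testing import CliRunner

from web2text import cli  # hypothetical import path for the CLI module above

runner = CliRunner()
result = runner.invoke(cli, ['uglydump', 'https://example.com'])  # placeholder URL
print(result.exit_code)      # 0 on success
print(result.output[:200])   # beginning of the dumped text
```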