A hipster web crawler
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
snarfbot/web2text.py

51 lines
1.1 KiB

#!/usr/bin/env python3
# web2text: a dead simple script to get the important stuff
# From a website.
# As usual the WTFPL applies
import codecs
import click
from snarfbot.snarf3k import snarf, slugify
from snarfbot.linkview import textview, DEFAULT_USER_AGENT
from io import StringIO
@click.group()
def cli():
    """Top-level command group for the web2text scraper.

    Click renders this docstring as the --help text for the group,
    so the original bare ``pass`` body left the tool undocumented.
    """
@cli.command()
@click.argument('uri')
@click.option('--save', is_flag=True)
def scrape(uri, save):
    """
    Attempt an elegant scraping of the page.

    Save results to a file if desired, otherwise page the extracted
    text to the terminal.
    """
    # snarf() returns a (title, text) pair — presumably; confirm
    # against snarfbot.snarf3k if the tuple shape ever changes.
    pack = snarf(uri)
    if save:
        svsname = slugify(pack[0]) + '.txt'
        # Use a context manager so the file is closed even if the
        # write raises; the builtin open() accepts an encoding, making
        # codecs.open() (and the manual close()) unnecessary.
        with open(svsname, 'w', encoding='utf-8') as fp:
            fp.write(pack[1])
    else:
        click.echo_via_pager(pack[1])
@cli.command()
@click.argument('uri')
@click.option('--user-agent', default=DEFAULT_USER_AGENT, show_default=True)
def uglydump(uri, user_agent):
    """
    Less elegant but more functional page scrape, dumped to stdout.
    """
    # textview() yields text fragments; join them in one C-speed pass
    # instead of accumulating through an intermediate StringIO buffer
    # (also drops the single-letter local `l`, which PEP 8 discourages).
    click.echo(''.join(textview(uri, user_agent=user_agent)))
# Allow running this module directly as a script; under normal
# installation the `cli` group is reached via an entry point instead.
if __name__ == '__main__':
    cli()