#!/usr/bin/env python3
|
|
|
|
# web2text: a dead simple script to get the important stuff
# from a website.
#
# As usual the WTFPL applies.
|
|
import codecs
|
|
import click
|
|
from snarfbot.snarf3k import snarf, slugify
|
|
from snarfbot.linkview import textview, DEFAULT_USER_AGENT
|
|
from io import StringIO
|
|
|
|
# Root command group for the web2text CLI. It carries no logic of its own;
# subcommands attach themselves below via the @cli.command() decorator.
# NOTE: intentionally no docstring — click would surface it as group help text.
@click.group()
def cli():
    pass
|
|
|
|
|
|
@cli.command()
@click.argument('uri')
@click.option('--save', is_flag=True)
def scrape(uri, save):
    """
    Attempt an elegant scraping of the page at URI.

    With --save, write the extracted text to a file named after the
    slugified page title; otherwise page the text to the terminal.
    """
    # snarf() appears to return an indexable pair: [0] = page title,
    # [1] = extracted text — TODO confirm against snarfbot.snarf3k.
    pack = snarf(uri)
    if save:
        svsname = slugify(pack[0]) + '.txt'
        # Context manager guarantees the handle is closed even if write()
        # raises; the original codecs.open() leaked the handle on error.
        # Builtin open() with encoding= supersedes the legacy codecs.open().
        with open(svsname, 'w', encoding='utf-8') as fp:
            fp.write(pack[1])
    else:
        click.echo_via_pager(pack[1])
|
|
|
|
@cli.command()
@click.argument('uri')
@click.option('--user-agent', default=DEFAULT_USER_AGENT, show_default=True)
def uglydump(uri, user_agent):
    """
    Less elegant but more functional page-scrape dump to stdout.
    """
    # textview() yields text segments; join them directly instead of the
    # original approach of accumulating through an intermediate StringIO
    # buffer (which also used the lint-hostile single-letter name 'l').
    click.echo(''.join(textview(uri, user_agent=user_agent)))
|
|
|
|
# Script entry point: hand control to the click command group.
if __name__ == '__main__':
    cli()
|