snarfbot/web2text.py

52 lines
1.1 KiB
Python
Raw Normal View History

2020-12-29 05:06:37 +00:00
#!/usr/bin/env python3
# web2text: a dead simple script to get the important stuff
# from a website.
# As usual the WTFPL applies
import codecs
import click
from snarfbot.snarf3k import snarf, slugify
2021-04-04 03:38:41 +00:00
from snarfbot.linkview import textview, DEFAULT_USER_AGENT
from io import StringIO
2020-12-29 05:06:37 +00:00
@click.group()
def cli():
    # Root command group: the subcommands in this file register themselves
    # on it through @cli.command(). The body does no work of its own.
    # A comment is used instead of a docstring because click would display
    # a docstring as the group's help text.
    pass
@cli.command()
@click.argument('uri')
@click.option('--save', is_flag=True)
def scrape(uri, save):
    """
    Attempt an elegant scraping of the page.
    Save results to a file if desired
    """
    # NOTE: the docstring above doubles as the command's --help text in
    # click, so it is kept short and user-facing.
    #
    # snarf() returns an indexable result: element 0 is slugified to build
    # the output file name, element 1 is the content that gets written or
    # paged (presumably page title and extracted text -- confirm in snarf3k).
    pack = snarf(uri)
    name_source, content = pack[0], pack[1]

    if save:
        svsname = slugify(name_source) + '.txt'
        # The context manager guarantees the file is closed even when the
        # write raises, which the previous open()/close() pair did not.
        with codecs.open(svsname, "w", 'utf-8') as fp:
            fp.write(content)
    else:
        click.echo_via_pager(content)
2021-04-04 03:38:41 +00:00
@cli.command()
@click.argument('uri')
@click.option('--user-agent', default=DEFAULT_USER_AGENT, show_default=True)
def uglydump(uri, user_agent):
    """
    Less elegant but more functional page scrape dump to stdout
    """
    # NOTE: the docstring above doubles as the command's --help text in
    # click, so it is kept short and user-facing.
    #
    # textview() yields the string pieces of the rendered page; join them
    # in one pass rather than accumulating through a StringIO buffer.
    pieces = textview(uri, user_agent=user_agent)
    click.echo("".join(pieces))
2020-12-29 05:06:37 +00:00
# Dispatch to the click command group only when this file is executed as a
# script; importing the module must not trigger the CLI.
if __name__ == "__main__":
    cli()