snarfbot/web2text.py

#!/usr/bin/env python3

# web2text: a dead simple script to get the important stuff
# from a website.

# As usual the WTFPL applies
import codecs
import click
from snarfbot.snarf3k import snarf, slugify
from snarfbot.linkview import textview, DEFAULT_USER_AGENT
from io import StringIO

@click.group()
def cli():
    """web2text: get the important stuff from a website."""
    # Group callback only: it does no work itself. The subcommands attach
    # to it through @cli.command(). The docstring above is what click shows
    # as the description in `--help`.


@cli.command()
@click.argument('uri')
@click.option('--save', is_flag=True)
def scrape(uri, save):
    """
    Attempt an elegant scraping of the page.

    With --save, write the result to '<slug>.txt' in the current
    directory; otherwise show it in a pager.
    """
    # snarf() is only ever indexed here: element 0 feeds the file name and
    # element 1 is the content that gets displayed or written.
    # NOTE(review): presumably (title, text) -- confirm in snarfbot.snarf3k.
    pack = snarf(uri)

    if not save:
        click.echo_via_pager(pack[1])
        return

    svsname = slugify(pack[0]) + '.txt'
    # The context manager closes the file even if write() raises.
    # newline="" disables newline translation, so the bytes on disk match
    # what the previous codecs.open(..., "w", "utf-8") call produced.
    with open(svsname, "w", encoding="utf-8", newline="") as fp:
        fp.write(pack[1])

@cli.command()
@click.argument('uri')
@click.option('--user-agent', default=DEFAULT_USER_AGENT, show_default=True)
def uglydump(uri, user_agent):
    """
    Less elegant but more functional page scrape dump to stdout
    """
    # textview() is consumed as an iterable of string pieces; they are
    # concatenated in order and echoed in a single call.
    pieces = textview(uri, user_agent=user_agent)
    click.echo("".join(pieces))

# Run the click command group when this file is executed as a script;
# nothing is invoked when the module is merely imported.
if __name__ == '__main__':
    cli()
Inital Commit 2020-12-29 05:06:37 +00:00			`#!/usr/bin/env python3`

			`# web2text: a dead simple script to get the important stuff`
			`# From a website.`

			`# As usual the WTFPL applies`
			`import codecs`
			`import click`
Refactor in preperation building the crawler 2020-12-31 02:46:12 +00:00			`from snarfbot.snarf3k import snarf, slugify`
Add uglydump subcommand to web2text. 2021-04-04 03:38:41 +00:00			`from snarfbot.linkview import textview, DEFAULT_USER_AGENT`
			`from io import StringIO`
Inital Commit 2020-12-29 05:06:37 +00:00
			`@click.group()`
			`def cli():`
			`pass`


			`@cli.command()`
			`@click.argument('uri')`
			`@click.option('--save', is_flag=True)`
			`def scrape(uri, save):`
Add uglydump subcommand to web2text. 2021-04-04 03:38:41 +00:00			`"""`
			`Attempt an elagent scraping of the page.`
			`Save results to a file if desired`

			`"""`
Inital Commit 2020-12-29 05:06:37 +00:00			`pack = snarf(uri)`
			`if save:`
			`svsname = slugify(pack[0]) + '.txt'`
			`fp = codecs.open(svsname, "w", 'utf-8')`
			`fp.write(pack[1])`
			`fp.close()`
			`else:`
			`click.echo_via_pager(pack[1])`

Add uglydump subcommand to web2text. 2021-04-04 03:38:41 +00:00			`@cli.command()`
			`@click.argument('uri')`
			`@click.option('--user-agent', default=DEFAULT_USER_AGENT, show_default=True)`
			`def uglydump(uri, user_agent):`
			`"""`
			`Less Elagent but more functional page scrape dump to stdout`
			`"""`

			`dump = StringIO()`
			`l = textview(uri, user_agent=user_agent)`
			`for s in l:`
			`dump.write(s)`
			`click.echo(dump.getvalue())`

Inital Commit 2020-12-29 05:06:37 +00:00			`if __name__ == '__main__':`
			`cli()`