snarfbot/web2text.py

#!/usr/bin/env python3
# web2text: a dead simple script to get the important stuff
# from a website.
# As usual the WTFPL applies
import codecs
import re
from io import StringIO

import click
import newspaper
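
# click and newspaper are third-party imports; on Python 3 the `newspaper`
# module typically comes from the newspaper3k distribution.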

@click.group()
def cli():
    pass

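# cli() is the click command group; subcommands register on it below via
# @cli.command().
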
def slugify(s):
    """
    Simplifies ugly strings into something URL-friendly.

    >>> print(slugify("[Some] _ Article's Title--"))
    some-articles-title
    """
    # "[Some] _ Article's Title--"
    # "[some] _ article's title--"
    s = s.lower()

    # "[some] _ article's title--"
    # "[some]___article's_title__"
    for c in [' ', '-', '.', '/']:
        s = s.replace(c, '_')

    # "[some]___article's_title__"
    # "some___articles_title__"
    s = re.sub(r'\W', '', s)

    # "some___articles_title__"
    # "some   articles title  "
    s = s.replace('_', ' ')

    # "some   articles title  "
    # "some articles title "
    s = re.sub(r'\s+', ' ', s)

    # "some articles title "
    # "some articles title"
    s = s.strip()

    # "some articles title"
    # "some-articles-title"
    s = s.replace(' ', '-')
    return s

def pprint_list(l):
    """Render a list (e.g. article authors) as a comma-separated string."""
    return ", ".join(str(i) for i in l)

@cli.command()
@click.argument('uri')
@click.option('--save', is_flag=True)
def scrape(uri, save):
    """Scrape URI; page the text, or write it to a slugified .txt with --save."""
    title, text = snarf(uri)
    if save:
        svsname = slugify(title) + '.txt'
        with codecs.open(svsname, 'w', 'utf-8') as fp:
            fp.write(text)
    else:
        click.echo_via_pager(text)

def snarf(uri):
    """Download and parse an article; return (title, formatted text)."""
    target = newspaper.Article(uri)
    target.download()
    target.parse()
    header = target.title + "\n" + "By " + pprint_list(target.authors)
    header += "\n\n"
    header += str(target.publish_date) + "\nFrom: " + uri
    header += "\n\n"
    fp = StringIO()
    fp.write(header)
    sep = '-' * 32 + "\n\n"
    fp.write(sep)
    fp.write(target.text)
    return (target.title, fp.getvalue())
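
# Illustrative layout of the text snarf() builds (all values are placeholders):
#
#   Some Article Title
#   By Jane Doe, John Smith
#
#   2020-12-29 05:06:37
#   From: https://example.com/some-article
#
#   --------------------------------
#
#   Article body text...
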
if __name__ == '__main__':
    cli()
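
# Usage sketch (the URL is a placeholder):
#   python3 web2text.py scrape https://example.com/some-article         # view in a pager
#   python3 web2text.py scrape --save https://example.com/some-article  # write some-article-title.txt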