#!/usr/bin/env python3
# web2text: a dead-simple script to get the important stuff
# from a website.
# As usual, the WTFPL applies.
import codecs
import re
from io import StringIO

import click
import newspaper


@click.group()
def cli():
    pass


def slugify(s):
    """
    Simplifies ugly strings into something URL-friendly.

    >>> print(slugify("[Some] _ Article's Title--"))
    some-articles-title
    """
    # "[Some] _ Article's Title--" -> "[some] _ article's title--"
    s = s.lower()

    # "[some] _ article's title--" -> "[some]___article's_title__"
    for c in [' ', '-', '.', '/']:
        s = s.replace(c, '_')

    # "[some]___article's_title__" -> "some___articles_title__"
    s = re.sub(r'\W', '', s)

    # "some___articles_title__" -> "some   articles title  "
    s = s.replace('_', ' ')

    # "some   articles title  " -> "some articles title "
    s = re.sub(r'\s+', ' ', s)

    # "some articles title " -> "some articles title"
    s = s.strip()

    # "some articles title" -> "some-articles-title"
    s = s.replace(' ', '-')

    return s


def pprint_list(l):
    # Join list items into a comma-separated string,
    # without the trailing separator the old loop left behind.
    return ", ".join(str(i) for i in l)


@cli.command()
@click.argument('uri')
@click.option('--save', is_flag=True)
def scrape(uri, save):
    pack = snarf(uri)
    if save:
        svsname = slugify(pack[0]) + '.txt'
        fp = codecs.open(svsname, "w", 'utf-8')
        fp.write(pack[1])
        fp.close()
    else:
        click.echo_via_pager(pack[1])


def snarf(uri):
    # Fetch and parse the article, then build a plain-text
    # header (title, authors, date, source URI) above the body.
    target = newspaper.Article(uri)
    target.download()
    target.parse()
    header = target.title + "\n" + "By " + pprint_list(target.authors)
    header = header + "\n\n"
    header = header + str(target.publish_date) + "\nFrom: " + uri
    header = header + "\n\n"
    fp = StringIO()
    fp.write(header)
    sep = '-' * 32 + "\n\n"
    fp.write(sep)
    fp.write(target.text)
    return (target.title, fp.getvalue())


if __name__ == '__main__':
    cli()
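
# A minimal usage sketch (assumes the `newspaper3k` and `click` packages
# are installed; the example URL is hypothetical):
#
#   $ pip install newspaper3k click
#   $ ./web2text.py scrape https://example.com/some-article
#   $ ./web2text.py scrape https://example.com/some-article --save
#
# Without --save, the extracted text is shown in a pager; with --save,
# it is written to <slugified-title>.txt in the current directory.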