96 lines
2.0 KiB
Python
96 lines
2.0 KiB
Python
|
#!/usr/bin/env python3
|
||
|
|
||
|
# web2text: a dead simple script to get the important stuff
|
||
|
# from a website.
|
||
|
|
||
|
# As usual the WTFPL applies
|
||
|
import sys
|
||
|
import codecs
|
||
|
import newspaper
|
||
|
import click
|
||
|
import re
|
||
|
import os
|
||
|
from io import StringIO
|
||
|
|
||
|
# Root command group: it carries no logic of its own and only serves as
# the attachment point for the sub-commands registered via @cli.command().
@click.group()
def cli():
    return None
|
||
|
|
||
|
def slugify(s):
    """
    Simplify an ugly string into something URL- and filename-friendly.

    The string is lower-cased, the separators space, '-', '.', '/' and '_'
    are treated as word boundaries, every other non-word character is
    dropped, and the remaining words are joined with single hyphens.

    >>> slugify("[Some] _ Article's Title--")
    'some-articles-title'

    :param s: the text to simplify.
    :return: the slug; an empty string when no word characters remain.
    """
    # "[Some] _ Article's Title--" -> "[some] _ article's title--"
    s = s.lower()

    # Protect the separators from the \W purge below by turning them into
    # underscores, which count as word characters.
    # "[some] _ article's title--" -> "[some]___article's_title__"
    for sep in (' ', '-', '.', '/'):
        s = s.replace(sep, '_')

    # Drop everything that is not a letter, digit or underscore.
    # "[some]___article's_title__" -> "some___articles_title__"
    s = re.sub(r'\W', '', s)

    # Underscores are now the only word boundaries left; splitting on them
    # (via whitespace) discards empty runs, so joining yields exactly one
    # hyphen between words and none at either end.
    # "some___articles_title__" -> "some-articles-title"
    return '-'.join(s.replace('_', ' ').split())
|
||
|
|
||
|
def pprint_list(l):
    """
    Render the items of an iterable as one comma-separated string.

    Each item is converted with str(); items are separated by ", " with no
    trailing separator.

    :param l: iterable of items to render.
    :return: the joined string; empty when the iterable is empty.
    """
    return ", ".join(str(item) for item in l)
|
||
|
|
||
|
@cli.command()
@click.argument('uri')
@click.option('--save', is_flag=True)
def scrape(uri, save):
    """Fetch the article at URI and page it, or write it to a file with --save."""
    title, text = snarf(uri)

    if not save:
        click.echo_via_pager(text)
        return

    # A title with no usable characters slugifies to "", which would
    # otherwise produce the hidden file ".txt".
    filename = (slugify(title) or 'untitled') + '.txt'

    # The context manager closes the file even if the write fails.
    with codecs.open(filename, "w", 'utf-8') as fp:
        fp.write(text)
|
||
|
|
||
|
def snarf(uri):
    """
    Download and parse the article at ``uri`` and format it as plain text.

    The text consists of a header (title, authors, publish date, source
    URI), a 32-dash separator line, and the article text.

    :param uri: address of the article to fetch.
    :return: a ``(title, formatted_text)`` tuple.
    """
    article = newspaper.Article(uri)
    article.download()
    article.parse()

    pieces = [
        # Header: title and byline.
        article.title, "\n", "By ", pprint_list(article.authors),
        "\n\n",
        # Header: publish date and where the article came from.
        str(article.publish_date), "\nFrom: ", uri,
        "\n\n",
        # Separator between header and body.
        '-' * 32, "\n\n",
        article.text,
    ]
    return (article.title, "".join(pieces))
|
||
|
|
||
|
# Run the command-line interface only when executed as a script,
# not when imported as a module.
if __name__ == "__main__":
    cli()
|