# snarfbot/snarfbot/snarf3k.py
#
# 66 lines
# 1.4 KiB
# Python

import newspaper
from io import StringIO
import re
def slugify(s):
    """Simplify an arbitrary string into something URL-friendly.

    The input is lower-cased; spaces, hyphens, dots, slashes and
    underscores are treated as word separators; every other non-word
    character is dropped; the remaining words are joined with single
    hyphens.

    >>> print(slugify("[Some] _ Article's Title--"))
    some-articles-title

    Args:
        s: The string to convert.

    Returns:
        The slug as a string. It is empty when every character of ``s``
        is either a separator or gets stripped.
    """
    # "[Some] _ Article's Title--"  ->  "[some] _ article's title--"
    s = s.lower()

    # Map the separator characters to "_" so they survive the \W filter
    # below (underscore counts as a word character).
    # "[some] _ article's title--"  ->  "[some]___article's_title__"
    for c in (' ', '-', '.', '/'):
        s = s.replace(c, '_')

    # Drop everything that is not a word character (brackets, quotes, ...).
    # "[some]___article's_title__"  ->  "some___articles_title__"
    s = re.sub(r'\W', '', s)

    # "some___articles_title__"  ->  "some   articles title  "
    s = s.replace('_', ' ')

    # Collapse runs of separators into a single space.
    # "some   articles title  "  ->  "some articles title "
    s = re.sub(r'\s+', ' ', s)

    # "some articles title "  ->  "some articles title"
    s = s.strip()

    # "some articles title"  ->  "some-articles-title"
    return s.replace(' ', '-')
def pprint_list(l):
    """Render the items of *l* as a single comma-separated string.

    Each item is converted with ``str()`` and the results are joined with
    ``", "``. The separator appears only *between* items, so there is no
    dangling ``", "`` after the last one.

    Args:
        l: Any iterable of objects.

    Returns:
        The joined string; ``""`` when ``l`` is empty.
    """
    return ", ".join(str(i) for i in l)
def snarf(uri):
    """Download the article at *uri* and render it as plain text.

    The article is fetched and parsed with ``newspaper.Article``. The
    rendered text consists of a header (title, byline, publish date and
    source URI), a 32-character dashed separator line, and the article
    body.

    Args:
        uri: Address of the article, as a string.

    Returns:
        A ``(title, text)`` tuple: the parsed article title and the full
        rendered document.
    """
    target = newspaper.Article(uri)
    target.download()
    target.parse()

    byline = "By " + pprint_list(target.authors)
    # NOTE(review): publish_date goes through str(), so a missing date
    # would be rendered literally as "None" -- confirm whether newspaper
    # can leave it unset and whether that output is acceptable.
    header = (
        target.title + "\n" + byline
        + "\n\n"
        + str(target.publish_date) + "\nFrom: " + uri
        + "\n\n"
    )
    sep = '-' * 32 + "\n\n"

    return (target.title, header + sep + target.text)