Initial Commit

Matt Arnold 2020-12-29 00:06:37 -05:00
commit 352709b82c
3 changed files with 126 additions and 0 deletions

README.md Normal file

@@ -0,0 +1,4 @@
# Snarfbot
This will eventually be a web crawler that saves websites as plaintext files.
For now, please enjoy a few CLI tools written as a proof of concept. Comments, compliments, complaints, and pull requests are all accepted.
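For example, `./web2text.py scrape <URL>` pages an article's text to your terminal; add `--save` to write it to a slugified `.txt` file instead.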

requirements.txt Normal file

@@ -0,0 +1,27 @@
beautifulsoup4==4.9.3
certifi==2020.12.5
chardet==4.0.0
click==7.1.2
cssselect==1.1.0
feedfinder2==0.0.4
feedparser==6.0.2
filelock==3.0.12
idna==2.10
jieba3k==0.35.1
joblib==1.0.0
lxml==4.6.2
newspaper3k==0.2.8
nltk==3.5
Pillow==8.0.1
python-dateutil==2.8.1
PyYAML==5.3.1
regex==2020.11.13
requests==2.25.1
requests-file==1.5.1
sgmllib3k==1.0.0
six==1.15.0
soupsieve==2.1
tinysegmenter==0.3
tldextract==3.1.0
tqdm==4.55.0
urllib3==1.26.2

web2text.py Executable file

@@ -0,0 +1,95 @@
#!/usr/bin/env python3
# web2text: a dead-simple script to get the important stuff
# from a website.
# As usual, the WTFPL applies.
import re
from io import StringIO

import click
import newspaper


@click.group()
def cli():
    pass


def slugify(s):
    """
    Simplifies ugly strings into something URL-friendly.

    >>> print(slugify("[Some] _ Article's Title--"))
    some-articles-title
    """
    # "[Some] _ Article's Title--" -> "[some] _ article's title--"
    s = s.lower()
    # "[some] _ article's title--" -> "[some]___article's_title__"
    for c in [' ', '-', '.', '/']:
        s = s.replace(c, '_')
    # "[some]___article's_title__" -> "some___articles_title__"
    s = re.sub(r'\W', '', s)
    # "some___articles_title__" -> "some   articles title  "
    s = s.replace('_', ' ')
    # "some   articles title  " -> "some articles title "
    s = re.sub(r'\s+', ' ', s)
    # "some articles title " -> "some articles title"
    s = s.strip()
    # "some articles title" -> "some-articles-title"
    s = s.replace(' ', '-')
    return s


def pprint_list(l):
    """Join list items into a comma-separated string."""
    return ", ".join(str(i) for i in l)


@cli.command()
@click.argument('uri')
@click.option('--save', is_flag=True)
def scrape(uri, save):
    """Fetch an article; page it to the terminal, or write it to a file with --save."""
    pack = snarf(uri)
    if save:
        svsname = slugify(pack[0]) + '.txt'
        with open(svsname, 'w', encoding='utf-8') as fp:
            fp.write(pack[1])
    else:
        click.echo_via_pager(pack[1])


def snarf(uri):
    """Download and parse an article, returning (title, formatted text)."""
    target = newspaper.Article(uri)
    target.download()
    target.parse()
    header = target.title + "\n" + "By " + pprint_list(target.authors)
    header = header + "\n\n"
    header = header + str(target.publish_date) + "\nFrom: " + uri
    header = header + "\n\n"
    fp = StringIO()
    fp.write(header)
    sep = '-' * 32 + "\n\n"
    fp.write(sep)
    fp.write(target.text)
    return (target.title, fp.getvalue())


if __name__ == '__main__':
    cli()
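
For reference, a minimal sketch of calling the snarf helper directly from Python rather than through the click CLI; the URL is a placeholder, and this assumes newspaper3k can download and parse the page:

# Hypothetical usage sketch; the URL below is a placeholder.
from web2text import snarf, slugify

title, body = snarf("https://example.com/some-article")
with open(slugify(title) + ".txt", "w", encoding="utf-8") as out:
    out.write(body)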