Initial Commit
commit 352709b82c
README.md
@@ -0,0 +1,4 @@
# Snarfbot

This will eventually be a web crawler that saves websites as plaintext files.

For now, please enjoy a few CLI tools, written as a proof of concept. Comments, compliments, complaints, and pull requests are accepted.
requirements.txt
@@ -0,0 +1,27 @@
beautifulsoup4==4.9.3
certifi==2020.12.5
chardet==4.0.0
click==7.1.2
cssselect==1.1.0
feedfinder2==0.0.4
feedparser==6.0.2
filelock==3.0.12
idna==2.10
jieba3k==0.35.1
joblib==1.0.0
lxml==4.6.2
newspaper3k==0.2.8
nltk==3.5
Pillow==8.0.1
python-dateutil==2.8.1
PyYAML==5.3.1
regex==2020.11.13
requests==2.25.1
requests-file==1.5.1
sgmllib3k==1.0.0
six==1.15.0
soupsieve==2.1
tinysegmenter==0.3
tldextract==3.1.0
tqdm==4.55.0
urllib3==1.26.2
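
These pins install in the usual way, e.g. `pip install -r requirements.txt` inside a fresh virtual environment; aside from click and newspaper3k, most of them appear to be transitive dependencies rather than direct imports of the script below.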
web2text.py
@@ -0,0 +1,95 @@
#!/usr/bin/env python3

# web2text: a dead simple script to get the important stuff
# from a website.

# As usual, the WTFPL applies.
import codecs
import re
from io import StringIO

import click
import newspaper

@click.group()
def cli():
    pass
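
# click.group() turns cli() into a command dispatcher: each function decorated
# with @cli.command() below is registered as a subcommand, so the script is
# invoked as `web2text.py scrape <uri> [--save]`.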

def slugify(s):
    """
    Simplifies ugly strings into something URL-friendly.

    >>> print(slugify("[Some] _ Article's Title--"))
    some-articles-title

    """

    # "[Some] _ Article's Title--"
    # "[some] _ article's title--"
    s = s.lower()

    # "[some] _ article's title--"
    # "[some]___article's_title__"
    for c in [' ', '-', '.', '/']:
        s = s.replace(c, '_')

    # "[some]___article's_title__"
    # "some___articles_title__"
    s = re.sub(r'\W', '', s)

    # "some___articles_title__"
    # "some   articles title  "
    s = s.replace('_', ' ')

    # "some   articles title  "
    # "some articles title "
    s = re.sub(r'\s+', ' ', s)

    # "some articles title "
    # "some articles title"
    s = s.strip()

    # "some articles title"
    # "some-articles-title"
    s = s.replace(' ', '-')

    return s
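
# slugify flattens arbitrary punctuation the same way; for example,
# slugify("Hello, World! / Part 2") returns 'hello-world-part-2'.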

def pprint_list(l):
    """Render a list as a comma-separated string (no trailing separator)."""
    return ", ".join(str(i) for i in l)

@cli.command()
@click.argument('uri')
@click.option('--save', is_flag=True)
def scrape(uri, save):
    # Fetch the article; --save writes it to a slugified .txt file,
    # otherwise it is paged to the terminal.
    pack = snarf(uri)
    if save:
        svsname = slugify(pack[0]) + '.txt'
        with codecs.open(svsname, "w", "utf-8") as fp:
            fp.write(pack[1])
    else:
        click.echo_via_pager(pack[1])

def snarf(uri):
    # Download and parse the article, then assemble a plaintext document:
    # a title/byline/date header, a separator rule, and the article body.
    target = newspaper.Article(uri)
    target.download()
    target.parse()
    header = target.title + "\n" + "By " + pprint_list(target.authors)
    header = header + "\n\n"
    header = header + str(target.publish_date) + "\nFrom: " + uri
    header = header + "\n\n"
    fp = StringIO()
    fp.write(header)
    sep = '-' * 32 + "\n\n"
    fp.write(sep)
    fp.write(target.text)
    return (target.title, fp.getvalue())

if __name__ == '__main__':
    cli()
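
For context, a minimal usage sketch, assuming the script above is saved as web2text.py (a name inferred from its header comment; the URL is illustrative):

    # From a shell, via the Click entry point:
    #   python3 web2text.py scrape https://example.com/article          # page through the text
    #   python3 web2text.py scrape https://example.com/article --save   # write some-articles-title.txt

    # Or from Python, reusing the same pieces:
    from web2text import snarf, slugify

    title, text = snarf("https://example.com/article")  # -> (title, formatted plaintext)
    with open(slugify(title) + ".txt", "w", encoding="utf-8") as fp:
        fp.write(text)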