Refactor in preparation for building the crawler

Matt Arnold 2020-12-30 21:46:12 -05:00
parent 352709b82c
commit 43ce9c5e1d
5 changed files with 233 additions and 65 deletions

.gitignore vendored Normal file (146 additions)

@@ -0,0 +1,146 @@
*.bak
*.save
.vscode
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# static files generated from Django application using `collectstatic`
media
static

snarfbot/__init__.py Normal file (7 additions)

@@ -0,0 +1,7 @@
"""
snarfbot package the idea with this refactor is hide the underlaying
details of the scraping engine used from api consumers. So that we can replace/
have different ones it if needed
it also lets me work on the crawler without breaking what we've got already which works
surprisingly well for a sub ~100 line program
"""

snarfbot/linkview.py Normal file (13 additions)

@@ -0,0 +1,13 @@
from bs4 import BeautifulSoup
import urllib.request as urllib2
import re


def linkview(url):
    # Fetch the page and return every absolute http/https link found on it.
    html_page = urllib2.urlopen(url)
    soup = BeautifulSoup(html_page, features="lxml")
    links = []
    for link in soup.findAll('a', attrs={'href': re.compile("^http[s]?://")}):
        links.append(link.get('href'))
    return links
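Since this commit is groundwork for the crawler, here is a rough sketch (not part of the commit) of how linkview could seed one; tiny_crawl, the start URL, and the page cap are illustrative assumptions:

# Hypothetical sketch only: the crawler itself is not in this commit.
from collections import deque
from snarfbot.linkview import linkview

def tiny_crawl(start, max_pages=10):
    # Breadth-first walk over absolute links, capped at max_pages pages.
    seen, queue = set(), deque([start])
    while queue and len(seen) < max_pages:
        url = queue.popleft()
        if url in seen:
            continue
        seen.add(url)
        try:
            queue.extend(linkview(url))   # enqueue every absolute link on the page
        except Exception:
            continue                      # skip pages that fail to download or parse
    return seen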

snarfbot/snarf3k.py Normal file (66 additions)

@@ -0,0 +1,66 @@
import newspaper
from io import StringIO
import re


def slugify(s):
    """
    Simplifies ugly strings into something URL-friendly.

    >>> print(slugify("[Some] _ Article's Title--"))
    some-articles-title
    """
    # "[Some] _ Article's Title--"
    # "[some] _ article's title--"
    s = s.lower()

    # "[some] _ article's title--"
    # "[some]___article's_title__"
    for c in [' ', '-', '.', '/']:
        s = s.replace(c, '_')

    # "[some]___article's_title__"
    # "some___articles_title__"
    s = re.sub(r'\W', '', s)

    # "some___articles_title__"
    # "some   articles title  "
    s = s.replace('_', ' ')

    # "some   articles title  "
    # "some articles title "
    s = re.sub(r'\s+', ' ', s)

    # "some articles title "
    # "some articles title"
    s = s.strip()

    # "some articles title"
    # "some-articles-title"
    s = s.replace(' ', '-')

    return s


def pprint_list(l):
    # Join list items into a single comma-separated string (used for author lists).
    s = ""
    for i in l:
        s = s + str(i) + ", "
    return s


def snarf(uri):
    # Download and parse the article at uri with newspaper, and return a
    # (title, formatted_text) tuple that callers can page through or save.
    target = newspaper.Article(uri)
    target.download()
    target.parse()
    header = target.title + "\n" + "By " + pprint_list(target.authors)
    header = header + "\n\n"
    header = header + str(target.publish_date) + "\nFrom: " + uri
    header = header + "\n\n"
    fp = StringIO()
    fp.write(header)
    sep = '-' * 32 + "\n\n"
    fp.write(sep)
    fp.write(target.text)
    return (target.title, fp.getvalue())
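A rough usage sketch showing how snarf and slugify fit together, presumably similar to what the CLI's save option does; the URL and the ".txt" extension are assumptions, not part of this commit:

# Sketch: scrape one article and save it under a slugified filename.
# The URL and ".txt" extension are illustrative assumptions.
from snarfbot.snarf3k import snarf, slugify

title, text = snarf("https://example.com/some-article")
with open(slugify(title) + ".txt", "w", encoding="utf-8") as outfile:
    outfile.write(text)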


@@ -4,63 +4,14 @@
# From a website.
# As usual the WTFPL applies
import sys
import codecs
import newspaper
import click
import re
import os
from io import StringIO
from snarfbot.snarf3k import snarf, slugify
@click.group()
def cli():
    pass
def slugify(s):
    """
    Simplifies ugly strings into something URL-friendly.
    >>> print slugify("[Some] _ Article's Title--")
    some-articles-title
    """
    # "[Some] _ Article's Title--"
    # "[some] _ article's title--"
    s = s.lower()
    # "[some] _ article's_title--"
    # "[some]___article's_title__"
    for c in [' ', '-', '.', '/']:
        s = s.replace(c, '_')
    # "[some]___article's_title__"
    # "some___articles_title__"
    s = re.sub('\W', '', s)
    # "some___articles_title__"
    # "some   articles title  "
    s = s.replace('_', ' ')
    # "some   articles title  "
    # "some articles title "
    s = re.sub('\s+', ' ', s)
    # "some articles title "
    # "some articles title"
    s = s.strip()
    # "some articles title"
    # "some-articles-title"
    s = s.replace(' ', '-')
    return s
def pprint_list(l):
    s = ""
    for i in l:
        s = s + str(i) + ", "
    return s
@cli.command()
@click.argument('uri')
@@ -76,20 +27,5 @@ def scrape(uri, save):
    else:
        click.echo_via_pager(pack[1])
def snarf(uri):
    target = newspaper.Article(uri)
    target.download()
    target.parse()
    header = target.title + "\n" + "By " + pprint_list(target.authors)
    header = header + "\n\n"
    header = header + str(target.publish_date) + "\nFrom: " + uri
    header = header + "\n\n"
    fp = StringIO()
    fp.write(header)
    sep = '-' * 32 + "\n\n"
    fp.write(sep)
    fp.write(target.text)
    return (target.title, fp.getvalue())
if __name__ == '__main__':
    cli()