Add crawler api code

This commit is contained in:
Matt Arnold 2020-12-31 15:57:08 -05:00
parent 43ce9c5e1d
commit 5a4425b644
1 changed files with 127 additions and 0 deletions

snarfbot/crawlerapi.py Normal file

@@ -0,0 +1,127 @@
import logging
import threading
import time
import tldextract
import os, os.path
import codecs
from snarfbot.snarf3k import slugify, snarf
from queue import Queue
from snarfbot.linkview import linkview
"""
Module contains the bulk of the crawler code this is done with two types of thread
One that performs the actual crawling through links and one that extracts and saves data
to disk. These are implemented in crawl, and extract functions
"""
class StateBox:
"""
Statebox is a thread-safe (i hope ), data structure for communication between
the crawler and the extractor threads. This holds both shared metadata, and the set of
sites which have already been visited, and parsed. So that infinate crawls queues are avoided.
Note that this is most likely a bad design; and proper typed message queues would
be the computer sciencey way of handling this. So this api might want to change
or be deleted in the future. If this becomes a thing beyond saving all the fanfiction.
"""
    def __init__(self, origin, initial_list=None, sameorigin=True):
        self.starturl = tldextract.extract(origin)
        self.origin = origin
        # sameorigin doubles as a "do not recurse beyond the origin" flag.
        self.norecursive = sameorigin
        self.lock = threading.Lock()
        self.seen = set(initial_list or [])

    def add(self, uri):
        with self.lock:
            self.seen.add(uri)

    def delete(self, uri):
        """
        Remove a URI from the seen set if it is present.
        """
        with self.lock:
            if uri in self.seen:
                self.seen.remove(uri)

    def seenthis(self, uri):
        """
        Return True if this URI has already been visited and parsed.
        """
        return uri in self.seen

    def okcrawl(self, uri):
        """
        Return True if this URI may be crawled, i.e. crawling is not restricted
        to the origin, or the URI shares the origin's registered domain.
        """
        if not self.norecursive:
            return True
        ext = tldextract.extract(uri)
        return ext.registered_domain == self.starturl.registered_domain
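

# Sentinel object put on the work queue to tell the extractor that the
# crawler has finished queueing links.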
_end = object()


def crawler(q, sb):
    """
    Collect links starting from the StateBox origin and put them on the queue.
    """
    links = linkview(sb.origin)
    print("Nlinks stage 1: " + str(len(links)))
    for i in links:
        q.put(i)
    if sb.norecursive:
        q.put(_end)
    else:
        ## FIXME: Replace with proper recursive algorithm when
        ## feature complete
        for i in links:
            print(str(q.qsize()))
            nthdegree = linkview(i)
            for x in nthdegree:
                q.put(x)
        q.put(_end)  # extractor should not need this but we will do it anyway.


def extractor(q, sb):
    """
    Pull URIs off the queue, scrape each one, and save the extracted text to a
    file named after the page title inside a directory named after the domain.
    """
    while not q.empty():
        basedir = os.getcwd()
        task = q.get()
        if task is _end:
            q.task_done()
            break
        if sb.seenthis(task) or not sb.okcrawl(task):
            q.task_done()
            continue
        etd = tldextract.extract(task)
        dumppath = os.path.join(basedir, etd.registered_domain)
        if not os.path.isdir(dumppath):
            os.mkdir(dumppath)
        os.chdir(dumppath)
        pack = snarf(task)
        svsname = slugify(pack[0]) + '.txt'
        with codecs.open(svsname, "w", 'utf-8') as fp:
            fp.write(pack[1])
        os.chdir(basedir)
        sb.add(task)
        q.task_done()
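

if __name__ == "__main__":
    # Minimal sketch of how the two thread types described in the module
    # docstring might be wired together; the origin URL below is a placeholder
    # and not part of the original commit. Because extractor() exits as soon as
    # the queue is empty, the crawler thread is joined before the extractor
    # starts, so the queue is fully populated first.
    origin = "https://example.com"
    q = Queue()
    sb = StateBox(origin, sameorigin=True)
    producer = threading.Thread(target=crawler, args=(q, sb))
    consumer = threading.Thread(target=extractor, args=(q, sb))
    producer.start()
    producer.join()
    consumer.start()
    consumer.join()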