Add crawler api code
parent 43ce9c5e1d
commit 5a4425b644
@@ -0,0 +1,127 @@
import logging
import threading
import time
import tldextract
import os
import os.path
import codecs

from snarfbot.snarf3k import slugify, snarf
from queue import Queue
from snarfbot.linkview import linkview

"""
|
||||
Module contains the bulk of the crawler code this is done with two types of thread
|
||||
One that performs the actual crawling through links and one that extracts and saves data
|
||||
to disk. These are implemented in crawl, and extract functions
|
||||
"""
|
||||
|
||||
class StateBox:
    """
    StateBox is a thread-safe (I hope) data structure for communication
    between the crawler and the extractor threads. It holds shared metadata
    as well as the set of sites that have already been visited and parsed,
    so that infinite crawl queues are avoided.

    Note that this is most likely a bad design; properly typed message queues
    would be the computer-science way of handling this, so this API may
    change or be deleted in the future, if this ever becomes a thing beyond
    saving all the fanfiction.
    """

    def __init__(self, origin, initial_list=None, sameorigin=True):
        # A default of None avoids the mutable-default-argument trap, where
        # a single list would be shared by every instance.
        self.starturl = tldextract.extract(origin)
        self.origin = origin
        self.norecursive = sameorigin
        self.lock = threading.Lock()
        self.seen = set(initial_list or [])

    def add(self, uri):
        """Record uri as visited."""
        with self.lock:
            self.seen.add(uri)

    def delete(self, uri):
        """Forget uri so that it can be visited again."""
        with self.lock:
            if uri in self.seen:
                self.seen.remove(uri)

    def seenthis(self, uri):
        """Return True if uri has already been visited."""
        return uri in self.seen

    def okcrawl(self, uri):
        """Return True if uri falls within the allowed crawl scope."""
        if not self.norecursive:
            return True
        # Same-origin crawls only follow links whose registered domain
        # matches that of the starting URL.
        ext = tldextract.extract(uri)
        return ext.registered_domain == self.starturl.registered_domain
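

# A minimal usage sketch for StateBox; the URLs are hypothetical placeholders,
# and tldextract treats "www.example.com" and "example.com" as the same
# registered domain:
#
#     sb = StateBox("https://www.example.com/stories", sameorigin=True)
#     sb.okcrawl("https://example.com/stories/42")    # True: same domain
#     sb.okcrawl("https://other-site.net/stories/1")  # False: off-origin
#     sb.add("https://example.com/stories/42")
#     sb.seenthis("https://example.com/stories/42")   # True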


_end = object()  # Sentinel placed on the queue to signal the end of the crawl.


def crawler(q, sb):
    """Producer thread: discover links starting from sb.origin and queue them."""
    links = linkview(sb.origin)
    print("Nlinks stage 1: " + str(len(links)))
    for i in links:
        q.put(i)
    if sb.norecursive:
        q.put(_end)
    else:
        ## FIXME: Replace with proper recursive algorithm when feature
        ## complete; for now the crawl only goes two levels deep.
        for i in links:
            print(str(q.qsize()))
            nthdegree = linkview(i)
            for x in nthdegree:
                q.put(x)
        q.put(_end)  # The extractor should not need this, but we will do it anyway.


def extractor(q, sb):
    """Consumer thread: scrape each queued page and save its text to disk."""
    # Remember the starting directory once, before the loop.
    basedir = os.getcwd()
    while not q.empty():
        task = q.get()
        if task is _end:
            os.chdir(basedir)
            break
        if sb.seenthis(task) or not sb.okcrawl(task):
            q.task_done()
            continue

        # Save each page in a directory named after its registered domain.
        etd = tldextract.extract(task)
        dumppath = os.path.join(basedir, etd.registered_domain)
        if not os.path.isdir(dumppath):
            os.mkdir(dumppath)
        os.chdir(dumppath)

        pack = snarf(task)
        svsname = slugify(pack[0]) + '.txt'
        fp = codecs.open(svsname, "w", 'utf-8')
        fp.write(pack[1])
        fp.close()
        os.chdir(basedir)
        sb.add(task)
        q.task_done()
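

# A minimal sketch of how the two threads are intended to be wired together.
# The start URL is a hypothetical placeholder and error handling is omitted;
# the short sleep gives the crawler a head start so that the extractor's
# q.empty() check does not fire before anything has been queued.
if __name__ == "__main__":
    q = Queue()
    sb = StateBox("https://www.example.com/stories")
    crawl_thread = threading.Thread(target=crawler, args=(q, sb))
    extract_thread = threading.Thread(target=extractor, args=(q, sb))
    crawl_thread.start()
    time.sleep(1)
    extract_thread.start()
    crawl_thread.join()
    extract_thread.join()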