import logging
import threading
import time
import os
import os.path
import codecs

import tldextract

from queue import Queue
from snarfbot.snarf3k import slugify, snarf
from snarfbot.linkview import linkview

"""
|
|
Module contains the bulk of the crawler code this is done with two types of thread
|
|
One that performs the actual crawling through links and one that extracts and saves data
|
|
to disk. These are implemented in crawl, and extract functions
|
|
"""
|
|
|
|
|
|
class StateBox:
    """
    StateBox is a thread-safe data structure for communication between the
    crawler and the extractor threads. It holds shared metadata as well as the
    sets of URIs that have already been queued or visited, so that the crawl
    queue cannot grow without bound.

    Note that this is most likely a bad design; properly typed message queues
    would be the more computer-sciencey way of handling this, so this API may
    change or be deleted in the future if this becomes a thing beyond saving
    all the fanfiction. See the illustrative sketch right after this class for
    how the crawler and extractor are expected to use it.
    """

    def __init__(self, origin, inital_seen=(), sameorigin=True, levels=1):
        # Pre-parse the origin so okcrawl() can compare registered domains.
        self.starturl = tldextract.extract(origin)
        self.origin = origin
        # When True, only URIs on the same registered domain as the origin
        # are allowed to be crawled.
        self.norecursive = sameorigin
        self.lock = threading.Lock()
        self.seen = set(inital_seen)  # URIs already extracted and saved
        self.mark = set()             # URIs already queued for extraction
        self.maxlevel = levels

    def seen_add(self, uri):
        """Record that a URI has been fully extracted and saved."""
        with self.lock:
            self.seen.add(uri)

    def seen_delete(self, uri):
        """Forget a URI so that it can be extracted again."""
        with self.lock:
            self.seen.discard(uri)

    def seenthis(self, uri):
        """Return True if the URI has already been extracted."""
        return uri in self.seen

    def mark_add(self, uri):
        """Record that a URI has been queued for extraction."""
        with self.lock:
            self.mark.add(uri)

    def mark_delete(self, uri):
        """Forget that a URI was queued, so it may be queued again."""
        with self.lock:
            self.mark.discard(uri)

    def marked(self, uri):
        """Return True if the URI has already been queued for extraction."""
        return uri in self.mark

    def okcrawl(self, uri):
        """Return True if the URI may be crawled under the same-origin
        setting this StateBox was created with."""
        if not self.norecursive:
            return True
        # Same-origin mode: only URIs on the starting registered domain pass.
        ext = tldextract.extract(uri)
        return ext.registered_domain == self.starturl.registered_domain

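
# Illustrative sketch only (not called anywhere in this module): how the
# crawler and extractor are expected to use the mark/seen bookkeeping for a
# single URI.  The URLs are placeholders.
def _statebox_example():
    sb = StateBox("https://example.com/stories", sameorigin=True, levels=1)
    uri = "https://example.com/stories/chapter-1"
    if sb.okcrawl(uri) and not sb.marked(uri):
        sb.mark_add(uri)    # crawler side: queued for extraction
    if not sb.seenthis(uri):
        sb.seen_add(uri)    # extractor side: content has been saved
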
# Sentinel object the crawler puts on the queue to tell the extractor that
# crawling is finished.
_end = object()


def crawler(q, sb):
    """
    Crawler thread: walk links starting from sb.origin and put every URI that
    should be saved onto the queue for the extractor.
    """
    level = 0
    links = linkview(sb.origin)
    print("Nlinks stage 1: " + str(len(links)))
    for i in links:
        q.put(i)
        sb.mark_add(i)
    # FIXME: Replace with proper recursive algorithm when
    # feature complete
    level += 1
    for i in links:
        print(str(q.qsize()))
        if not level < sb.maxlevel:
            break
        nthdegree = linkview(i)
        for x in nthdegree:
            if sb.okcrawl(x) and not sb.marked(x):
                q.put(x)
                sb.mark_add(x)
        level += 1
    q.put(_end)  # extractor should not need this but we will do it anyway.


def extractor(q, sb):
    """
    Extractor thread: pull URIs off the queue, scrape each page with snarf(),
    and save the text under a directory named after the page's registered
    domain.
    """
    while not q.empty():
        basedir = os.getcwd()
        task = q.get()
        if task is _end:
            os.chdir(basedir)
            break
        else:
            if sb.seenthis(task) or not sb.okcrawl(task):
                q.task_done()
                continue
            # One output directory per registered domain.
            etd = tldextract.extract(task)
            dumppath = os.path.join(basedir, etd.registered_domain)
            if os.path.exists(dumppath) and os.path.isdir(dumppath):
                os.chdir(dumppath)
            else:
                os.mkdir(dumppath)
                os.chdir(dumppath)

            # snarf() returns (title, text); the slugified title is the filename.
            pack = snarf(task)
            svsname = slugify(pack[0]) + '.txt'
            fp = codecs.open(svsname, "w", 'utf-8')
            fp.write(pack[1])
            fp.close()
            os.chdir(basedir)
            sb.seen_add(task)
            q.task_done()
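

# Illustrative usage sketch (assumed entry point, not used by the rest of the
# package): wire a crawler thread and the extractor together through a Queue
# and a shared StateBox.  The start URL is a placeholder; the crawler is
# joined before extraction because extractor() stops as soon as the queue is
# empty.
if __name__ == "__main__":
    start = "https://example.com/stories"  # placeholder origin URL
    q = Queue()
    sb = StateBox(start, sameorigin=True, levels=1)

    crawl_thread = threading.Thread(target=crawler, args=(q, sb))
    crawl_thread.start()
    crawl_thread.join()  # wait until every link has been queued

    extractor(q, sb)  # drain the queue and save pages to disk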