From 5a4425b6447d99693717be8c01418f58eb5e1457 Mon Sep 17 00:00:00 2001
From: Matt Arnold
Date: Thu, 31 Dec 2020 15:57:08 -0500
Subject: [PATCH] Add crawler API code

---
 snarfbot/crawlerapi.py | 127 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 127 insertions(+)
 create mode 100644 snarfbot/crawlerapi.py

diff --git a/snarfbot/crawlerapi.py b/snarfbot/crawlerapi.py
new file mode 100644
index 0000000..439549c
--- /dev/null
+++ b/snarfbot/crawlerapi.py
@@ -0,0 +1,127 @@
+"""
+This module contains the bulk of the crawler code. The work is split between
+two kinds of thread: one that crawls through links and queues them, and one
+that extracts page data and saves it to disk. These are implemented in the
+crawler() and extractor() functions respectively.
+"""
+
+import logging
+import threading
+import time
+import tldextract
+import os, os.path
+import codecs
+from snarfbot.snarf3k import slugify, snarf
+from queue import Queue
+from snarfbot.linkview import linkview
+
+
+class StateBox:
+    """
+    StateBox is a thread-safe data structure for communication between the
+    crawler and extractor threads. It holds shared metadata plus the set of
+    URIs that have already been visited and parsed, so that infinite crawl
+    queues are avoided. Note that this is probably not the best design;
+    typed message queues would be the cleaner way of handling this, so this
+    API may change or be removed in the future.
+    """
+
+    def __init__(self, origin, initial_list=None, sameorigin=True):
+        self.starturl = tldextract.extract(origin)
+        self.origin = origin
+        # When True, crawl only one level deep and stay on the origin's
+        # registered domain.
+        self.norecursive = sameorigin
+        self.lock = threading.Lock()
+        self.seen = set(initial_list or [])
+
+    def add(self, uri):
+        """Mark a URI as seen."""
+        with self.lock:
+            self.seen.add(uri)
+
+    def delete(self, uri):
+        """Remove a URI from the seen set if it is present."""
+        with self.lock:
+            self.seen.discard(uri)
+
+    def seenthis(self, uri):
+        """Return True if the URI has already been processed."""
+        return uri in self.seen
+
+    def okcrawl(self, uri):
+        """Return True if the URI is allowed under the same-origin policy."""
+        if not self.norecursive:
+            return True
+        ext = tldextract.extract(uri)
+        return ext.registered_domain == self.starturl.registered_domain
+
+
+# Sentinel object pushed onto the queue to tell the extractor to stop.
+_end = object()
+
+
+def crawler(q, sb):
+    links = linkview(sb.origin)
+    print("Nlinks stage 1: " + str(len(links)))
+    for i in links:
+        q.put(i)
+    if sb.norecursive:
+        q.put(_end)
+    else:
+        ## FIXME: Replace with proper recursive algorithm when
+        ## feature complete
+        for i in links:
+            print(str(q.qsize()))
+            nthdegree = linkview(i)
+            for x in nthdegree:
+                q.put(x)
+        q.put(_end)  # extractor should not need this but we will do it anyway.
+
+
+def extractor(q, sb):
+    basedir = os.getcwd()
+    while True:
+        # Block until the crawler produces a task; the _end sentinel marks
+        # the end of the crawl.
+        task = q.get()
+        if task is _end:
+            os.chdir(basedir)
+            q.task_done()
+            break
+
+        if sb.seenthis(task) or not sb.okcrawl(task):
+            q.task_done()
+            continue
+
+        # Save each page under a directory named after its registered domain.
+        etd = tldextract.extract(task)
+        dumppath = os.path.join(basedir, etd.registered_domain)
+        os.makedirs(dumppath, exist_ok=True)
+        os.chdir(dumppath)
+
+        pack = snarf(task)
+        svsname = slugify(pack[0]) + '.txt'
+        fp = codecs.open(svsname, "w", 'utf-8')
+        fp.write(pack[1])
+        fp.close()
+        os.chdir(basedir)
+        sb.add(task)
+        q.task_done()
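
The patch only defines the two thread bodies; wiring them together is left to the caller. Below is a minimal usage sketch, not part of the patch, assuming snarfbot.crawlerapi is importable as added above. The run_crawl() helper, the thread names, and the start URL are illustrative assumptions.

    # Usage sketch (assumed caller code, not part of the patch): connect
    # crawler() and extractor() with a shared Queue and StateBox.
    import threading
    from queue import Queue

    from snarfbot.crawlerapi import StateBox, crawler, extractor

    def run_crawl(start_url):
        q = Queue()
        sb = StateBox(start_url, sameorigin=True)  # stay on the start domain

        producer = threading.Thread(target=crawler, args=(q, sb), name="crawler")
        consumer = threading.Thread(target=extractor, args=(q, sb), name="extractor")

        producer.start()
        consumer.start()

        # crawler() enqueues the _end sentinel, so extractor() terminates on its own.
        producer.join()
        consumer.join()

    if __name__ == "__main__":
        run_crawl("https://example.com/")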
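
The StateBox docstring mentions that typed message queues would be a cleaner design. One possible shape for that, sketched here only to illustrate the remark; the CrawlMessage type and its "page"/"done" kinds are hypothetical and not part of the patch:

    # Sketch of the typed-message-queue alternative the docstring alludes to.
    from dataclasses import dataclass
    from queue import Queue

    @dataclass
    class CrawlMessage:
        kind: str       # "page" for a URI to snarf, "done" to stop
        uri: str = ""

    def typed_extractor(q: "Queue[CrawlMessage]"):
        while True:
            msg = q.get()
            if msg.kind == "done":
                q.task_done()
                break
            # ...snarf msg.uri and save it, as extractor() does in the patch...
            q.task_done()

This would replace the bare _end sentinel and the shared seen-set checks with explicit messages, at the cost of a slightly heavier API.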