Add crawler api code
parent 43ce9c5e1d
commit 5a4425b644
@@ -0,0 +1,127 @@
import logging
import threading
import time
import tldextract
import os
import os.path
import codecs

from snarfbot.snarf3k import slugify, snarf
from queue import Queue
from snarfbot.linkview import linkview

"""
|
||||
Module contains the bulk of the crawler code this is done with two types of thread
|
||||
One that performs the actual crawling through links and one that extracts and saves data
|
||||
to disk. These are implemented in crawl, and extract functions
|
||||
"""
|
||||
|
||||
class StateBox:
    """
    StateBox is a thread-safe (I hope) data structure for communication
    between the crawler and the extractor threads. It holds shared metadata
    as well as the set of sites that have already been visited and parsed,
    so that infinite crawl queues are avoided.

    Note that this is most likely a bad design; properly typed message queues
    would be the computer-science way of handling this, so this API may
    change or be deleted in the future, if this ever becomes a thing beyond
    saving all the fanfiction.
    """

    def __init__(self, origin, initial_list=None, sameorigin=True):
        # A default of None avoids the mutable-default-argument trap, where
        # a single list would be shared by every instance.
        self.starturl = tldextract.extract(origin)
        self.origin = origin
        self.norecursive = sameorigin
        self.lock = threading.Lock()
        self.seen = set(initial_list or [])

    def add(self, uri):
        """Record uri as visited."""
        with self.lock:
            self.seen.add(uri)

    def delete(self, uri):
        """Forget uri so that it can be visited again."""
        with self.lock:
            if uri in self.seen:
                self.seen.remove(uri)

    def seenthis(self, uri):
        """Return True if uri has already been visited."""
        return uri in self.seen

    def okcrawl(self, uri):
        """Return True if uri falls within the allowed crawl scope."""
        if not self.norecursive:
            return True
        # Same-origin crawls only follow links whose registered domain
        # matches that of the starting URL.
        ext = tldextract.extract(uri)
        return ext.registered_domain == self.starturl.registered_domain
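

# A minimal usage sketch for StateBox; the URLs are hypothetical placeholders,
# and tldextract treats "www.example.com" and "example.com" as the same
# registered domain:
#
#     sb = StateBox("https://www.example.com/stories", sameorigin=True)
#     sb.okcrawl("https://example.com/stories/42")    # True: same domain
#     sb.okcrawl("https://other-site.net/stories/1")  # False: off-origin
#     sb.add("https://example.com/stories/42")
#     sb.seenthis("https://example.com/stories/42")   # True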


_end = object()  # Sentinel placed on the queue to signal the end of the crawl.


def crawler(q, sb):
    """Producer thread: discover links starting from sb.origin and queue them."""
    links = linkview(sb.origin)
    print("Nlinks stage 1: " + str(len(links)))
    for i in links:
        q.put(i)
    if sb.norecursive:
        q.put(_end)
    else:
        ## FIXME: Replace with proper recursive algorithm when feature
        ## complete; for now the crawl only goes two levels deep.
        for i in links:
            print(str(q.qsize()))
            nthdegree = linkview(i)
            for x in nthdegree:
                q.put(x)
        q.put(_end)  # The extractor should not need this, but we will do it anyway.


def extractor(q, sb):
    """Consumer thread: scrape each queued page and save its text to disk."""
    # Remember the starting directory once, before the loop.
    basedir = os.getcwd()
    while not q.empty():
        task = q.get()
        if task is _end:
            os.chdir(basedir)
            break
        if sb.seenthis(task) or not sb.okcrawl(task):
            q.task_done()
            continue

        # Save each page in a directory named after its registered domain.
        etd = tldextract.extract(task)
        dumppath = os.path.join(basedir, etd.registered_domain)
        if not os.path.isdir(dumppath):
            os.mkdir(dumppath)
        os.chdir(dumppath)

        pack = snarf(task)
        svsname = slugify(pack[0]) + '.txt'
        fp = codecs.open(svsname, "w", 'utf-8')
        fp.write(pack[1])
        fp.close()
        os.chdir(basedir)
        sb.add(task)
        q.task_done()
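

# A minimal sketch of how the two threads are intended to be wired together.
# The start URL is a hypothetical placeholder and error handling is omitted;
# the short sleep gives the crawler a head start so that the extractor's
# q.empty() check does not fire before anything has been queued.
if __name__ == "__main__":
    q = Queue()
    sb = StateBox("https://www.example.com/stories")
    crawl_thread = threading.Thread(target=crawler, args=(q, sb))
    extract_thread = threading.Thread(target=extractor, args=(q, sb))
    crawl_thread.start()
    time.sleep(1)
    extract_thread.start()
    crawl_thread.join()
    extract_thread.join()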