import logging
import threading
import time
import os
import os.path
import codecs

import tldextract

from queue import Queue
from snarfbot.snarf3k import slugify, snarf
from snarfbot.linkview import linkview

"""
|
|
Module contains the bulk of the crawler code this is done with two types of thread
|
|
One that performs the actual crawling through links and one that extracts and saves data
|
|
to disk. These are implemented in crawl, and extract functions
|
|
"""
|
|
|
|
|
|
class StateBox:
    """
    StateBox is a thread-safe data structure for communication between the
    crawler and the extractor threads. It holds shared metadata as well as the
    sets of URIs that have already been queued or visited, so that the crawl
    queue cannot grow without bound.

    Note that this is most likely a bad design; properly typed message queues
    would be the more computer-sciencey way of handling this, so this API may
    change or be deleted in the future if this becomes a thing beyond saving
    all the fanfiction. See the illustrative sketch right after this class for
    how the crawler and extractor are expected to use it.
    """

    def __init__(self, origin, inital_seen=(), sameorigin=True, levels=1):
        # Pre-parse the origin so okcrawl() can compare registered domains.
        self.starturl = tldextract.extract(origin)
        self.origin = origin
        # When True, only URIs on the same registered domain as the origin
        # are allowed to be crawled.
        self.norecursive = sameorigin
        self.lock = threading.Lock()
        self.seen = set(inital_seen)  # URIs already extracted and saved
        self.mark = set()             # URIs already queued for extraction
        self.maxlevel = levels

    def seen_add(self, uri):
        """Record that a URI has been fully extracted and saved."""
        with self.lock:
            self.seen.add(uri)

    def seen_delete(self, uri):
        """Forget a URI so that it can be extracted again."""
        with self.lock:
            self.seen.discard(uri)

    def seenthis(self, uri):
        """Return True if the URI has already been extracted."""
        return uri in self.seen

    def mark_add(self, uri):
        """Record that a URI has been queued for extraction."""
        with self.lock:
            self.mark.add(uri)

    def mark_delete(self, uri):
        """Forget that a URI was queued, so it may be queued again."""
        with self.lock:
            self.mark.discard(uri)

    def marked(self, uri):
        """Return True if the URI has already been queued for extraction."""
        return uri in self.mark

    def okcrawl(self, uri):
        """Return True if the URI may be crawled under the same-origin
        setting this StateBox was created with."""
        if not self.norecursive:
            return True
        # Same-origin mode: only URIs on the starting registered domain pass.
        ext = tldextract.extract(uri)
        return ext.registered_domain == self.starturl.registered_domain

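
# Illustrative sketch only (not called anywhere in this module): how the
# crawler and extractor are expected to use the mark/seen bookkeeping for a
# single URI.  The URLs are placeholders.
def _statebox_example():
    sb = StateBox("https://example.com/stories", sameorigin=True, levels=1)
    uri = "https://example.com/stories/chapter-1"
    if sb.okcrawl(uri) and not sb.marked(uri):
        sb.mark_add(uri)    # crawler side: queued for extraction
    if not sb.seenthis(uri):
        sb.seen_add(uri)    # extractor side: content has been saved
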
# Sentinel object the crawler puts on the queue to tell the extractor that
# crawling is finished.
_end = object()


def crawler(q, sb):
    """
    Crawler thread: walk links starting from sb.origin and put every URI that
    should be saved onto the queue for the extractor.
    """
    level = 0
    links = linkview(sb.origin)
    print("Nlinks stage 1: " + str(len(links)))
    for i in links:
        q.put(i)
        sb.mark_add(i)
    # FIXME: Replace with proper recursive algorithm when
    # feature complete
    level += 1
    for i in links:
        print(str(q.qsize()))
        if not level < sb.maxlevel:
            break
        nthdegree = linkview(i)
        for x in nthdegree:
            if sb.okcrawl(x) and not sb.marked(x):
                q.put(x)
                sb.mark_add(x)
        level += 1
    q.put(_end)  # extractor should not need this but we will do it anyway.


def extractor(q, sb):
    """
    Extractor thread: pull URIs off the queue, scrape each page with snarf(),
    and save the text under a directory named after the page's registered
    domain.
    """
    while not q.empty():
        basedir = os.getcwd()
        task = q.get()
        if task is _end:
            os.chdir(basedir)
            break
        else:
            if sb.seenthis(task) or not sb.okcrawl(task):
                q.task_done()
                continue
            # One output directory per registered domain.
            etd = tldextract.extract(task)
            dumppath = os.path.join(basedir, etd.registered_domain)
            if os.path.exists(dumppath) and os.path.isdir(dumppath):
                os.chdir(dumppath)
            else:
                os.mkdir(dumppath)
                os.chdir(dumppath)

            # snarf() returns (title, text); the slugified title is the filename.
            pack = snarf(task)
            svsname = slugify(pack[0]) + '.txt'
            fp = codecs.open(svsname, "w", 'utf-8')
            fp.write(pack[1])
            fp.close()
            os.chdir(basedir)
            sb.seen_add(task)
            q.task_done()
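

# Illustrative usage sketch (assumed entry point, not used by the rest of the
# package): wire a crawler thread and the extractor together through a Queue
# and a shared StateBox.  The start URL is a placeholder; the crawler is
# joined before extraction because extractor() stops as soon as the queue is
# empty.
if __name__ == "__main__":
    start = "https://example.com/stories"  # placeholder origin URL
    q = Queue()
    sb = StateBox(start, sameorigin=True, levels=1)

    crawl_thread = threading.Thread(target=crawler, args=(q, sb))
    crawl_thread.start()
    crawl_thread.join()  # wait until every link has been queued

    extractor(q, sb)  # drain the queue and save pages to disk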