From f0caf134705878a5a9f1ec4ee4842a4837ae6de7 Mon Sep 17 00:00:00 2001
From: Matt Arnold
Date: Fri, 1 Jan 2021 00:27:17 -0500
Subject: [PATCH] * Set a proper user agent, and add a safety valve so
 infinite recursions don't happen

---
 snarfbot/crawlerapi.py | 8 ++++++--
 snarfbot/linkview.py   | 7 ++++---
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/snarfbot/crawlerapi.py b/snarfbot/crawlerapi.py
index 0140404..def9bac 100644
--- a/snarfbot/crawlerapi.py
+++ b/snarfbot/crawlerapi.py
@@ -33,7 +33,7 @@ class StateBox:
         self.lock = threading.Lock()
         self.seen = set()
         self.mark = set()
-        self.levels = levels
+        self.maxlevel = levels
         for i in inital_seen:
             self.seen.add(i)
 
@@ -103,7 +103,7 @@ _end = object()
 
 
 def crawler(q, sb):
-
+    level = 0
     links = linkview(sb.origin)
     print("Nlinks stage 1: " + str(len(links)))
     for i in links:
@@ -111,13 +111,17 @@ def crawler(q, sb):
             sb.mark_add(i)
     # FIXME: Replace with proper recursive algorithm when
     # feature complete
+    level += 1
     for i in links:
         print(str(q.qsize()))
+        if not level < sb.maxlevel:
+            break
         nthdegree = linkview(i)
         for x in nthdegree:
             if sb.okcrawl(x) and not sb.marked(x):
                 q.put(x)
                 sb.mark_add(x)
+        level += 1
 
     q.put(_end)  # extractor should not need this but we will do it anyway.
 
diff --git a/snarfbot/linkview.py b/snarfbot/linkview.py
index 121a020..cba0566 100644
--- a/snarfbot/linkview.py
+++ b/snarfbot/linkview.py
@@ -1,9 +1,10 @@
 from bs4 import BeautifulSoup
 import urllib.request as urllib2
 import re
-
-def linkview(url):
-    html_page = urllib2.urlopen(url)
+DEFAULT_USER_AGENT = "Mozilla/5.0 (X11; SnarfBot; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0"
+def linkview(url, user_agent=DEFAULT_USER_AGENT):
+    req = urllib2.Request(url, data=None, headers={'User-Agent':user_agent})
+    html_page = urllib2.urlopen(req)
     soup = BeautifulSoup(html_page, features="lxml")
     links = []