* Set a proper user agent, and add a safety valve so infinite recursions don't happen

Matt Arnold 2021-01-01 00:27:17 -05:00
parent 0f6bb97425
commit f0caf13470
2 changed files with 10 additions and 5 deletions

View File

@ -33,7 +33,7 @@ class StateBox:
        self.lock = threading.Lock()
        self.seen = set()
        self.mark = set()
        self.levels = levels
        self.maxlevel = levels
        for i in inital_seen:
            self.seen.add(i)
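For orientation, the renamed field lives in the StateBox constructor: the configured level count is now stored as maxlevel, which the crawler below reads as sb.maxlevel. A minimal sketch of that constructor, assuming a signature built from the names visible in this diff (origin, inital_seen, levels); the default values and anything outside the hunk are guesses, not the actual source:

import threading

class StateBox:
    def __init__(self, origin, inital_seen=(), levels=1):
        self.origin = origin              # assumed: read as sb.origin by crawler() below
        self.lock = threading.Lock()
        self.seen = set()
        self.mark = set()
        self.maxlevel = levels            # was self.levels before this commit
        for i in inital_seen:
            self.seen.add(i)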
@ -103,7 +103,7 @@ _end = object()
def crawler(q, sb):
    level = 0
    links = linkview(sb.origin)
    print("Nlinks stage 1: " + str(len(links)))
    for i in links:
@ -111,13 +111,17 @@ def crawler(q, sb):
            sb.mark_add(i)
    # FIXME: Replace with proper recursive algorithm when
    # feature complete
    level += 1
    for i in links:
        print(str(q.qsize()))
        if not level < sb.maxlevel:
            break
        nthdegree = linkview(i)
        for x in nthdegree:
            if sb.okcrawl(x) and not sb.marked(x):
                q.put(x)
                sb.mark_add(x)
        level += 1
    q.put(_end) # extractor should not need this but we will do it anyway.
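Taken together, this hunk is the safety valve from the commit message: each time one of the first-level links is expanded, level goes up, and once it is no longer below sb.maxlevel the loop breaks instead of fetching further pages. A self-contained sketch of that idea, with the StateBox helpers (okcrawl, marked, mark_add) and the guard in the first loop assumed, since the diff only shows part of the function:

_end = object()                          # module-level sentinel, as in the hunk header above

def crawler(q, sb):
    level = 0
    links = linkview(sb.origin)          # direct links from the start page
    for i in links:
        if sb.okcrawl(i):                # assumed guard; the diff only shows sb.mark_add(i)
            q.put(i)
            sb.mark_add(i)
    # FIXME in the original: replace with a proper recursive algorithm later.
    # Until then, the counter below is the safety valve against endless expansion.
    for i in links:
        if not level < sb.maxlevel:      # stop once the configured cap is reached
            break
        for x in linkview(i):            # second-degree links
            if sb.okcrawl(x) and not sb.marked(x):
                q.put(x)
                sb.mark_add(x)
        level += 1                       # one unit of "depth" per expanded page
    q.put(_end)                          # sentinel so the consumer knows the crawl is done

Note that level counts expanded pages rather than true link depth, which is presumably what the FIXME about a proper recursive algorithm is pointing at.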

View File

@ -1,9 +1,10 @@
from bs4 import BeautifulSoup
import urllib.request as urllib2
import re
def linkview(url):
    html_page = urllib2.urlopen(url)
DEFAULT_USER_AGENT = "Mozilla/5.0 (X11; SnarfBot; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0"
def linkview(url, user_agent=DEFAULT_USER_AGENT):
    req = urllib2.Request(url, data=None, headers={'User-Agent':user_agent})
    html_page = urllib2.urlopen(req)
    soup = BeautifulSoup(html_page, features="lxml")
    links = []
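The second changed file gives linkview an explicit User-Agent header instead of letting urllib identify itself with its default string. A sketch of the updated function as a whole; the link-extraction tail is not shown in the diff, so the find_all loop and the return statement below are assumptions (the original may filter hrefs with the re module imported above):

from bs4 import BeautifulSoup
import urllib.request as urllib2

DEFAULT_USER_AGENT = "Mozilla/5.0 (X11; SnarfBot; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0"

def linkview(url, user_agent=DEFAULT_USER_AGENT):
    # Build a Request with an explicit User-Agent header rather than
    # letting urllib send its default "Python-urllib/x.y" identifier.
    req = urllib2.Request(url, data=None, headers={'User-Agent': user_agent})
    html_page = urllib2.urlopen(req)
    soup = BeautifulSoup(html_page, features="lxml")
    links = []
    # Assumed tail: collect every href on the page.
    for a in soup.find_all('a', href=True):
        links.append(a['href'])
    return links

Because the new parameter has a default, existing callers such as crawler() keep working unchanged, and a different agent string can still be passed per call, e.g. linkview(url, user_agent="MyBot/1.0").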