* Set a proper user agent, and add a safety valve so infinite recursions don't happen
This commit is contained in:
parent
0f6bb97425
commit
f0caf13470
|
@ -33,7 +33,7 @@ class StateBox:
|
|||
self.lock = threading.Lock()
|
||||
self.seen = set()
|
||||
self.mark = set()
|
||||
self.levels = levels
|
||||
self.maxlevel = levels
|
||||
for i in inital_seen:
|
||||
self.seen.add(i)
|
||||
|
||||
|
@ -103,7 +103,7 @@ _end = object()
|
|||
|
||||
def crawler(q, sb):
|
||||
|
||||
|
||||
level = 0
|
||||
links = linkview(sb.origin)
|
||||
print("Nlinks stage 1: " + str(len(links)))
|
||||
for i in links:
|
||||
|
@ -111,13 +111,17 @@ def crawler(q, sb):
|
|||
sb.mark_add(i)
|
||||
# FIXME: Replace with proper recursive algorithm when
|
||||
# feature complete
|
||||
level += 1
|
||||
for i in links:
|
||||
print(str(q.qsize()))
|
||||
if not level < sb.maxlevel:
|
||||
break
|
||||
nthdegree = linkview(i)
|
||||
for x in nthdegree:
|
||||
if sb.okcrawl(x) and not sb.marked(x):
|
||||
q.put(x)
|
||||
sb.mark_add(x)
|
||||
level += 1
|
||||
q.put(_end) # extractor should not need this but we will do it anyway.
|
||||
|
||||
|
||||
|
|
|
@ -1,9 +1,10 @@
|
|||
from bs4 import BeautifulSoup
|
||||
import urllib.request as urllib2
|
||||
import re
|
||||
|
||||
def linkview(url):
|
||||
html_page = urllib2.urlopen(url)
|
||||
DEFAULT_USER_AGENT = "Mozilla/5.0 (X11; SnarfBot; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0"
|
||||
def linkview(url, user_agent=DEFAULT_USER_AGENT):
|
||||
req = urllib2.Request(url, data=None, headers={'User-Agent':user_agent})
|
||||
html_page = urllib2.urlopen(req)
|
||||
soup = BeautifulSoup(html_page, features="lxml")
|
||||
links = []
|
||||
|
||||
|
|
Loading…
Reference in New Issue