* Set a proper user agent, and add a safety valve so infinite recursions don't happen
This commit is contained in:
parent
0f6bb97425
commit
f0caf13470
|
@ -33,7 +33,7 @@ class StateBox:
|
||||||
self.lock = threading.Lock()
|
self.lock = threading.Lock()
|
||||||
self.seen = set()
|
self.seen = set()
|
||||||
self.mark = set()
|
self.mark = set()
|
||||||
self.levels = levels
|
self.maxlevel = levels
|
||||||
for i in inital_seen:
|
for i in inital_seen:
|
||||||
self.seen.add(i)
|
self.seen.add(i)
|
||||||
|
|
||||||
|
@ -103,7 +103,7 @@ _end = object()
|
||||||
|
|
||||||
def crawler(q, sb):
|
def crawler(q, sb):
|
||||||
|
|
||||||
|
level = 0
|
||||||
links = linkview(sb.origin)
|
links = linkview(sb.origin)
|
||||||
print("Nlinks stage 1: " + str(len(links)))
|
print("Nlinks stage 1: " + str(len(links)))
|
||||||
for i in links:
|
for i in links:
|
||||||
|
@ -111,13 +111,17 @@ def crawler(q, sb):
|
||||||
sb.mark_add(i)
|
sb.mark_add(i)
|
||||||
# FIXME: Replace with proper recursive algorithm when
|
# FIXME: Replace with proper recursive algorithm when
|
||||||
# feature complete
|
# feature complete
|
||||||
|
level += 1
|
||||||
for i in links:
|
for i in links:
|
||||||
print(str(q.qsize()))
|
print(str(q.qsize()))
|
||||||
|
if not level < sb.maxlevel:
|
||||||
|
break
|
||||||
nthdegree = linkview(i)
|
nthdegree = linkview(i)
|
||||||
for x in nthdegree:
|
for x in nthdegree:
|
||||||
if sb.okcrawl(x) and not sb.marked(x):
|
if sb.okcrawl(x) and not sb.marked(x):
|
||||||
q.put(x)
|
q.put(x)
|
||||||
sb.mark_add(x)
|
sb.mark_add(x)
|
||||||
|
level += 1
|
||||||
q.put(_end) # extractor should not need this but we will do it anyway.
|
q.put(_end) # extractor should not need this but we will do it anyway.
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,9 +1,10 @@
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
import urllib.request as urllib2
|
import urllib.request as urllib2
|
||||||
import re
|
import re
|
||||||
|
DEFAULT_USER_AGENT = "Mozilla/5.0 (X11; SnarfBot; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0"
|
||||||
def linkview(url):
|
def linkview(url, user_agent=DEFAULT_USER_AGENT):
|
||||||
html_page = urllib2.urlopen(url)
|
req = urllib2.Request(url, data=None, headers={'User-Agent':user_agent})
|
||||||
|
html_page = urllib2.urlopen(req)
|
||||||
soup = BeautifulSoup(html_page, features="lxml")
|
soup = BeautifulSoup(html_page, features="lxml")
|
||||||
links = []
|
links = []
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue