snarfbot/snarfbot/linkview.py

14 lines
543 B
Python

from bs4 import BeautifulSoup
import urllib.request as urllib2
import re
# Default value for the User-Agent request header sent by linkview() when the
# caller does not pass its own user_agent.
DEFAULT_USER_AGENT = "Mozilla/5.0 (X11; SnarfBot; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0"
def linkview(url, user_agent=DEFAULT_USER_AGENT):
    """Return the absolute http(s) link targets found on the page at *url*.

    The page is fetched with ``urllib.request`` using *user_agent* as the
    ``User-Agent`` header, parsed with BeautifulSoup's ``lxml`` parser, and
    every ``<a>`` tag whose ``href`` starts with ``http://`` or ``https://``
    contributes its ``href`` value to the result.

    Args:
        url: Address of the page to fetch.
        user_agent: Value sent in the ``User-Agent`` request header.

    Returns:
        A list of ``href`` strings in document order. Relative links and
        other schemes (``mailto:``, ``ftp:``, ...) are not included.
        Duplicates are kept.

    Raises:
        Whatever ``urllib.request.urlopen`` raises for the request (for
        example ``urllib.error.URLError``); errors are not caught here.
    """
    req = urllib2.Request(url, data=None, headers={'User-Agent': user_agent})
    # Close the HTTP response deterministically instead of leaving the
    # socket open until garbage collection. The document is parsed inside
    # the block, while the stream is still readable.
    with urllib2.urlopen(req) as html_page:
        soup = BeautifulSoup(html_page, features="lxml")
    # Match is case-sensitive and anchored at the start of the attribute, so
    # only absolute http/https URLs pass.
    absolute_href = re.compile(r"^http[s]?://")
    return [
        link.get('href')
        for link in soup.find_all('a', attrs={'href': absolute_href})
    ]