# prxCrwl/spider.py

import os
import re
import subprocess
from subprocess import PIPE

import lxml.html as html
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
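
# External pieces this file relies on but does not install: Selenium's Firefox
# driver typically needs geckodriver reachable on PATH (newer Selenium releases
# can fetch it themselves), and main() shells out to the proxychains and curl
# binaries.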

class Spider():
    """Base spider: fetches pages through a headless Firefox via Selenium."""

    sel = None

    def start_selenium(self):
        # Launch Firefox without a visible window.
        options = Options()
        options.add_argument("--headless")
        self.sel = webdriver.Firefox(options=options)
        return self.sel

    def get(self, url):
        # Start the browser lazily on the first request.
        if self.sel is None:
            self.start_selenium()
        self.sel.get(url)
        return self.sel.page_source
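
# Usage sketch for the base class (illustrative URL, not from the repo):
#
#   spider = Spider()
#   source = spider.get("https://example.com")
#   spider.sel.quit()  # nothing in this file closes the browser itself
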
##################################################

class PListSpider(Spider):
    """This spider crawls https://proxy-list.download/SOCKS5"""

    url = "https://proxy-list.download/SOCKS5"

    def get_proxies(self):
        """Returns a list of (ip, port) tuples."""
        page = self.get(self.url)
        tree = html.fromstring(page)
        proxies = []
        for entry in tree.xpath('//tr'):
            ip = None  # reset per row so a stale IP never pairs with a new port
            for e in entry.xpath('./td/text()'):
                # A dotted quad in a cell is the proxy's IP address.
                m = re.search(r"[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}", e)
                if m is not None:
                    ip = m.group(0)
                    continue
                # After the IP, the next bare number is taken as the port.
                m = re.search(r"[0-9]{1,5}", e)
                if ip is not None and m is not None:
                    proxies.append((ip, m.group(0)))
                    break
        return proxies
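
# Illustrative row shape the parser above assumes (reconstructed, not captured
# from the live site): the IP sits in one <td> and the port in a later one,
#
#   <tr><td>203.0.113.7</td><td>1080</td> ... </tr>
#
# so the dotted-quad regex fires first and the bare-number regex then picks up
# the port.
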
##################################################

def main():
    spider = PListSpider()
    proxies = spider.get_proxies()
    with open("proxychains.conf", "r") as fd:
        conf = fd.read()
    for proxy in proxies:
        # Write a throwaway config with this one candidate appended.
        with open("tmp.conf", "w") as fd:
            fd.write(conf + "\nsocks5 {ip} {port}".format(ip=proxy[0], port=proxy[1]))
        cmd = "proxychains -f tmp.conf curl --url https://check.torproject.org"
        print(cmd)
        # proxychains reports per-hop status on stderr, so inspect stderr.
        _, err = subprocess.Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE).communicate()
        print(err)
        os.remove("tmp.conf")
        # Keep the proxy only if the chain did not time out.
        if b"timeout!" not in err:
            with open("pList.txt", "a") as pListFd:
                pListFd.write(proxy[0] + "," + proxy[1] + "\n")


if __name__ == "__main__":
    main()
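
# A minimal proxychains.conf this script could be pointed at, sketched under
# the assumption of a stock proxychains-ng install (the file itself is not part
# of the repo):
#
#   strict_chain
#   proxy_dns
#   tcp_read_time_out 15000
#   tcp_connect_time_out 8000
#   [ProxyList]
#
# main() appends one "socks5 <ip> <port>" line per candidate, runs curl through
# the chain, and keeps the proxy when proxychains' stderr lacks "timeout!".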