56 lines
1.8 KiB
Python
56 lines
1.8 KiB
Python
import requests
|
|
import yaml
|
|
import re
|
|
from datetime import datetime
|
|
import psycopg2
|
|
import psycopg2.extras
|
|
|
|
# Load the scraper settings. The context manager closes the file handle as
# soon as parsing finishes instead of leaving it open for the whole run.
# NOTE(review): yaml.full_load is fine for a trusted local file; prefer
# yaml.safe_load if config.yaml could ever come from an untrusted source.
with open("config.yaml") as config_file:
    config = yaml.full_load(config_file)

# Single PostgreSQL connection shared by the rest of the script.
conn = psycopg2.connect(
    dbname=config["dbname"],
    user=config["dbuser"],
    password=config["dbpassword"],
    host=config["dbhost"],
)

# Not referenced by the visible code; kept so existing users of the name
# (if any) keep working.
raw_proxies = []

# (ip, port, proxy_type, timestamp, comment) tuples collected by check_site().
scraped_proxies = []
|
|
|
|
# Upper bound (seconds) for connecting to / reading from a scraped site, so a
# single stalled server cannot block the whole run indefinitely.
_REQUEST_TIMEOUT_SECONDS = 30


def _fetch_text(url, headers):
    """Return the response body of ``url`` as text, or ``""`` if the request fails."""
    try:
        return requests.get(
            url, headers=headers, timeout=_REQUEST_TIMEOUT_SECONDS
        ).text
    except requests.RequestException as err:
        # Best effort: one unreachable site must not abort the scrape, but
        # the reason is reported instead of being silently discarded.
        print(f"{url} had an error: {err}")
        return ""


def check_site(site, data):
    """Download one configured site and collect every proxy its regex matches.

    Args:
        site: Name of the site; printed for progress and stored in the
            comment field of each collected proxy.
        data: Site settings with the keys
            ``url``        - page URL; a ``str.format`` template when
                             ``replace`` is present,
            ``replace``    - optional list of values substituted into ``url``,
                             one request per value,
            ``regex``      - pattern with named groups ``ip`` and ``port``
                             (and ``proxy_type`` for dynamic sites),
            ``proxy_type`` - fixed proxy type, or ``"dynamic"`` to read the
                             type from the ``proxy_type`` regex group.

    Returns:
        None. Matches are appended to the module-level ``scraped_proxies``
        list as ``(ip, port, proxy_type, timestamp, comment)`` tuples.
    """
    print(site)
    headers = {"User-Agent": config["user_agent"]}

    if data.get("replace"):
        urls = [data["url"].format(sub) for sub in data["replace"]]
    else:
        urls = [data["url"]]

    # All pages of a site are scanned as one text, exactly as before.
    text = "".join(_fetch_text(url, headers) for url in urls)

    pattern = re.compile(data["regex"], re.MULTILINE | re.DOTALL)
    for match in pattern.finditer(text):
        ip = match.group("ip")
        port = int(match.group("port"))
        if data["proxy_type"] == "dynamic":
            proxy_type = match.group("proxy_type")
        else:
            proxy_type = data["proxy_type"]
        scraped_proxies.append(
            (ip, port, proxy_type, datetime.now(), f"From {site}")
        )
|
|
|
|
# Scrape every configured site, then insert the proxies that are not in the
# database yet.
try:
    # `with conn` commits on success and rolls back on an exception; it does
    # NOT close the connection, hence the explicit close() in `finally`.
    with conn, conn.cursor() as curs:
        for site, data in config["regex_sites"].items():
            check_site(site, data)

        scraped_ips = [proxy[0] for proxy in scraped_proxies]

        # Select host(ip) so the Python-side comparison below uses the same
        # textual form of the address as the WHERE clause does.
        curs.execute(
            "select host(ip), port from proxies where host(ip)=ANY(%s)",
            (scraped_ips,),
        )
        # A set gives O(1) membership tests; it also tracks proxies queued
        # during this run so duplicates across sites are inserted only once.
        known_proxies = {(row[0], row[1]) for row in curs.fetchall()}

        submitted_proxies = []
        for scraped_proxy in scraped_proxies:
            ip_port = (scraped_proxy[0], scraped_proxy[1])
            if ip_port not in known_proxies:
                submitted_proxies.append(scraped_proxy)
                known_proxies.add(ip_port)

        query = "insert into proxies (ip, port, proxy_type, submitted_at, comment) values %s"
        psycopg2.extras.execute_values(curs, query, submitted_proxies)
        print(f"submitted {len(submitted_proxies)} of {len(scraped_proxies)}")
finally:
    conn.close()