openproxyherder/scripts/gatherers/regex_scraper.py

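"""Scrape proxy lists from the sites configured in config.yaml.

Each entry under "regex_sites" supplies a URL (optionally templated),
a regex with named groups, and a proxy type; any ip:port pairs not
already in the proxies table are inserted.
"""
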
import requests
import yaml
import re
from datetime import datetime
import psycopg2
import psycopg2.extras
with open("config.yaml") as f:
    config = yaml.full_load(f)

conn = psycopg2.connect(
    dbname=config["dbname"],
    user=config["dbuser"],
    password=config["dbpassword"],
    host=config["dbhost"],
)

scraped_proxies = []
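
# A minimal config.yaml sketch, inferred from the keys this script reads.
# The site name, URL, credentials, and pattern below are placeholders:
#
#   dbname: openproxyherder
#   dbuser: oph
#   dbpassword: secret
#   dbhost: localhost
#   user_agent: "openproxyherder regex scraper"
#   regex_sites:
#     example-list:
#       url: "https://example.com/proxies?page={}"
#       replace: [1, 2, 3]
#       proxy_type: socks5
#       regex: '(?P<ip>\d{1,3}(?:\.\d{1,3}){3}):(?P<port>\d+)'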
def check_site(site, data):
    """Fetch one configured site and collect every proxy its regex matches."""
    text = ""
    print(site)
    ua = {"User-Agent": config["user_agent"]}
    if data.get("replace"):
        # Paginated or sharded lists: format each substitution into the
        # URL template and concatenate the responses.
        for sub in data["replace"]:
            try:
                r = requests.get(data["url"].format(sub), headers=ua)
                text += r.text
            except requests.RequestException:
                print(f"{data['url'].format(sub)} had an error")
    else:
        try:
            r = requests.get(data["url"], headers=ua)
            text += r.text
        except requests.RequestException:
            print(f"{data['url']} had an error")
    # Each site's regex must define named groups "ip" and "port"
    # (plus "proxy_type" when the configured type is "dynamic").
    for proxy in re.finditer(re.compile(data["regex"], re.MULTILINE | re.DOTALL), text):
        ip = proxy.group("ip")
        port = int(proxy.group("port"))
        if data["proxy_type"] == "dynamic":
            # "dynamic" sites state the type per row, so take it from
            # the match rather than the config.
            proxy_type = proxy.group("proxy_type")
        else:
            proxy_type = data["proxy_type"]
        scraped_proxies.append((ip, port, proxy_type, datetime.now(), f"From {site}"))
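
# When a site's proxy_type is "dynamic", its regex must also capture the
# type itself, e.g. (a hypothetical pattern, not from any shipped config):
#   (?P<ip>\S+)\s+(?P<port>\d+)\s+(?P<proxy_type>socks4|socks5|http)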

with conn:
    with conn.cursor() as curs:
        for site, data in config["regex_sites"].items():
            check_site(site, data)
        scraped_ips = [p[0] for p in scraped_proxies]
        # Select host(ip) so the text form matches the scraped strings
        # even when the inet column carries a netmask.
        curs.execute("select host(ip), port from proxies where host(ip)=ANY(%s)", (scraped_ips,))
        current_proxies = list(curs.fetchall())
        submitted_proxies = []
        for scraped_proxy in scraped_proxies:
            ip_port = (scraped_proxy[0], scraped_proxy[1])
            if ip_port not in current_proxies:
                submitted_proxies.append(scraped_proxy)
                # Track the pair locally so a duplicate within this
                # scrape is only submitted once.
                current_proxies.append(ip_port)
        query = "insert into proxies (ip, port, proxy_type, submitted_at, comment) values %s"
        psycopg2.extras.execute_values(curs, query, submitted_proxies)
        print(f"submitted {len(submitted_proxies)} of {len(scraped_proxies)}")