diff --git a/README.md b/README.md
index 7851ec9..a7b0b0d 100644
--- a/README.md
+++ b/README.md
@@ -2,9 +2,9 @@
 
 A Python webapp that "herds" a list of open proxies, with an API for external programs to add and update entries.
 
-This is only the webapp, [openproxyherder-extras](https://git.sr.ht/~emerson/openproxyherder-extras) has some example Python scripts that might be useful to you.
+`scripts/` contains some example scripts that might be useful.
 
-## Setup
+## Setting up openproxyherder
 
 This is tested with Python 3.8 and 3.9, but should work with Python 3.5 and above.
 
@@ -31,4 +31,8 @@ Copy `config.example.yaml` to `config.yaml`, edit `config.yaml` as needed.
 
 Install the packages in `requirements.txt` (probably a good idea to create a venv).
 
-Run `python openproxyherder.py`
\ No newline at end of file
+Run `python openproxyherder.py`
+
+## Scripts
+
+See the README in `scripts/` for information about the scripts.
\ No newline at end of file
diff --git a/config.example.yaml b/config.example.yaml
index bce5e66..98c45a4 100644
--- a/config.example.yaml
+++ b/config.example.yaml
@@ -3,6 +3,6 @@ db_port: 5432
 db_user: "test"
 db_name: "openproxyherder"
 db_password: "test"
-host: "127.0.0.1"
-port: 8080
+listen_host: "127.0.0.1"
+listen_port: 8080
 access_log: True
diff --git a/scripts/README.md b/scripts/README.md
new file mode 100644
index 0000000..bbce828
--- /dev/null
+++ b/scripts/README.md
@@ -0,0 +1,44 @@
+Proxy scrapers and checkers that can be used with openproxyherder.
+
+These work with any recent Python 3: make a venv, install `requirements.txt`, copy `config.example.yaml` to `config.yaml`, and change the values to match your setup.
+
+These are not shining examples of code; they're quick scripts that work (unless they don't), meant to show how to interact with OPH.
+
+No, I'm not going to give you my list of proxy sites. Do your own research.
+
+Also, be a good citizen of the internet.
+
+## Checkers
+Scripts that check IPs listed in OPH.
+
+### http_socks.py
+Checks HTTP/SOCKS(4|5) proxies. It attempts to connect through the proxy to a webpage that consists solely of [connecting ip address] [string defined in `scrapers/config.yaml`]. The one in the example config is a public one you are welcome to use. If you want to host your own, nginx SSI is the easiest way: `<!--# echo var="remote_addr" --> yourproxystring`.
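+
+If you don't want to deal with nginx, a minimal Python stand-in for the echo page looks something like this (a hypothetical sketch, not part of OPH; adjust the port and string to taste):
+
+```python
+# Minimal stand-in for the echo page http_socks.py fetches:
+# replies "<connecting ip> yourproxystring" to every GET.
+from http.server import BaseHTTPRequestHandler, HTTPServer
+
+PROXY_STRING = "yourproxystring"  # must match the string in the checker config
+
+class EchoHandler(BaseHTTPRequestHandler):
+    def do_GET(self):
+        body = f"{self.client_address[0]} {PROXY_STRING}".encode()
+        self.send_response(200)
+        self.send_header("Content-Type", "text/plain")
+        self.send_header("Content-Length", str(len(body)))
+        self.end_headers()
+        self.wfile.write(body)
+
+HTTPServer(("0.0.0.0", 8000), EchoHandler).serve_forever()
+```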
+
+### vpngate.py
+Checks vpngate proxies. Not much else to say here. VPNGate nodes are technically VPNs rather than proxies, but locking the door and leaving the key in the lock is essentially the same as leaving it open, and that's what VPNGate does.
+
+## Gatherers
+Scripts that get proxies from various places.
+
+Note that as of now, some of these write directly into the database rather than using the OPH API. That's for historical reasons: before OPH used asyncpg, it would lock up when hundreds of proxies were added at once. This will probably change eventually.
\ No newline at end of file
diff --git a/scripts/gatherers/config.example.yaml b/scripts/gatherers/config.example.yaml
new file mode 100644
index 0000000..43c8795
--- /dev/null
+++ b/scripts/gatherers/config.example.yaml
@@ -0,0 +1,25 @@
+dbname: openproxyherder
+dbuser: openproxyherder
+dbpassword: openproxyherder
+dbhost: "127.0.0.1"
+
+# Fake UA for sites that actually check it
+user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:90.0) Gecko/20100101 Firefox/90.0"
+
+# for regex_scraper.py
+regex_sites:
+  # this key goes into the proxy's comment field
+  "aliveproxy.com":
+    # Type of proxy; goes directly into `proxy_type`. 'dynamic' pulls the type from the regex's proxy_type named capture group
+    proxy_type: 'dynamic'
+    # if `replace` is specified, this is treated as a format string: each `replace` value is substituted in and every resulting URL is fetched
+    url: "http://aliveproxy.com/proxy-list-port-{}"
+    # the ip and port named capture groups are required; proxy_type is only needed when proxy_type above is 'dynamic'
+    regex: '(?P<proxy_type>[^:]+)://(?P<ip>[\d\.]+):(?P<port>\d+)'
+    # optional, nice to have for sites with multiple pages of proxies
+    replace:
+      - "80"
+      - "81"
+      - "3128"
+      - "8000"
+      - "8080"
\ No newline at end of file
diff --git a/scripts/gatherers/regex_scraper.py b/scripts/gatherers/regex_scraper.py
new file mode 100644
index 0000000..b29b4ec
--- /dev/null
+++ b/scripts/gatherers/regex_scraper.py
@@ -0,0 +1,58 @@
+import requests
+import yaml
+import re
+from datetime import datetime
+import psycopg2
+import psycopg2.extras
+
+config = yaml.full_load(open("config.yaml"))
+conn = psycopg2.connect(dbname=config["dbname"], user=config["dbuser"], password=config["dbpassword"], host=config["dbhost"])
+scraped_proxies = []
+
+def check_site(site, data):
+    text = ""
+    print(site)
+    ua = {"User-Agent": config["user_agent"]}
+    if data.get("replace"):
+        for sub in data["replace"]:
+            try:
+                r = requests.get(data["url"].format(sub), headers=ua)
+                text += r.text
+            except requests.RequestException:
+                print(f"{data['url'].format(sub)} had an error")
+    else:
+        try:
+            r = requests.get(data["url"], headers=ua)
+            text += r.text
+        except requests.RequestException:
+            print(f"{data['url']} had an error")
+
+    for proxy in re.finditer(re.compile(data['regex'], re.MULTILINE | re.DOTALL), text):
+        ip = proxy.group('ip')
+        port = int(proxy.group('port'))
+        if data['proxy_type'] == 'dynamic':
+            proxy_type = proxy.group('proxy_type')
+        else:
+            proxy_type = data["proxy_type"]
+        scraped_proxies.append((ip, port, proxy_type, datetime.now(), f"From {site}"))
+
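+# Everything below runs in a single transaction: fetch the (ip, port) pairs
+# already in the database for the scraped IPs, then insert only the pairs that
+# aren't there yet. current_proxies doubles as a seen-set, so a proxy scraped
+# twice in one run is also only submitted once.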
+with conn:
+    with conn.cursor() as curs:
+        for site, data in config["regex_sites"].items():
+            check_site(site, data)
+        scraped_ips = [p[0] for p in scraped_proxies]
+        curs.execute("select ip, port from proxies where host(ip)=ANY(%s)", (scraped_ips,))
+        current_proxies = list(curs.fetchall())
+        submitted_proxies = []
+        for scraped_proxy in scraped_proxies:
+            ip_port = (scraped_proxy[0], scraped_proxy[1])
+            if ip_port not in current_proxies:
+                submitted_proxies.append(scraped_proxy)
+                current_proxies.append(ip_port)
+        query = "insert into proxies (ip, port, proxy_type, submitted_at, comment) values %s"
+        psycopg2.extras.execute_values(curs, query, submitted_proxies)
+        print(f"submitted {len(submitted_proxies)} of {len(scraped_proxies)}")
\ No newline at end of file
diff --git a/scripts/gatherers/vpngate_api.py b/scripts/gatherers/vpngate_api.py
new file mode 100644
index 0000000..981a604
--- /dev/null
+++ b/scripts/gatherers/vpngate_api.py
@@ -0,0 +1,38 @@
+import requests
+import re
+import psycopg2
+import psycopg2.extras
+from datetime import datetime
+from base64 import b64decode
+import yaml
+
+config = yaml.full_load(open("config.yaml").read())
+pgconn = psycopg2.connect(dbname=config["dbname"], user=config["dbuser"], password=config["dbpassword"], host=config["dbhost"])
+r = requests.get("http://www.vpngate.net/api/iphone/", headers={"User-Agent": config["user_agent"]})
+csvfile = r.text.split("\r\n")
+scraped_servers = []
+current_servers = []
+with pgconn:
+    with pgconn.cursor() as curs:
+        curs.execute("select ip, port from proxies where proxy_type='vpngate'")
+        current_servers = curs.fetchall()
+
+for server in csvfile[2:]:
+    try:
+        hostname, ip, score, ping, speed, clong, cshort, numsessions, uptime, totalusers, totaltraffic, logtype, operator, message, configdata = server.split(",")
+    except ValueError:
+        continue
+    configdataplain = b64decode(configdata)
+    remote = re.search(r"remote (\S+) (\d+)", configdataplain.decode())
+    if remote:
+        host = remote[1]
+        port = int(remote[2])
+        if (host, port) not in current_servers:
+            scraped_servers.append((host, port, host, "vpngate", datetime.now(), "vpngate proxies api"))
+
+with pgconn:
+    with pgconn.cursor() as curs:
+        insert_query = """
+            insert into proxies (ip, port, exit_ip, proxy_type, submitted_at, comment)
+            values %s"""
+        psycopg2.extras.execute_values(curs, insert_query, scraped_servers)
diff --git a/scripts/gatherers/vpngate_other.py b/scripts/gatherers/vpngate_other.py
new file mode 100644
index 0000000..b4539d5
--- /dev/null
+++ b/scripts/gatherers/vpngate_other.py
@@ -0,0 +1,34 @@
+import psycopg2
+import psycopg2.extras
+import requests
+import re
+import yaml
+from bs4 import BeautifulSoup
+from datetime import datetime
+from time import sleep
+
+config = yaml.full_load(open("config.yaml").read())
+pgconn = psycopg2.connect(dbname=config["dbname"], user=config["dbuser"], password=config["dbpassword"], host=config["dbhost"])
+current_servers = []
+with pgconn:
+    with pgconn.cursor() as curs:
+        curs.execute("select ip, port from proxies where proxy_type='vpngate'")
+        current_servers = curs.fetchall()
+
+proxies = []
+for i in range(1, 3):
+    r = requests.get(f"https://freevpn.gg/?p={i}", headers={"User-Agent": config["user_agent"]})
+    data = BeautifulSoup(r.text, features="html5lib")
+    for ip_data in data.find_all(class_="card-box"):
+        ip = ip_data.find("h3").text
+        port = re.search(r"TCP\((\d+)\)", ip_data.find(class_="card-text").text)
+        if port and (ip, int(port[1])) not in current_servers:
+            proxies.append((ip, int(port[1]), ip, "unscanned", "vpngate", datetime.now(), "vpngate proxies freevpn"))
+    sleep(5)
+
+with pgconn:
+    with pgconn.cursor() as curs:
+        insert_query = """
+            insert into proxies (ip, port, exit_ip, status, proxy_type, submitted_at, comment)
+            values %s"""
+        psycopg2.extras.execute_values(curs, insert_query, proxies)
\ No newline at end of file
diff --git a/scripts/requirements.txt b/scripts/requirements.txt
new file mode 100644
index 0000000..585a6ab
--- /dev/null
+++ b/scripts/requirements.txt
@@ -0,0 +1,8 @@
+bs4
+html5lib
+psycopg2
+pycurl
+pyyaml
+requests
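+
+# bs4/html5lib are only used by vpngate_other.py; psycopg2 by the gatherers that write straight to the database
\ No newline at end of file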