add scripts and fix config

This commit is contained in:
Em 2021-06-23 14:44:52 -04:00
parent 4309c31f73
commit 23e57dfbdf
8 changed files with 191 additions and 5 deletions

View File

@ -2,9 +2,9 @@
A Python webapp that "herds" a list of open proxies, with an API for external programs to add and update entries.
This is only the webapp, [openproxyherder-extras](https://git.sr.ht/~emerson/openproxyherder-extras) has some example Python scripts that might be useful to you.
`scripts/` contains some example scripts that might be useful.
## Setup
## Setup openproxyherder
This is tested with Python 3.8 and 3.9, but should work with Python 3.5 and above.
@ -31,4 +31,8 @@ Copy `config.example.yaml` to `config.yaml`, edit `config.yaml` as needed.
Install the packages in `requirements.txt` (probably a good idea to create a venv).
Run `python openproxyherder.py`
## Scripts
See README in `scripts/` for information about the scripts

View File

@ -3,6 +3,6 @@ db_port: 5432
db_user: "test"
db_name: "openproxyherder"
db_password: "test"
host: "127.0.0.1"
port: 8080
listen_host: "127.0.0.1"
listen_port: 8080
access_log: True

scripts/README.md Normal file (+23)
View File

@ -0,0 +1,23 @@
Some proxy scrapers and checkers that can be used with openproxyherder.
They work with any recent Python 3: create a venv, install `requirements.txt`, copy `config.example.yaml` to `config.yaml`, and edit it for your setup.
These are not shining examples of code; they're quick scripts that work (until they don't), meant to show how to interact with OPH.
No, I'm not going to give you my list of proxy sites. Do your own research.
Also be a good citizen of the internet.
## Checkers
Scripts that check IPs listed in OPH.
### http_socks.py
Checks HTTP and SOCKS(4|5) proxies. It connects through each proxy to a webpage whose body consists solely of `[connecting ip address] [string defined in scripts/config.yaml]`. The page in the example config is a public one you are welcome to use. If you want to host your own, nginx SSI is the easiest way: `<!--#echo var="REMOTE_ADDR" --> yourproxystring`.
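A rough sketch of that check in plain `requests`, just to show the idea; the function name and the `check_url`/`check_string` parameters are made up here, and the real script also handles SOCKS proxies and records results in OPH:

```python
import requests

def check_http_proxy(ip, port, check_url, check_string, timeout=10):
    """Fetch the echo page through the proxy and see whether it answers with our marker string."""
    proxy = f"http://{ip}:{port}"
    try:
        r = requests.get(check_url, proxies={"http": proxy, "https": proxy}, timeout=timeout)
    except requests.RequestException:
        return None
    # The page body is "[connecting ip address] [marker string]", so a working
    # proxy returns the marker preceded by whatever IP the target server saw.
    if check_string in r.text:
        return r.text.split()[0]  # the proxy's exit IP
    return None
```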
### vpngate.py
Checks vpngate proxies. Not much else to say here. VPNGate servers are technically VPNs rather than proxies, but locking the door and leaving the key in the lock is essentially leaving it open, and that's what VPNGate does.
## Gatherers
Scripts that get proxies from various places.
Note that, as of now, some of these write directly into the database rather than going through the OPH API. That is for historical reasons: before OPH used asyncpg, it would lock up when hundreds of proxies were added at once. This will probably change eventually.
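For comparison, here is a rough sketch of what pushing proxies through the OPH API could look like. The `/addproxies` path and the payload fields are made up for illustration, so check `openproxyherder.py` for the actual routes before copying this; the host and port match the defaults in `config.example.yaml`.

```python
import requests

# Hypothetical endpoint and payload shape; the real routes live in openproxyherder.py.
OPH_URL = "http://127.0.0.1:8080"

def submit_proxies(scraped):
    """Send scraped (ip, port, proxy_type, comment) tuples to a made-up OPH add endpoint."""
    payload = [
        {"ip": ip, "port": port, "proxy_type": proxy_type, "comment": comment}
        for ip, port, proxy_type, comment in scraped
    ]
    r = requests.post(f"{OPH_URL}/addproxies", json=payload)
    r.raise_for_status()
```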

View File

@ -0,0 +1,25 @@
dbname: openproxyherder
dbuser: openproxyherder
dbpassword: openproxyherder
dbhost: "127.0.0.1"
# Fake UA for sites that actually check it
user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:90.0) Gecko/20100101 Firefox/90.0"
# for regex_scraper.py
regex_sites:
  # this key goes into the comment field of each submitted proxy
  "aliveproxy.com":
    # Type of proxy; this goes directly into `proxy_type`. 'dynamic' pulls the type from the `proxy_type` named capture group.
    proxy_type: 'dynamic'
    # If `replace` is specified, this URL is a format string: the scraper substitutes each `replace` value and fetches every resulting URL.
    url: "http://aliveproxy.com/proxy-list-port-{}"
    # When proxy_type is 'dynamic', the `proxy_type` capture group here decides what type gets recorded (see the sketch after this config).
    regex: '(?P<proxy_type>[^:]+)://(?P<ip>[\d\.]+):(?P<port>\d+)'
    # Optional; nice to have for sites with multiple pages of proxies.
    replace:
      - "80"
      - "81"
      - "3128"
      - "8000"
      - "8080"

View File

@ -0,0 +1,56 @@
import re
from datetime import datetime

import psycopg2
import psycopg2.extras
import requests
import yaml

config = yaml.full_load(open("config.yaml"))
conn = psycopg2.connect(dbname=config["dbname"], user=config["dbuser"], password=config["dbpassword"], host=config["dbhost"])

scraped_proxies = []


def check_site(site, data):
    """Fetch the configured URL(s) for a site and collect every proxy its regex finds."""
    text = ""
    print(site)
    ua = {"User-Agent": config["user_agent"]}
    if data.get("replace"):
        # `url` is a format string; fetch it once per `replace` value.
        for sub in data["replace"]:
            try:
                r = requests.get(data["url"].format(sub), headers=ua)
                text += r.text
            except requests.RequestException:
                print(f"{data['url'].format(sub)} had an error")
    else:
        try:
            r = requests.get(data["url"], headers=ua)
            text += r.text
        except requests.RequestException:
            print(f"{data['url']} had an error")
    for proxy in re.finditer(re.compile(data["regex"], re.MULTILINE | re.DOTALL), text):
        ip = proxy.group("ip")
        port = int(proxy.group("port"))
        if data["proxy_type"] == "dynamic":
            # Take the type from the `proxy_type` named capture group.
            proxy_type = proxy.group("proxy_type")
        else:
            proxy_type = data["proxy_type"]
        scraped_proxies.append((ip, port, proxy_type, datetime.now(), f"From {site}"))


with conn:
    with conn.cursor() as curs:
        for site, data in config["regex_sites"].items():
            check_site(site, data)
        # Look up which of the scraped IPs are already in the database.
        scraped_ips = [p[0] for p in scraped_proxies]
        curs.execute("select ip, port from proxies where host(ip)=ANY(%s)", (scraped_ips,))
        current_proxies = list(curs.fetchall())
        # Keep only (ip, port) pairs we have not seen before, then insert them in one batch.
        submitted_proxies = []
        for scraped_proxy in scraped_proxies:
            ip_port = (scraped_proxy[0], scraped_proxy[1])
            if ip_port not in current_proxies:
                submitted_proxies.append(scraped_proxy)
                current_proxies.append(ip_port)
        query = "insert into proxies (ip, port, proxy_type, submitted_at, comment) values %s"
        psycopg2.extras.execute_values(curs, query, submitted_proxies)
        print(f"submitted {len(submitted_proxies)} of {len(scraped_proxies)}")

View File

@ -0,0 +1,38 @@
import re
from base64 import b64decode
from datetime import datetime

import psycopg2
import psycopg2.extras
import requests
import yaml

config = yaml.full_load(open("config.yaml").read())
pgconn = psycopg2.connect(dbname=config["dbname"], user=config["dbuser"], password=config["dbpassword"], host=config["dbhost"])

# The vpngate "iphone" endpoint returns a CSV list of servers, with the OpenVPN config base64-encoded.
r = requests.get("http://www.vpngate.net/api/iphone/", headers={"User-Agent": config["user_agent"]})
csvfile = r.text.split("\r\n")

scraped_servers = []
current_servers = []
with pgconn:
    with pgconn.cursor() as curs:
        curs.execute("select ip, port from proxies where proxy_type='vpngate'")
        current_servers = curs.fetchall()

for server in csvfile[2:]:
    try:
        hostname, ip, score, ping, speed, clong, cshort, numsessions, uptime, totalusers, totaltraffic, logtype, operator, message, configdata = server.split(",")
    except ValueError:
        # Header, footer, or otherwise malformed line: skip it.
        continue
    # Pull the actual endpoint out of the base64-encoded OpenVPN config.
    configdataplain = b64decode(configdata)
    remote = re.search(r"remote (\S+) (\d+)", configdataplain.decode())
    if remote:
        host = remote[1]
        port = int(remote[2])
        if (host, port) not in current_servers:
            scraped_servers.append((host, port, host, "vpngate", datetime.now(), "vpngate proxies api"))

with pgconn:
    with pgconn.cursor() as curs:
        insert_query = """
            insert into proxies (ip, port, exit_ip, proxy_type, submitted_at, comment)
            values %s"""
        psycopg2.extras.execute_values(curs, insert_query, scraped_servers)

View File

@ -0,0 +1,34 @@
import re
from datetime import datetime
from time import sleep

import psycopg2
import psycopg2.extras
import requests
import yaml
from bs4 import BeautifulSoup

config = yaml.full_load(open("config.yaml").read())
pgconn = psycopg2.connect(dbname=config["dbname"], user=config["dbuser"], password=config["dbpassword"], host=config["dbhost"])

current_servers = []
with pgconn:
    with pgconn.cursor() as curs:
        curs.execute("select ip, port from proxies where proxy_type='vpngate'")
        current_servers = curs.fetchall()

proxies = []
for i in range(1, 3):
    r = requests.get(f"https://freevpn.gg/?p={i}", headers={"User-Agent": config["user_agent"]})
    data = BeautifulSoup(r.text, features="html5lib")
    # Each listing is a "card-box": the <h3> holds the IP, the card text mentions the TCP port.
    for ip_data in data.find_all(class_="card-box"):
        ip = ip_data.find("h3").text
        port = re.search(r"TCP\((\d+)\)", ip_data.find(class_="card-text").text)
        if port and (ip, int(port[1])) not in current_servers:
            proxies.append((ip, port[1], ip, "unscanned", "vpngate", datetime.now(), "vpngate proxies freevpn"))
    # Be polite between page fetches.
    sleep(5)

with pgconn:
    with pgconn.cursor() as curs:
        insert_query = """
            insert into proxies (ip, port, exit_ip, status, proxy_type, submitted_at, comment)
            values %s"""
        psycopg2.extras.execute_values(curs, insert_query, proxies)

scripts/requirements.txt Normal file (+6)
View File

@ -0,0 +1,6 @@
bs4
html5lib
psycopg2
pycurl
pyyaml
requests