add scripts and fix config
This commit is contained in:
parent
4309c31f73
commit
23e57dfbdf
|
@ -2,9 +2,9 @@
|
|||
|
||||
A Python webapp that "herds" a list of open proxies, with an API for external programs to add and update entries.
|
||||
|
||||
This is only the webapp, [openproxyherder-extras](https://git.sr.ht/~emerson/openproxyherder-extras) has some example Python scripts that might be useful to you.
|
||||
`scripts/` contains some example scripts that might be useful.
|
||||
|
||||
## Setup
|
||||
## Setup openproxyherder
|
||||
|
||||
This is tested with Python 3.8 and 3.9, but should work with Python 3.5 and above.
|
||||
|
||||
|
@ -32,3 +32,7 @@ Copy `config.example.yaml` to `config.yaml`, edit `config.yaml` as needed.
|
|||
Install the packages in `requirements.txt` (probably a good idea to create a venv).
|
||||
|
||||
Run `python openproxyherder.py`
|
||||
|
||||
## Scripts
|
||||
|
||||
See the README in `scripts/` for information about the scripts.
|
|
@ -3,6 +3,6 @@ db_port: 5432
|
|||
db_user: "test"
|
||||
db_name: "openproxyherder"
|
||||
db_password: "test"
|
||||
host: "127.0.0.1"
|
||||
port: 8080
|
||||
listen_host: "127.0.0.1"
|
||||
listen_port: 8080
|
||||
access_log: True
|
||||
|
|
|
@ -0,0 +1,23 @@
|
|||
Some of the proxy scrapers and checkers that can be used with the openproxyherder.
|
||||
|
||||
Works with any recent Python 3. Create a venv, install the packages in `requirements.txt`, copy `config.example.yaml` to `config.yaml`, and edit the values as needed.
|
||||
|
||||
These are not shining examples of code, they're quick scripts that work (unless they don't work), meant to show how to interact with OPH.
|
||||
|
||||
No, I'm not going to give you my list of proxy sites. Do your own research.
|
||||
|
||||
Also be a good citizen of the internet.
|
||||
|
||||
## Checkers
|
||||
Scripts that check IPs listed in OPH.
|
||||
|
||||
### http_socks.py
|
||||
Checks HTTP/SOCKS(4|5) proxies. It attempts to connect to a webpage that consists solely of [connecting ip address] [string defined in `scrapers/config.yaml`]. The one in the example config is a public one you are welcome to use. If you want to use your own, nginx ssi is the easiest way, `<!--#echo var="REMOTE_ADDR" --> yourproxystring`.
|
||||
|
||||
### vpngate.py
|
||||
Checks vpngate proxies. Not much else to say here. VPNGate servers are technically VPNs rather than proxies, but locking a door and leaving the key in the lock makes it effectively open — and that is what VPNGate does.
|
||||
|
||||
## Gatherers
|
||||
Scripts that get proxies from various places.
|
||||
|
||||
Note that as of now, some of these dump directly into the database rather than use the OPH API, for historical reasons when OPH didn't use asyncpg and it would lock up when adding hundreds of proxies. This will probably change eventually.
|
|
@ -0,0 +1,25 @@
|
|||
dbname: openproxyherder
|
||||
dbuser: openproxyherder
|
||||
dbpassword: openproxyherder
|
||||
dbhost: "127.0.0.1"
|
||||
|
||||
# Fake UA for sites that actually check it
|
||||
user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:90.0) Gecko/20100101 Firefox/90.0"
|
||||
|
||||
# for regex_scraper.py
|
||||
regex_sites:
|
||||
# this key will go into the comment fields
|
||||
"aliveproxy.com":
|
||||
# Type of proxy, this goes directly into `proxy_type`. 'dynamic' will pull the type from the proxy_type named capture group
|
||||
proxy_type: 'dynamic'
|
||||
# if `replace` is specified, this is a `str.format` template: each `replace` value is substituted into `{}` and every resulting URL is fetched.
|
||||
url: "http://aliveproxy.com/proxy-list-port-{}"
|
||||
# if proxy_type is 'dynamic', the regex's `proxy_type` named capture group determines the type of each matched proxy
|
||||
regex: '(?P<proxy_type>[^:]+)://(?P<ip>[\d\.]+):(?P<port>\d+)'
|
||||
# optional, nice to have for sites with multiple pages of proxies
|
||||
replace:
|
||||
- "80"
|
||||
- "81"
|
||||
- "3128"
|
||||
- "8000"
|
||||
- "8080"
|
|
@ -0,0 +1,56 @@
|
|||
import requests
|
||||
import yaml
|
||||
import re
|
||||
from datetime import datetime
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
|
||||
# Load scraper settings (db credentials, user agent, per-site regex definitions).
# `with` closes the file handle; the original bare open() leaked it.
with open("config.yaml") as config_file:
    config = yaml.full_load(config_file)

# One connection reused for the whole run; committed via `with conn` below.
conn = psycopg2.connect(
    dbname=config["dbname"],
    user=config["dbuser"],
    password=config["dbpassword"],
    host=config["dbhost"],
)

# (ip, port, proxy_type, submitted_at, comment) tuples collected by check_site().
# NOTE(review): the previous `raw_proxies` list was never referenced and was removed.
scraped_proxies = []
|
||||
|
||||
def check_site(site, data):
    """Fetch the configured page(s) for one site and collect the proxies found.

    Appends (ip, port, proxy_type, submitted_at, comment) tuples to the
    module-level `scraped_proxies` list.

    site: key from config["regex_sites"]; used in the comment field.
    data: per-site dict with `url`, `regex`, `proxy_type`, and an optional
          `replace` list of values substituted into `url` via str.format.
    """
    text = ""
    print(site)
    ua = {"User-Agent": config["user_agent"]}
    # One URL per `replace` value, or just the plain URL — a single fetch loop
    # replaces the duplicated try/except in both branches of the original.
    if data.get("replace"):
        urls = [data["url"].format(sub) for sub in data["replace"]]
    else:
        urls = [data["url"]]
    for url in urls:
        try:
            r = requests.get(url, headers=ua)
            text += r.text
        except requests.RequestException:
            # Narrowed from a bare `except:` so Ctrl-C and real bugs still propagate.
            print(f"{url} had an error")

    # Compile once and reuse; MULTILINE/DOTALL let site regexes span lines.
    pattern = re.compile(data["regex"], re.MULTILINE | re.DOTALL)
    for proxy in pattern.finditer(text):
        ip = proxy.group("ip")
        port = int(proxy.group("port"))
        if data["proxy_type"] == "dynamic":
            # 'dynamic' pulls the type from the regex's proxy_type capture group.
            proxy_type = proxy.group("proxy_type")
        else:
            proxy_type = data["proxy_type"]
        scraped_proxies.append((ip, port, proxy_type, datetime.now(), f"From {site}"))
|
||||
|
||||
# Scrape every configured site, then insert only proxies not already in the DB.
with conn:
    with conn.cursor() as curs:
        for site, data in config["regex_sites"].items():
            check_site(site, data)
        # Plain comprehension instead of the original side-effecting
        # `[scraped_ips.append(p[0]) for p in scraped_proxies]`.
        scraped_ips = [p[0] for p in scraped_proxies]
        curs.execute("select ip, port from proxies where host(ip)=ANY(%s)", (scraped_ips,))
        # A set gives O(1) membership tests (the original list made this loop
        # O(n^2)) and also dedupes (ip, port) pairs within this batch.
        current_proxies = set(curs.fetchall())
        submitted_proxies = []
        for scraped_proxy in scraped_proxies:
            ip_port = (scraped_proxy[0], scraped_proxy[1])
            if ip_port not in current_proxies:
                submitted_proxies.append(scraped_proxy)
                current_proxies.add(ip_port)
        query = "insert into proxies (ip, port, proxy_type, submitted_at, comment) values %s"
        psycopg2.extras.execute_values(curs, query, submitted_proxies)
        print(f"submitted {len(submitted_proxies)} of {len(scraped_proxies)}")
|
|
@ -0,0 +1,38 @@
|
|||
import requests
|
||||
import re
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
from datetime import datetime
|
||||
from base64 import b64decode
|
||||
import yaml
|
||||
|
||||
# Load config and connect; credentials live in config.yaml (see config.example.yaml).
# `with` closes the file handle; the original open().read() leaked it.
with open("config.yaml") as config_file:
    config = yaml.full_load(config_file)
pgconn = psycopg2.connect(dbname=config["dbname"], user=config["dbuser"], password=config["dbpassword"], host=config["dbhost"])

# vpngate's CSV-ish API: two header lines, then one server per line.
r = requests.get("http://www.vpngate.net/api/iphone/", headers={"User-Agent": config["user_agent"]})
csvfile = r.text.split("\r\n")

scraped_servers = []
with pgconn:
    with pgconn.cursor() as curs:
        curs.execute("select ip, port from proxies where proxy_type='vpngate'")
        # Set for O(1) dedup against servers already in the DB.
        current_servers = set(curs.fetchall())

# OpenVPN "remote <host> <port>" directive inside the base64 config blob.
# Raw string fixes the invalid escape sequences (\S, \d) in the original.
remote_re = re.compile(r"remote (\S+) (\d+)")

for server in csvfile[2:]:
    try:
        hostname, ip, score, ping, speed, clong, cshort, numsessions, uptime, totalusers, totaltraffic, logtype, operator, message, configdata = server.split(",")
    except ValueError:
        # Wrong field count: header, trailer, or malformed row — skip it.
        # (Narrowed from a bare `except:` so Ctrl-C still propagates.)
        continue
    configdataplain = b64decode(configdata)
    remote = remote_re.search(configdataplain.decode())
    if remote:
        host = remote[1]
        port = int(remote[2])
        if (host, port) not in current_servers:
            scraped_servers.append((host, port, host, "vpngate", datetime.now(), "vpngate proxies api"))
            # Also dedupes repeated entries within this API response.
            current_servers.add((host, port))

with pgconn:
    with pgconn.cursor() as curs:
        insert_query = """
        insert into proxies (ip, port, exit_ip, proxy_type, submitted_at, comment)
        values %s"""
        psycopg2.extras.execute_values(curs, insert_query, scraped_servers)
|
|
@ -0,0 +1,34 @@
|
|||
import psycopg2
|
||||
import psycopg2.extras
|
||||
import requests
|
||||
import re
|
||||
import yaml
|
||||
from bs4 import BeautifulSoup
|
||||
from datetime import datetime
|
||||
from time import sleep
|
||||
|
||||
# Load config and connect; credentials live in config.yaml (see config.example.yaml).
# `with` closes the file handle; the original open().read() leaked it.
with open("config.yaml") as config_file:
    config = yaml.full_load(config_file)
pgconn = psycopg2.connect(dbname=config["dbname"], user=config["dbuser"], password=config["dbpassword"], host=config["dbhost"])

with pgconn:
    with pgconn.cursor() as curs:
        curs.execute("select ip, port from proxies where proxy_type='vpngate'")
        # Set for O(1) membership tests against servers already in the DB.
        current_servers = set(curs.fetchall())

# Matches e.g. "TCP(1234)" in the card text. Raw string fixes the invalid
# escape sequences (\(, \d) in the original pattern.
port_re = re.compile(r"TCP\((\d+)\)")

proxies = []
# Pages 1 and 2 of the freevpn.gg listing.
for i in range(1, 3):
    r = requests.get(f"https://freevpn.gg/?p={i}", headers={"User-Agent": config["user_agent"]})
    data = BeautifulSoup(r.text, features="html5lib")
    for ip_data in data.find_all(class_="card-box"):
        ip = ip_data.find("h3").text
        port = port_re.search(ip_data.find(class_="card-text").text)
        if port and (ip, int(port[1])) not in current_servers:
            # int(port[1]) so the inserted value matches the int compared above
            # (the original inserted the raw string, mismatching the dedup check).
            proxies.append((ip, int(port[1]), ip, "unscanned", "vpngate", datetime.now(), "vpngate proxies freevpn"))
    # Be polite between page fetches.
    sleep(5)

with pgconn:
    with pgconn.cursor() as curs:
        insert_query = """
        insert into proxies (ip, port, exit_ip, status, proxy_type, submitted_at, comment)
        values %s"""
        psycopg2.extras.execute_values(curs, insert_query, proxies)
|
|
@ -0,0 +1,6 @@
|
|||
bs4
|
||||
html5lib
|
||||
psycopg2
|
||||
pycurl
|
||||
pyyaml
|
||||
requests
|
Loading…
Reference in New Issue