Add reindexer
This commit is contained in:
parent
3b0aa5b8a4
commit
42150f43ab
|
@ -0,0 +1,175 @@
|
|||
"""Zenit - the Molniya indexer.
|
||||
|
||||
Zenit was a series of military photoreconnaissance satellites launched by the Soviet Union between 1961 and 1994. In keeping with the Soviet spy satellite theme, I chose this name for the indexer."""
|
||||
import json, urllib.parse, traceback, sys, ssl, socket, string
|
||||
from config import *
|
||||
# stolen from AV-98
|
||||
urllib.parse.uses_relative.append("gemini")
|
||||
urllib.parse.uses_netloc.append("gemini")
|
||||
|
||||
# Load URL list
|
||||
# Default orbit: just the main page. It is replaced by the saved list when
# orbit.json can be read and contains a "urls" entry.
URLS = [MAIN_PAGE]
try:
    with open("orbit.json") as f:
        URLS = json.load(f)["urls"]
except IOError as e:  # we can be a bit more outgoing about our errors here
    # Missing or unreadable file: not fatal, fall back to the default list.
    print(f"Error loading orbit.json: {e!r}")
    print("Continuing on anyways with a list containing only the URL of the main page.")
except KeyError:
    # The file parsed as JSON but has no "urls" key.
    print("Malformed orbit.json: no urls list")
    print("Continuing on anyways with a list containing only the URL of the main page.")
except Exception:
    # Anything else (e.g. invalid JSON) is treated as fatal. Catching
    # Exception rather than using a bare except lets KeyboardInterrupt and
    # SystemExit propagate normally instead of being reported as load errors.
    print("Error loading orbit.json (not IOError or KeyError):")
    traceback.print_exc()
    print("Exiting.")
    sys.exit(1)
|
||||
|
||||
# Utility function to parse a MIME type
|
||||
def _parse_quoted_value(text, pos):
    """Read a quoted parameter value whose opening quote has been consumed.

    Args:
        text: The string being scanned.
        pos: Index of the first character after the opening ``"``.

    Returns:
        A ``(value, pos)`` tuple: the unescaped value and the index just past
        the closing quote (or the end of ``text`` if the quote is unterminated).
    """
    chars = []
    end = len(text)
    while pos < end:
        ch = text[pos]
        pos += 1
        if ch == '"':
            break  # closing quote
        if ch == "\\" and pos < end:
            # A backslash escapes the next character, which is taken literally.
            # A trailing backslash with nothing after it is kept as-is.
            ch = text[pos]
            pos += 1
        chars.append(ch)
    return "".join(chars), pos


def parse_mime(mimetype):
    """Split a MIME type string into its type, subtype and parameters.

    Args:
        mimetype: A MIME type such as ``text/gemini; charset=utf-8; lang=en``.
            Leading and trailing whitespace is ignored.

    Returns:
        A ``([type, subtype], params)`` tuple. ``params`` maps each parameter
        name to its value, or to ``None`` for a parameter written without
        ``=``. Quoted values are returned without their quotes and with
        backslash escapes resolved. Parameters with an empty name are ignored.
    """
    mimetype = mimetype.strip()

    # The type is everything before the first "/"; the subtype is everything
    # after it up to the first ";" (if any). Whatever follows that ";" is the
    # parameter list.
    main_type, _, remainder = mimetype.partition("/")
    subtype, _, param_text = remainder.partition(";")

    params = {}
    pos = 0
    end = len(param_text)
    while pos < end:
        # Skip whitespace in front of the parameter name.
        while pos < end and param_text[pos] in string.whitespace:
            pos += 1

        # The parameter name is everything before the next "=" or ";".
        name_start = pos
        while pos < end and param_text[pos] not in "=;":
            pos += 1
        name = param_text[name_start:pos]

        if pos >= end or param_text[pos] == ";":
            # No "=": the parameter has no value. Empty names (";;" or a
            # trailing ";") are skipped rather than stored under "".
            pos += 1
            if name:
                params[name] = None
            continue

        pos += 1  # step over "="
        # The bounds check matters: the string may end right after "=".
        if pos < end and param_text[pos] == '"':
            value, pos = _parse_quoted_value(param_text, pos + 1)
            # Ignore anything between the closing quote and the next ";".
            while pos < end and param_text[pos] != ";":
                pos += 1
        else:
            value_start = pos
            while pos < end and param_text[pos] != ";":
                pos += 1
            value = param_text[value_start:pos]

        if name:
            params[name] = value
        # Consume the ";" that ended this parameter so the next pass starts on
        # the following parameter instead of recording an empty-named one.
        pos += 1

    return [main_type, subtype], params
|
||||
|
||||
# Utility function to grab content from a URL
|
||||
# Context setup courtesy of my own half-baked spartan client
|
||||
def grab_content(url,redirect_num=0):
    """Fetch ``url`` over the Gemini protocol and return ``(content, mime)``.

    Args:
        url: The gemini:// URL to request.
        redirect_num: Number of redirects already followed; callers normally
            leave this at its default.

    Returns:
        A ``(content, mime)`` tuple:

        * 2x with a ``text/*`` type: the decoded body (``str``) and the raw
          meta line.
        * 2x with any other type: the raw body (``bytes``) and the meta line.
        * 3x: the result of following the redirect (at most 5 hops, after
          which ``("Too many redirects!", "text/plain")`` is returned).
        * Any other status, or a malformed response header: the header text
          and ``"text/plain"``.
        * Connection failure: ``(b'', 'application/octet-stream')``.

    Raises:
        AssertionError: if the meta field is 1024 characters or longer.
    """
    if redirect_num>=5:
        return "Too many redirects!","text/plain"
    parsed = urllib.parse.urlparse(url)

    # Build the TLS context once and cache it as the module global ``ctx``.
    if "ctx" not in globals():
        ctx = ssl.create_default_context()
        # NOTE(review): certificate verification is switched off here, so the
        # peer is not authenticated. Presumably this is to accept self-signed
        # capsule certificates -- confirm this is intended.
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE
        globals()["ctx"]=ctx
    else:
        ctx = globals()["ctx"]

    chunks = []
    with socket.socket(socket.AF_INET,socket.SOCK_STREAM) as s:
        # The TLS socket is a separate object from ``s`` and has to be closed
        # in its own right, hence the nested ``with``.
        with ctx.wrap_socket(s,server_hostname=parsed.hostname) as ss:
            try:
                ss.connect((parsed.hostname,parsed.port or 1965))
            except OSError:
                # Refused connections, DNS failures and TLS handshake errors
                # are all OSError subclasses; treat every one of them as
                # "nothing usable at this URL".
                return b'', 'application/octet-stream'
            # sendall() keeps writing until the whole request line is sent;
            # send() may stop after a partial write.
            ss.sendall((url.strip()+"\r\n").encode("UTF-8"))
            while (data:=ss.recv(2048)):
                chunks.append(data)
    out = b"".join(chunks)

    # The response is "<status> <meta>\r\n" followed by the body.
    header, sep, content = out.partition(b"\r\n")
    fields = header.decode("utf-8").split(None,1) if sep else []
    if len(fields)!=2:
        # Empty reply, missing CRLF or missing meta: report it the same way as
        # an error status instead of crashing on the unpack.
        return header.decode("utf-8",errors="replace"), "text/plain"
    status, meta = fields
    assert len(meta)<1024

    if status[0]=="2":
        types, params = parse_mime(meta)
        if types[0]=="text":
            # Assume UTF-8 unless a usable charset is given. A charset
            # parameter with no value (None or "") also falls back to UTF-8.
            charset = params.get("charset") or "utf-8"
            return content.decode(charset), meta
        # if it's not a text result, just return the content
        return content, meta
    if status[0]=="3":
        # Follow the redirect. The target may be relative, so resolve it
        # against the URL we just requested. This relies on "gemini" having
        # been registered in urllib.parse.uses_relative/uses_netloc.
        target = urllib.parse.urljoin(url.strip(),meta.strip())
        return grab_content(target,redirect_num+1)
    # Either:
    # 1x - it wants an input, which we have no agency to give
    # 6x - it wants a client cert, which we have no agency to give
    # 4x or 5x - there's an error
    # Return the header with a mimetype of text/plain. If this were a real
    # library I might throw an error here, but this is just to make Zenit work.
    return header.decode("utf-8"), "text/plain"
|
||||
|
||||
# Capsules that already have a page in the orbit; each capsule gets one entry.
CAPSULES_IN_ORBIT = set()
import copy
links = copy.deepcopy(URLS)[1:] # skip main link

# Must exist before the loop: if no link is re-added the flag would otherwise
# be undefined when it is tested below.
modified_orbit = False

for link in links:
    URLS.remove(link) # assume link doesn't belong
    # Removing a link changes the orbit whether or not it is re-added below,
    # so the result is saved whenever at least one link was examined.
    modified_orbit = True

    # Things to consider for a new link:
    # Does its capsule already have representation in the orbit?
    capsule = determine_capsule(urllib.parse.urlparse(link))
    if capsule in CAPSULES_IN_ORBIT:
        print(f"Skipping {link} (capsule already in orbit)...")
        continue

    # Does it link to any of the required links?
    # Explicit checks rather than assert statements: asserts are removed when
    # Python runs with -O, which would let every link through.
    response, mime = grab_content(link)
    if not mime.startswith("text/gemini"):
        print(f"Skipping {link} ({mime} response isn't text/gemini and therefore can't link back)...")
        continue

    links_to_orbit = False
    for line in response.splitlines():
        if not line.startswith("=>"):
            continue
        # Drop the "=>" marker before splitting so that "=>url" (no space
        # after the arrow) is handled and a bare "=>" yields no target.
        target = line[2:].split(None,1)
        if target and any(target[0].startswith(reqlink) for reqlink in REQUIRED_LINKS):
            links_to_orbit = True
            break
    if not links_to_orbit:
        print(f"Skipping {link} (doesn't link back to orbit)...")
        continue

    # If we haven't continue'd by now, the link meets all of the criteria
    print(f"Adding {link} to the orbit...")
    URLS.append(link)
    CAPSULES_IN_ORBIT.add(capsule)

if modified_orbit:
    print("Saving modified orbit...")
    with open("orbit.json","w") as f:
        json.dump(dict(urls=URLS),f)
|
Loading…
Reference in New Issue