123 lines
3.8 KiB
Python
123 lines
3.8 KiB
Python
import string, gem2html, html2text, re, os.path
|
|
import urllib.parse as urlparse
|
|
# Utility function to parse a MIME type
def parse_mime(mimetype):
    """Split a MIME type string into its type, subtype and parameters.

    The type is everything before the first ``/``; the subtype is everything
    between that slash and the next ``;`` (trailing whitespace removed).
    Whatever follows is read as ``;``-separated ``name=value`` parameters.
    Values may be double-quoted, in which case they may contain ``;`` and
    backslash-escaped characters.

    Args:
        mimetype: the raw MIME type, e.g. ``text/gemini; charset=utf-8``.
            Leading and trailing whitespace is ignored.

    Returns:
        A ``([type, subtype], params)`` tuple. ``params`` maps each parameter
        name to its value, or to ``None`` for a parameter given without
        ``=``. Parameters with an empty name are dropped. When a name is
        repeated, the last occurrence wins.
    """
    mimetype = mimetype.strip()
    maintype, _, remainder = mimetype.partition("/")
    subtype, _, raw_params = remainder.partition(";")
    subtype = subtype.rstrip(string.whitespace)

    params = dict()
    pos, end = 0, len(raw_params)
    while pos < end:
        # skip whitespace in front of the parameter name
        while pos < end and raw_params[pos] in string.whitespace:
            pos += 1

        # the parameter name is everything before the = or ;
        name_start = pos
        while pos < end and raw_params[pos] not in "=;":
            pos += 1
        name = raw_params[name_start:pos]

        # if the string is over or there isn't an equals sign, there's no
        # param value; empty names (stray ";") are not recorded at all
        if pos >= end or raw_params[pos] == ";":
            pos += 1
            if name:
                params[name] = None
            continue

        pos += 1  # step over the "="
        # the bounds check matters: the input may end right after the "="
        if pos < end and raw_params[pos] == '"':
            pos += 1
            pieces = []
            while pos < end:
                char = raw_params[pos]
                pos += 1
                if char == '"':
                    break
                if char == "\\" and pos < end:
                    # a backslash escapes the next character; a lone
                    # trailing backslash is kept literally
                    char = raw_params[pos]
                    pos += 1
                pieces.append(char)
            value = "".join(pieces)
            # ignore anything between the closing quote and the next ;
            while pos < end and raw_params[pos] != ";":
                pos += 1
        else:
            value_start = pos
            while pos < end and raw_params[pos] != ";":
                pos += 1
            value = raw_params[value_start:pos].rstrip(string.whitespace)

        # step over the ";" that ended this parameter, so the next pass
        # starts on the following name instead of recording an empty one
        pos += 1
        if name:
            params[name] = value

    return [maintype, subtype], params
|
|
|
|
# A "[link text](link url)" link that fills a whole line. With MULTILINE the
# ^ and $ anchors match at every line boundary of the searched text.
# Group 1 is the link text, group 2 the link target.
LINK_LINE = re.compile(
    r"^\[([^\]]+)\]\(([^)]+)\)$",
    flags=re.MULTILINE,
)

# The same link syntax appearing anywhere inside a line (same two groups).
LINK_INLINE = re.compile(
    r"\[([^\]]+)\]\(([^)]+)\)",
)
|
|
|
|
# Gopher item type chosen from a link target's (lower-cased) file extension.
# Anything not listed here is treated as a menu ("1").
_GOPHER_TYPE_BY_EXT = {
    ".gif": "g",  # g line - GIF image
    ".png": "I",  # I line - non-GIF image
    ".jpg": "I",
    ".jpeg": "I",
    ".tiff": "I",
    ".bmp": ":",  # : line - BMP image
    ".mp3": "<",  # < line - audio file
    ".flac": "<",
    ".aac": "<",
}


# Resolve a link target against the page's base URL
def _resolve_link(baseurl, linkurl):
    base = urlparse.urlsplit(baseurl)
    if base.scheme in urlparse.uses_relative or urlparse.urlsplit(linkurl).scheme:
        return urlparse.urljoin(baseurl, linkurl)
    # urljoin() hands a relative reference back unchanged when the base scheme
    # (e.g. gemini) is not in urllib.parse.uses_relative, so borrow http's
    # resolution rules and then restore the real scheme.
    joined = urlparse.urljoin(base._replace(scheme="http").geturl(), linkurl)
    return urlparse.urlsplit(joined)._replace(scheme=base.scheme).geturl()


# Build the gophermap line for one link
def _gopher_link_line(linkname, linkurl, baseurl, basehost, baseport, gemtexthost):
    absoluteurl = _resolve_link(baseurl, linkurl)
    parsed = urlparse.urlparse(absoluteurl)
    # use only the path so query strings and bare hostnames ("tilde.team")
    # cannot be mistaken for a file extension
    ext = os.path.splitext(parsed.path)[1].lower()
    linktype = _GOPHER_TYPE_BY_EXT.get(ext, "1")
    host, port = basehost, baseport
    if parsed.scheme == "gopher":
        # NOTE(review): the whole URL path is used as the selector, as in the
        # original code; confirm whether a leading item-type character in the
        # path should be split off.
        selector = parsed.path
        host = parsed.hostname
        port = parsed.port or 70
    elif parsed.scheme == "gemini":
        if parsed.hostname == gemtexthost:
            # content hosted alongside this gemtext: same path on the base host
            selector = parsed.path
        else:
            # foreign gemini host: routed through the /x/ selector on the base host
            selector = f"/x/{parsed.scheme}/{parsed.netloc}{parsed.path}"
    else:
        linktype = "h"  # force H link type for external links
        selector = "URL:" + absoluteurl
    return f"{linktype}{linkname}\t{selector}\t{host}\t{port}"


# Convert gemtext to gopher
def gemtext2gopher(gemtext,baseurl="gemini://tilde.team",basehost="tilde.team",baseport=70,gemtexthost="tilde.team"):
    """Convert a gemtext document into gophermap text.

    The gemtext is rendered to HTML with ``gem2html``, flattened to plain
    text with ``html2text``, and then every line consisting solely of a
    ``[text](url)`` link becomes a gopher menu entry. Links that remain
    inside other text are reduced to their link text, and every line without
    a tab is emitted as an informational ``i`` line.

    Args:
        gemtext: the gemtext source to convert.
        baseurl: URL that relative links are resolved against.
        basehost: gopher host used for entries served or proxied locally.
        baseport: gopher port used together with ``basehost``.
        gemtexthost: gemini hostname whose paths map directly onto
            ``basehost``; other gemini hosts go through the ``/x/`` selector.

    Returns:
        The gophermap as one string with lines separated by ``"\\n"``.
        Callers that need CRLF line endings must convert them.
    """
    # step 1, convert gemtext to html
    html = gem2html.gem2html(gemtext)
    # step 2, convert html to plaintext
    h = html2text.HTML2Text()
    h.use_automatic_links = False # only use [link text](link url) format, even if link text and link url are one and the same
    h.feed(html)
    text = h.finish()
    h.close()

    # step 3, convert plaintext to gophermap
    def link_line(match):
        linkname, linkurl = match.groups()
        return _gopher_link_line(linkname, linkurl, baseurl, basehost, baseport, gemtexthost)

    # substitute on the match itself so only whole-line links are rewritten,
    # and keep the result (str methods never modify the string in place)
    text = LINK_LINE.sub(link_line, text)
    text = LINK_INLINE.sub(lambda m: m.group(1), text)

    # now apply dummy i-lines
    out = []
    for line in text.splitlines():
        if "\t" not in line:
            out.append(f"i{line}\t.\tnull.host\t70")
        else:
            out.append(line)
    return "\n".join(out)
|