this is a huge refactorisation so gemtext/html renderers are now handled by each GeminiItem and not by the GeminiClient. Expect bugs

This commit is contained in:
Lionel Dricot 2022-01-17 23:37:12 +01:00
parent 76c2d9a491
commit e40c0439b2
1 changed files with 286 additions and 240 deletions

View File

@ -155,6 +155,192 @@ standard_ports = {
"https" : 443,
}
# First, we define the gemtext and html renderers, outside of the rest
# (They could later be factorized in other files or replaced)
# Gemtext Rendering Engine
# this method takes the original gemtext and returns
# [rendered_text,links_table]
def render_gemtext(gemtext, width=80):
links = []
preformatted = False
rendered_text = ""
#This local method takes a line and apply the ansi code given as "color"
#The whole line is then wrapped and ansi code are ended.
def wrap_line(line,color=None,i_indent="",s_indent=""):
wrapped = textwrap.wrap(line,width,initial_indent=i_indent,\
subsequent_indent=s_indent)
final = ""
for l in wrapped:
if color:
l = color + l + "\x1b[0m"
if l.strip() != "":
final += l + "\n"
return final
def format_link(url,index,name=None):
if "://" in url:
protocol,adress = url.split("://")
protocol = " %s" %protocol
else:
adress = url
protocol = ""
if "gemini" in protocol:
protocol = ""
if not name:
name = adress
line = "[%d%s] %s" % (index, protocol, name)
return line
for line in gemtext.splitlines():
if line.startswith("```"):
preformatted = not preformatted
elif preformatted:
rendered_text = line + "\n"
elif line.startswith("=>"):
strippedline = line[2:].strip()
if strippedline:
links.append(strippedline)
splitted = strippedline.split(maxsplit=1)
url = splitted[0]
name = None
if len(splitted) > 1:
name = splitted[1]
link = format_link(url,len(links),name=name)
startpos = link.find("] ") + 2
wrapped = wrap_line(link,s_indent=startpos*" ")
rendered_text += wrapped
elif line.startswith("* "):
line = line[1:].lstrip("\t ")
rendered_text += textwrap.fill(line, width, initial_indent = "",
subsequent_indent=" ") + "\n"
elif line.startswith(">"):
line = line[1:].lstrip("\t ")
rendered_text += textwrap.fill(line,width, initial_indent = "> ",
subsequent_indent="> ") + "\n"
elif line.startswith("###"):
line = line[3:].lstrip("\t ")
rendered_text += wrap_line(line, color="\x1b[34m\x1b[2m")
elif line.startswith("##"):
line = line[2:].lstrip("\t ")
rendered_text += wrap_line(line, color="\x1b[34m")
elif line.startswith("#"):
line = line[1:].lstrip("\t ")
rendered_text += wrap_line(line,color="\x1b[1;34m\x1b[4m")
else:
rendered_text += wrap_line(line).rstrip() + "\n"
return rendered_text, links
# Our own HTML engine (crazy, isnt it?)
def render_html(body,width=80):
if not _DO_HTML:
print("HTML document detected. Please install python-bs4 and python readability.")
return
# This method recursively parse the HTML
r_body = ""
links = []
def recursive_render(element,indent=""):
rendered_body = ""
#print("rendering %s - %s with indent %s" %(element.name,element.string,indent))
if element.name == "blockquote":
for child in element.children:
rendered_body += recursive_render(child,indent="\t").rstrip("\t")
elif element.name == "div":
rendered_body += "\n"
for child in element.children:
rendered_body += recursive_render(child,indent=indent)
elif element.name in ["h1","h2","h3","h4","h5","h6"]:
line = element.get_text()
if element.name in ["h1","h2"]:
rendered_body += "\n"+"\x1b[1;34m\x1b[4m" + line + "\x1b[0m"+"\n"
elif element.name in ["h3","h4"]:
rendered_body += "\n" + "\x1b[34m" + line + "\x1b[0m" + "\n"
else:
rendered_body += "\n" + "\x1b[34m\x1b[2m" + line + "\x1b[0m" + "\n"
elif element.name == "pre":
rendered_body += "\n"
for child in element.children:
rendered_body += recursive_render(child,indent=indent)
rendered_body += "\n\n"
elif element.name == "li":
line = ""
for child in element.children:
line += recursive_render(child,indent=indent).strip("\n")
#print("in li: ***%s***"%line)
rendered_body += " * " + line.strip() + "\n"
elif element.name in ["code","em","b","i"]:
# we dont do anything with those markup right now. Maybe later?
for child in element.children:
rendered_body += recursive_render(child,indent=indent).strip("\n")
elif element.name == "p":
temp_str = ""
for child in element.children:
temp_str += recursive_render(child,indent=indent)
rendered_body = temp_str + "\n\n"
elif element.name == "a":
text = element.get_text().strip()
link = element.get('href')
if link:
links.append(link)
link_id = " [%s] "%(len(links))
rendered_body = "\x1b[34m\x1b[2m " + text + link_id + "\x1b[0m"
else:
#No real link found
rendered_body = text
elif element.name == "br":
rendered_body = "\n"
elif element.string:
#print("tag without children:",element.name)
#print("string : **%s** "%element.string.strip())
#print("########")
rendered_body = element.string.strip("\n").strip("\t")
else:
#print("tag children:",element.name)
for child in element.children:
rendered_body += recursive_render(child,indent=indent)
#print("body for element %s: %s"%(element.name,rendered_body))
return indent + rendered_body
# the real render_html hearth
readable = Document(body)
title = readable.short_title()
summary = readable.summary()
r_body += "\x1b[1;34m\x1b[4m" + title + "\x1b[0m""\n"
soup = BeautifulSoup(summary, 'html.parser')
rendered_body = ""
if soup and soup.body :
for el in soup.body.contents:
rendered_body += recursive_render(el)
paragraphs = rendered_body.split("\n\n")
for par in paragraphs:
lines = par.splitlines()
for line in lines:
if line.startswith("\t"):
i_indent = " "
s_indent = i_indent
line = line.strip("\t")
elif line.startswith(" * "):
i_indent = "" # we keep the initial bullet)
s_indent = " "
else:
i_indent = ""
s_indent = i_indent
if line.strip() != "":
wrapped = textwrap.fill(line,width,initial_indent=i_indent,
subsequent_indent=s_indent)
wrapped += "\n"
else:
wrapped = ""
r_body += wrapped
r_body += "\n"
return r_body,links
# Offpunk is organized as following:
# - a GeminiClient instance which handles the browsing of GeminiItem.
# - Theres only one GeminiClient. Each page is a GeminiItem (name is historical, as
# it could be non-gemini content)
# - A GeminiItem is created with an URL from which it will derives content.
# - Content include : a title, a body, an ansi-rendered-body and a list of links.
# - Each GeminiItem generates a "cache_path" in which it maintains a cached version of content.
class GeminiItem():
def __init__(self, url, name=""):
@ -163,6 +349,8 @@ class GeminiItem():
self.url = fix_ipv6_url(url).strip()
self.name = name
self.mime = None
self.renderer = None
self.links = None
parsed = urllib.parse.urlparse(self.url)
if "./" in url or url[0] == "/":
self.scheme = "localhost"
@ -201,7 +389,6 @@ class GeminiItem():
self.url += "/"
if self._cache_path.endswith("/"):
self._cache_path += index
self.port = parsed.port or standard_ports.get(self.scheme, 0)
def get_title(self):
@ -279,6 +466,54 @@ class GeminiItem():
print("ERROR: NOCACHE for %s" %self._cache_path)
return FIXME
#def set_renderer(self,renderer):
# self.renderer = renderer
def get_links(self):
if self.links:
return self.links
self.links = []
r_body,links = self.renderer(self.get_body())
for l in links:
#split between link and potential name
splitted = l.split(maxsplit=1)
url = self.absolutise_url(splitted[0])
if looks_like_url(url):
if len(splitted) > 1:
newgi = GeminiItem(url,splitted[1])
else:
newgi = GeminiItem(url)
self.links.append(newgi)
return self.links
# Red title above rendered content
def _make_terminal_title(self):
title = self.get_title()
#FIXME : how do I know that Im offline_only ?
if self.is_cache_valid(): #and self.offline_only and not self.local:
last_modification = self.cache_last_modified()
str_last = time.ctime(last_modification)
title += " \x1b[0;31m(last accessed on %s)"%str_last
rendered_title = "\x1b[31m\x1b[1m"+ title + "\x1b[0m"
#FIXME:width wrapped = textwrap.fill(rendered_title,self.options["width"])
wrapped = textwrap.fill(rendered_title,80)
return wrapped + "\n"
def get_rendered_body(self):
if not self.renderer:
mime = self.get_mime()
if mime == "text/gemini":
self.renderer = render_gemtext
elif mime == "text/html":
self.renderer = render_html
if self.renderer:
body = self.get_body()
r_body, links = self.renderer(body)
to_return = self._make_terminal_title() + r_body
return to_return
else:
return None
def get_filename(self):
filename = os.path.basename(self._cache_path)
return filename
@ -311,12 +546,16 @@ class GeminiItem():
def get_mime(self):
if self.is_cache_valid():
mime,encoding = mimetypes.guess_type(self._cache_path,strict=False)
if self.local:
path = self.path
else:
path = self._cache_path
mime,encoding = mimetypes.guess_type(path,strict=False)
#gmi Mimetype is not recognized yet
if not mime and self._cache_path.endswith(".gmi"):
if not mime and path.endswith(".gmi"):
mime = "text/gemini"
elif not mime and _HAS_MAGIC :
mime = magic.from_file(self._cache_path,mime=True)
mime = magic.from_file(path,mime=True)
elif not _HAS_MAGIC :
print("Cannot guess the mime type of the file. Install Python-magic")
if mime.startswith("text"):
@ -339,6 +578,8 @@ class GeminiItem():
cache.write(str(datetime.datetime.now())+"\n")
cache.write("ERROR while caching %s\n" %self.url)
cache.write(str(err))
cache.write("If you believe this error was temporary, type ""reload"".\n")
cache.write("The ressource will be tentatively fetched during next sync.\n")
cache.write("\n")
cache.close()
@ -556,19 +797,19 @@ However, you can use `set gopher_proxy hostname:port` to tell it about a
Gopher-to-Gemini proxy (such as a running Agena instance), in which case
you'll be able to transparently follow links to Gopherspace!""")
return
elif gi.local:
if os.path.exists(gi.path):
with open(gi.path,'r') as f:
self._handle_gemtext(gi)
self.gi = gi
self._update_history(gi)
return
else:
print("Sorry, file %s does not exist."%gi.path)
return
elif gi.scheme not in ("gemini", "gopher", "http", "https") and not self.sync_only:
print("Sorry, no support for {} links.".format(gi.scheme))
return
#elif gi.local:
# if os.path.exists(gi.path):
# with open(gi.path,'r') as f:
# self._handle_gemtext(gi)
# self.gi = gi
# self._update_history(gi)
# return
# else:
# print("Sorry, file %s does not exist."%gi.path)
# return
#elif gi.scheme not in ("gemini", "gopher", "http", "https") and not self.sync_only:
# print("Sorry, no support for {} links.".format(gi.scheme))
# return
# Obey permanent redirects
if gi.url in self.permanent_redirects:
@ -633,13 +874,25 @@ you'll be able to transparently follow links to Gopherspace!""")
return
# Pass file to handler, unless we were asked not to
#SPECIFICGEMINI :default handler should be provided by the GI.
if gi and handle :
if gi.get_mime() == "text/gemini":
self._handle_gemtext(gi, display=not self.sync_only)
elif gi.get_mime() == "text/html":
self._handle_html(gi,display=not self.sync_only)
elif not self.sync_only :
if gi :
rendered_body = gi.get_rendered_body()
display = handle and not self.sync_only
if rendered_body:
self.index = gi.get_links()
self.lookup = self.index
self.page_index = 0
self.index_index = -1
if display:
# We actually put the body in a tmpfile before giving it to less
if self.idx_filename:
os.unlink(self.idx_filename)
tmpf = tempfile.NamedTemporaryFile("w", encoding="UTF-8", delete=False)
tmpf.write(rendered_body)
tmpf.close()
self.idx_filename = tmpf.name
cmd_str = self._get_handler_cmd("text/gemini")
subprocess.run(shlex.split(cmd_str % tmpf.name))
elif display :
cmd_str = self._get_handler_cmd(gi.get_mime())
try:
# get tmpfile from gi !
@ -1082,210 +1335,7 @@ you'll be able to transparently follow links to Gopherspace!""")
self._debug("Using handler: %s" % cmd_str)
return cmd_str
# Red title above rendered content
def _make_terminal_title(self,gi):
title = gi.get_title()
if gi.is_cache_valid() and self.offline_only and not gi.local:
last_modification = gi.cache_last_modified()
str_last = time.ctime(last_modification)
title += " \x1b[0;31m(last accessed on %s)"%str_last
rendered_title = "\x1b[31m\x1b[1m"+ title + "\x1b[0m"
wrapped = textwrap.fill(rendered_title,self.options["width"])
return wrapped + "\n"
# Our own HTML engine (crazy, isnt it?)
def _handle_html(self,gi,display=True):
if not _DO_HTML:
print("HTML document detected. Please install python-bs4 and python readability.")
return
# This method recursively parse the HTML
def recursive_render(element,indent=""):
rendered_body = ""
#print("rendering %s - %s with indent %s" %(element.name,element.string,indent))
if element.name == "blockquote":
for child in element.children:
rendered_body += recursive_render(child,indent="\t").rstrip("\t")
elif element.name == "div":
rendered_body += "\n"
for child in element.children:
rendered_body += recursive_render(child,indent=indent)
elif element.name in ["h1","h2","h3","h4","h5","h6"]:
line = element.get_text()
if element.name in ["h1","h2"]:
rendered_body += "\n"+"\x1b[1;34m\x1b[4m" + line + "\x1b[0m"+"\n"
elif element.name in ["h3","h4"]:
rendered_body += "\n" + "\x1b[34m" + line + "\x1b[0m" + "\n"
else:
rendered_body += "\n" + "\x1b[34m\x1b[2m" + line + "\x1b[0m" + "\n"
elif element.name == "pre":
rendered_body += "\n"
for child in element.children:
rendered_body += recursive_render(child,indent=indent)
rendered_body += "\n\n"
elif element.name == "li":
line = ""
for child in element.children:
line += recursive_render(child,indent=indent).strip("\n")
#print("in li: ***%s***"%line)
rendered_body += " * " + line.strip() + "\n"
elif element.name in ["code","em","b","i"]:
# we dont do anything with those markup right now. Maybe later?
for child in element.children:
rendered_body += recursive_render(child,indent=indent).strip("\n")
elif element.name == "p":
temp_str = ""
for child in element.children:
temp_str += recursive_render(child,indent=indent)
rendered_body = temp_str + "\n\n"
elif element.name == "a":
text = element.get_text().strip()
link = element.get('href')
if link:
line = "=> " + link + " " +text
link_id = " [%s] "%(len(self.index)+1)
temp_gi = GeminiItem.from_map_line(line, gi)
if temp_gi:
self.index.append(temp_gi)
rendered_body = "\x1b[34m\x1b[2m " + text + link_id + "\x1b[0m"
else:
#No real link found
rendered_body = text
elif element.name == "br":
rendered_body = "\n"
elif element.string:
#print("tag without children:",element.name)
#print("string : **%s** "%element.string.strip())
#print("########")
rendered_body = element.string.strip("\n").strip("\t")
else:
#print("tag children:",element.name)
for child in element.children:
rendered_body += recursive_render(child,indent=indent)
#print("body for element %s: %s"%(element.name,rendered_body))
return indent + rendered_body
# the real _handle_html method
self.index = []
if self.idx_filename:
os.unlink(self.idx_filename)
tmpf = tempfile.NamedTemporaryFile("w", encoding="UTF-8", delete=False)
self.idx_filename = tmpf.name
tmpf.write(self._make_terminal_title(gi))
readable = Document(gi.get_body())
title = readable.short_title()
summary = readable.summary()
tmpf.write("\x1b[1;34m\x1b[4m" + title + "\x1b[0m""\n")
#summary = gi.get_body()
soup = BeautifulSoup(summary, 'html.parser')
rendered_body = ""
if soup and soup.body :
for el in soup.body.contents:
rendered_body += recursive_render(el)
paragraphs = rendered_body.split("\n\n")
for par in paragraphs:
lines = par.splitlines()
for line in lines:
if line.startswith("\t"):
i_indent = " "
s_indent = i_indent
line = line.strip("\t")
elif line.startswith(" * "):
i_indent = "" # we keep the initial bullet)
s_indent = " "
else:
i_indent = ""
s_indent = i_indent
if line.strip() != "":
wrapped = textwrap.fill(line,self.options["width"],\
initial_indent=i_indent,subsequent_indent=s_indent)
wrapped += "\n"
else:
wrapped = ""
tmpf.write(wrapped)
tmpf.write("\n")
tmpf.close()
self.lookup = self.index
self.page_index = 0
self.index_index = -1
if display:
cmd_str = self._get_handler_cmd("text/gemini")
subprocess.call(shlex.split(cmd_str % self.idx_filename))
# Gemtext Rendering Engine
# this method renders the original Gemtext then call the handler to display it.
def _handle_gemtext(self, menu_gi, display=True):
self.index = []
preformatted = False
if self.idx_filename:
os.unlink(self.idx_filename)
# this tempfile will contains a parsed version of the gemtext
# to display it. This is the output, not native gemtext.
tmpf = tempfile.NamedTemporaryFile("w", encoding="UTF-8", delete=False)
self.idx_filename = tmpf.name
tmpf.write(self._make_terminal_title(menu_gi))
#This local method takes a line and apply the ansi code given as "color"
#The whole line is then wrapped and ansi code are ended.
def wrap_line(line,color=None,i_indent="",s_indent=""):
wrapped = textwrap.wrap(line,self.options["width"],\
initial_indent=i_indent,subsequent_indent=s_indent)
final = ""
for l in wrapped:
if color:
l = color + l + "\x1b[0m"
if l.strip() != "":
final += l + "\n"
return final
for line in menu_gi.get_body().splitlines():
if line.startswith("```"):
preformatted = not preformatted
elif preformatted:
tmpf.write(line + "\n")
elif line.startswith("=>"):
try:
gi = GeminiItem.from_map_line(line, menu_gi)
if gi:
self.index.append(gi)
#tmpf.write(self._format_geminiitem(len(self.index), gi) + "\n")
#tentative to wrapp long links. Not sure it worth the trouble
link = self._format_geminiitem(len(self.index), gi)
startpos = link.find("] ") + 2
wrapped = wrap_line(link,s_indent=startpos*" ")
tmpf.write(wrapped)
else:
self._debug("Skipping possible link: %s" % line)
except:
self._debug("Skipping possible link: %s" % line)
elif line.startswith("* "):
line = line[1:].lstrip("\t ")
tmpf.write(textwrap.fill(line, self.options["width"],
initial_indent = "", subsequent_indent=" ") + "\n")
elif line.startswith(">"):
line = line[1:].lstrip("\t ")
tmpf.write(textwrap.fill(line, self.options["width"],
initial_indent = "> ", subsequent_indent="> ") + "\n")
elif line.startswith("###"):
line = line[3:].lstrip("\t ")
#tmpf.write("\x1b[4m" + line + "\x1b[0m""\n")
tmpf.write(wrap_line(line, color="\x1b[34m\x1b[2m"))
elif line.startswith("##"):
line = line[2:].lstrip("\t ")
tmpf.write(wrap_line(line, color="\x1b[34m"))
elif line.startswith("#"):
line = line[1:].lstrip("\t ")
tmpf.write(wrap_line(line,color="\x1b[1;34m\x1b[4m"))
else:
tmpf.write(wrap_line(line).rstrip() + "\n")
tmpf.close()
self.lookup = self.index
self.page_index = 0
self.index_index = -1
if display:
cmd_str = self._get_handler_cmd("text/gemini")
subprocess.call(shlex.split(cmd_str % self.idx_filename))
#TODO:remove format_geminiitem
def _format_geminiitem(self, index, gi, url=False):
protocol = "" if gi.scheme == "gemini" else " %s" % gi.scheme
line = "[%d%s] %s" % (index, protocol, gi.name or gi.url)
@ -1958,18 +2008,14 @@ Bookmarks are stored using the 'add' command."""
if len(args.split()) > 1 or (args and not args.isnumeric()):
print("bookmarks command takes a single integer argument!")
return
with open(bm_file, "r") as fp:
#else:
gi = GeminiItem("localhost:/" + bm_file,"Bookmarks")
gi.body = fp.read()
# We dont display bookmarks if accessing directly one
# or if in sync_only
display = not ( args or self.sync_only)
self._handle_gemtext(gi, display = display)
#self._go_to_gi(gi,handle=display)
if args:
# Use argument as a numeric index
self.default(line)
gi = GeminiItem("localhost:/" + bm_file,"Bookmarks")
# We dont display bookmarks if accessing directly one
# or if in sync_only
display = not ( args or self.sync_only)
self._go_to_gi(gi,handle=display)
if args:
# Use argument as a numeric index
self.default(line)
def list_add_line(self,line,list):
list_path = os.path.join(self.config_dir, "lists/%s.gmi"%list)