this is a huge refactorisation so gemtext/html renderers are now handled by each GeminiItem and not by the GeminiClient. Expect bugs

2022-01-17 23:37:12 +01:00 · 2022-01-17 23:37:12 +01:00 · e40c0439b2
parent 76c2d9a491
commit e40c0439b2
1 changed files with 286 additions and 240 deletions
--- a/offpunk.py
+++ b/offpunk.py
@ -155,6 +155,192 @@ standard_ports = {
        "https" : 443,
 }

+# First, we define the gemtext and html renderers, outside of the rest
+# (They could later be factorized in other files or replaced)
+
+# Gemtext Rendering Engine
+# this method takes the original gemtext and returns
+# [rendered_text,links_table]
+def render_gemtext(gemtext, width=80):
+    links = []
+    preformatted = False
+    rendered_text = ""
+    #This local method takes a line and apply the ansi code given as "color"
+    #The whole line is then wrapped and ansi code are ended.
+    def wrap_line(line,color=None,i_indent="",s_indent=""):
+        wrapped = textwrap.wrap(line,width,initial_indent=i_indent,\
+                                subsequent_indent=s_indent)
+        final = ""
+        for l in wrapped:
+            if color:
+                l = color + l + "\x1b[0m"
+            if l.strip() != "":
+                final += l + "\n"
+        return final
+    def format_link(url,index,name=None):
+        if "://" in url:
+            protocol,adress = url.split("://")
+            protocol = " %s" %protocol
+        else:
+            adress = url
+            protocol = ""
+        if "gemini" in protocol:
+            protocol = ""
+        if not name:
+            name = adress
+        line = "[%d%s] %s" % (index, protocol, name)
+        return line
+    for line in gemtext.splitlines():
+        if line.startswith("```"):
+            preformatted = not preformatted
+        elif preformatted:
+            rendered_text = line + "\n"
+        elif line.startswith("=>"):
+            strippedline = line[2:].strip()
+            if strippedline:
+                links.append(strippedline)        
+                splitted = strippedline.split(maxsplit=1)
+                url = splitted[0]
+                name = None
+                if len(splitted) > 1:
+                    name = splitted[1]
+                link = format_link(url,len(links),name=name)
+                startpos = link.find("] ") + 2
+                wrapped = wrap_line(link,s_indent=startpos*" ")
+                rendered_text += wrapped
+        elif line.startswith("* "):
+            line = line[1:].lstrip("\t ")
+            rendered_text += textwrap.fill(line, width, initial_indent = "• ", 
+                                            subsequent_indent="  ") + "\n"
+        elif line.startswith(">"):
+            line = line[1:].lstrip("\t ")
+            rendered_text += textwrap.fill(line,width, initial_indent = "> ", 
+                                            subsequent_indent="> ") + "\n"
+        elif line.startswith("###"):
+            line = line[3:].lstrip("\t ")
+            rendered_text += wrap_line(line, color="\x1b[34m\x1b[2m")
+        elif line.startswith("##"):
+            line = line[2:].lstrip("\t ")
+            rendered_text += wrap_line(line, color="\x1b[34m")
+        elif line.startswith("#"):
+            line = line[1:].lstrip("\t ")
+            rendered_text += wrap_line(line,color="\x1b[1;34m\x1b[4m")
+        else:
+            rendered_text += wrap_line(line).rstrip() + "\n"
+    return rendered_text, links
+
+# Our own HTML engine (crazy, isn’t it?)
+def render_html(body,width=80):
+    if not _DO_HTML:
+        print("HTML document detected. Please install python-bs4 and python readability.")
+        return
+    # This method recursively parse the HTML
+    r_body = ""
+    links = []
+    def recursive_render(element,indent=""):
+        rendered_body = ""
+        #print("rendering %s - %s with indent %s" %(element.name,element.string,indent))
+        if element.name == "blockquote":
+            for child in element.children:
+                rendered_body +=  recursive_render(child,indent="\t").rstrip("\t")
+        elif element.name == "div":
+            rendered_body += "\n"
+            for child in element.children:
+                rendered_body += recursive_render(child,indent=indent)
+        elif element.name in ["h1","h2","h3","h4","h5","h6"]:
+            line = element.get_text()
+            if element.name in ["h1","h2"]:
+                rendered_body += "\n"+"\x1b[1;34m\x1b[4m" + line + "\x1b[0m"+"\n"
+            elif element.name in ["h3","h4"]:
+                rendered_body += "\n" + "\x1b[34m" + line + "\x1b[0m" + "\n"
+            else:
+                rendered_body += "\n" + "\x1b[34m\x1b[2m" + line + "\x1b[0m" + "\n"
+        elif element.name == "pre":
+            rendered_body += "\n"
+            for child in element.children:
+               rendered_body += recursive_render(child,indent=indent)
+            rendered_body += "\n\n"
+        elif element.name == "li":
+            line = ""
+            for child in element.children:
+                line += recursive_render(child,indent=indent).strip("\n")
+                #print("in li: ***%s***"%line)
+            rendered_body += " * " + line.strip() + "\n"
+        elif element.name in ["code","em","b","i"]:
+            # we don’t do anything with those markup right now. Maybe later?
+            for child in element.children:
+                rendered_body += recursive_render(child,indent=indent).strip("\n")
+        elif element.name == "p":
+            temp_str = ""
+            for child in element.children:
+                temp_str += recursive_render(child,indent=indent)
+            rendered_body = temp_str + "\n\n"
+        elif element.name == "a":
+            text = element.get_text().strip()
+            link = element.get('href')
+            if link:
+                links.append(link)
+                link_id = " [%s] "%(len(links))
+                rendered_body = "\x1b[34m\x1b[2m " + text + link_id + "\x1b[0m"
+            else:
+                #No real link found
+                rendered_body = text
+        elif element.name == "br":
+            rendered_body = "\n"
+        elif element.string:
+            #print("tag without children:",element.name)
+            #print("string : **%s** "%element.string.strip())
+            #print("########")
+            rendered_body = element.string.strip("\n").strip("\t")
+        else:
+            #print("tag children:",element.name)
+            for child in element.children:
+                rendered_body += recursive_render(child,indent=indent)
+        #print("body for element %s: %s"%(element.name,rendered_body))
+        return indent + rendered_body
+
+    # the real render_html hearth
+    readable = Document(body)
+    title = readable.short_title()
+    summary = readable.summary()
+    r_body += "\x1b[1;34m\x1b[4m" + title + "\x1b[0m""\n"
+    soup = BeautifulSoup(summary, 'html.parser')
+    rendered_body = ""
+    if soup and soup.body :
+        for el in soup.body.contents:
+            rendered_body += recursive_render(el)
+        paragraphs = rendered_body.split("\n\n")
+        for par in paragraphs:
+            lines = par.splitlines()
+            for line in lines:
+                if line.startswith("\t"):
+                    i_indent = "   "
+                    s_indent = i_indent
+                    line = line.strip("\t")
+                elif line.startswith(" * "):
+                    i_indent = ""  # we keep the initial bullet)
+                    s_indent = "   "
+                else:
+                    i_indent = ""
+                    s_indent = i_indent
+                if line.strip() != "":
+                    wrapped = textwrap.fill(line,width,initial_indent=i_indent,
+                                            subsequent_indent=s_indent)
+                    wrapped += "\n"
+                else:
+                    wrapped = ""
+                r_body += wrapped
+            r_body += "\n"
+    return r_body,links
+
+# Offpunk is organized as following:
+# - a GeminiClient instance which handles the browsing of GeminiItem.
+# - There’s only one GeminiClient. Each page is a GeminiItem (name is historical, as
+# it could be non-gemini content)
+# - A GeminiItem is created with an URL from which it will derives content.
+# - Content include : a title, a body, an ansi-rendered-body and a list of links.
+# - Each GeminiItem generates a "cache_path" in which it maintains a cached version of content.
+
 class GeminiItem():

    def __init__(self, url, name=""):
@ -163,6 +349,8 @@ class GeminiItem():
        self.url = fix_ipv6_url(url).strip()
        self.name = name
        self.mime = None
+        self.renderer = None
+        self.links = None
        parsed = urllib.parse.urlparse(self.url)
        if "./" in url or url[0] == "/":
            self.scheme = "localhost"
@ -201,7 +389,6 @@ class GeminiItem():
                    self.url += "/"
            if self._cache_path.endswith("/"):
                self._cache_path += index
-            
            self.port = parsed.port or standard_ports.get(self.scheme, 0)
            
    def get_title(self):
@ -279,6 +466,54 @@ class GeminiItem():
            print("ERROR: NO CACHE for %s" %self._cache_path)
            return FIXME
    
+    #def set_renderer(self,renderer):
+    #    self.renderer = renderer
+    def get_links(self):
+        if self.links:
+            return self.links
+        self.links = []
+        r_body,links = self.renderer(self.get_body())
+        for l in links:
+            #split between link and potential name
+            splitted = l.split(maxsplit=1)
+            url = self.absolutise_url(splitted[0])
+            if looks_like_url(url):
+                if len(splitted) > 1:
+                    newgi = GeminiItem(url,splitted[1])
+                else:
+                    newgi = GeminiItem(url)
+                self.links.append(newgi)
+        return self.links
+
+    # Red title above rendered content
+    def _make_terminal_title(self):
+        title = self.get_title()
+        #FIXME : how do I know that I’m offline_only ?
+        if self.is_cache_valid(): #and self.offline_only and not self.local:
+            last_modification = self.cache_last_modified()
+            str_last = time.ctime(last_modification)
+            title += "    \x1b[0;31m(last accessed on %s)"%str_last
+        rendered_title = "\x1b[31m\x1b[1m"+ title + "\x1b[0m"
+        #FIXME: width wrapped = textwrap.fill(rendered_title,self.options["width"])
+        wrapped = textwrap.fill(rendered_title,80)
+        return wrapped + "\n"
+    
+    def get_rendered_body(self):
+        if not self.renderer:
+            mime = self.get_mime()
+            if mime == "text/gemini":
+                self.renderer = render_gemtext
+            elif mime == "text/html":
+                self.renderer = render_html
+        if self.renderer:
+            body = self.get_body()
+            r_body, links = self.renderer(body)
+            to_return = self._make_terminal_title() + r_body
+            return to_return
+        else:
+            return None
+        
+
    def get_filename(self):
        filename = os.path.basename(self._cache_path)
        return filename
@ -311,12 +546,16 @@ class GeminiItem():

    def get_mime(self):
        if self.is_cache_valid():
-            mime,encoding = mimetypes.guess_type(self._cache_path,strict=False)
+            if self.local:
+                path = self.path
+            else:
+                path = self._cache_path
+            mime,encoding = mimetypes.guess_type(path,strict=False)
            #gmi Mimetype is not recognized yet
-            if not mime and self._cache_path.endswith(".gmi"):
+            if not mime and path.endswith(".gmi"):
                mime = "text/gemini"
            elif not mime and _HAS_MAGIC :
-                mime = magic.from_file(self._cache_path,mime=True)
+                mime = magic.from_file(path,mime=True)
            elif not _HAS_MAGIC :
                print("Cannot guess the mime type of the file. Install Python-magic")
            if mime.startswith("text"):
@ -339,6 +578,8 @@ class GeminiItem():
                    cache.write(str(datetime.datetime.now())+"\n")
                    cache.write("ERROR while caching %s\n" %self.url)
                    cache.write(str(err))
+                    cache.write("If you believe this error was temporary, type ""reload"".\n")
+                    cache.write("The ressource will be tentatively fetched during next sync.\n")
                    cache.write("\n")
                    cache.close()
    
@ -556,19 +797,19 @@ However, you can use `set gopher_proxy hostname:port` to tell it about a
 Gopher-to-Gemini proxy (such as a running Agena instance), in which case
 you'll be able to transparently follow links to Gopherspace!""")
            return
-        elif gi.local:
-            if os.path.exists(gi.path):
-                with open(gi.path,'r') as f:
-                    self._handle_gemtext(gi)
-                    self.gi = gi
-                    self._update_history(gi)
-                return
-            else:
-                print("Sorry, file %s does not exist."%gi.path)
-                return
-        elif gi.scheme not in ("gemini", "gopher", "http", "https") and not self.sync_only:
-            print("Sorry, no support for {} links.".format(gi.scheme))
-            return
+        #elif gi.local:
+        #    if os.path.exists(gi.path):
+        #        with open(gi.path,'r') as f:
+        #            self._handle_gemtext(gi)
+        #            self.gi = gi
+        #            self._update_history(gi)
+        #        return
+        #    else:
+        #        print("Sorry, file %s does not exist."%gi.path)
+        #        return
+        #elif gi.scheme not in ("gemini", "gopher", "http", "https") and not self.sync_only:
+        #    print("Sorry, no support for {} links.".format(gi.scheme))
+        #    return

        # Obey permanent redirects
        if gi.url in self.permanent_redirects:
@ -633,13 +874,25 @@ you'll be able to transparently follow links to Gopherspace!""")
                return

        # Pass file to handler, unless we were asked not to
-        #SPECIFIC GEMINI : default handler should be provided by the GI.
-        if gi and handle :
-            if gi.get_mime() == "text/gemini":
-                self._handle_gemtext(gi, display=not self.sync_only)
-            elif gi.get_mime() == "text/html":
-                self._handle_html(gi,display=not self.sync_only)
-            elif not self.sync_only :
+        if gi :
+            rendered_body = gi.get_rendered_body()
+            display = handle and not self.sync_only
+            if rendered_body:
+                self.index = gi.get_links()
+                self.lookup = self.index
+                self.page_index = 0
+                self.index_index = -1
+                if display:
+                    # We actually put the body in a tmpfile before giving it to less
+                    if self.idx_filename:
+                        os.unlink(self.idx_filename)
+                    tmpf = tempfile.NamedTemporaryFile("w", encoding="UTF-8", delete=False)
+                    tmpf.write(rendered_body)
+                    tmpf.close()
+                    self.idx_filename = tmpf.name
+                    cmd_str = self._get_handler_cmd("text/gemini")
+                    subprocess.run(shlex.split(cmd_str % tmpf.name))
+            elif display :
                cmd_str = self._get_handler_cmd(gi.get_mime())
                try:
                    # get tmpfile from gi !
@ -1082,210 +1335,7 @@ you'll be able to transparently follow links to Gopherspace!""")
        self._debug("Using handler: %s" % cmd_str)
        return cmd_str

-
-    # Red title above rendered content
-    def _make_terminal_title(self,gi):
-        title = gi.get_title()
-        if gi.is_cache_valid() and self.offline_only and not gi.local:
-            last_modification = gi.cache_last_modified()
-            str_last = time.ctime(last_modification)
-            title += "    \x1b[0;31m(last accessed on %s)"%str_last
-        rendered_title = "\x1b[31m\x1b[1m"+ title + "\x1b[0m"
-        wrapped = textwrap.fill(rendered_title,self.options["width"])
-        return wrapped + "\n"
-    # Our own HTML engine (crazy, isn’t it?)
-    def _handle_html(self,gi,display=True):
-        if not _DO_HTML:
-            print("HTML document detected. Please install python-bs4 and python readability.")
-            return
-        # This method recursively parse the HTML
-        def recursive_render(element,indent=""):
-            rendered_body = ""
-            #print("rendering %s - %s with indent %s" %(element.name,element.string,indent))
-            if element.name == "blockquote":
-                for child in element.children:
-                    rendered_body +=  recursive_render(child,indent="\t").rstrip("\t")
-            elif element.name == "div":
-                rendered_body += "\n"
-                for child in element.children:
-                    rendered_body += recursive_render(child,indent=indent)
-            elif element.name in ["h1","h2","h3","h4","h5","h6"]:
-                line = element.get_text()
-                if element.name in ["h1","h2"]:
-                    rendered_body += "\n"+"\x1b[1;34m\x1b[4m" + line + "\x1b[0m"+"\n"
-                elif element.name in ["h3","h4"]:
-                    rendered_body += "\n" + "\x1b[34m" + line + "\x1b[0m" + "\n"
-                else:
-                    rendered_body += "\n" + "\x1b[34m\x1b[2m" + line + "\x1b[0m" + "\n"
-            elif element.name == "pre":
-                rendered_body += "\n"
-                for child in element.children:
-                   rendered_body += recursive_render(child,indent=indent)
-                rendered_body += "\n\n"
-            elif element.name == "li":
-                line = ""
-                for child in element.children:
-                    line += recursive_render(child,indent=indent).strip("\n")
-                    #print("in li: ***%s***"%line)
-                rendered_body += " * " + line.strip() + "\n"
-            elif element.name in ["code","em","b","i"]:
-                # we don’t do anything with those markup right now. Maybe later?
-                for child in element.children:
-                    rendered_body += recursive_render(child,indent=indent).strip("\n")
-            elif element.name == "p":
-                temp_str = ""
-                for child in element.children:
-                    temp_str += recursive_render(child,indent=indent)
-                rendered_body = temp_str + "\n\n"
-            elif element.name == "a":
-                text = element.get_text().strip()
-                link = element.get('href')
-                if link:
-                    line = "=> " + link + " " +text
-                    link_id = " [%s] "%(len(self.index)+1)
-                    temp_gi = GeminiItem.from_map_line(line, gi)
-                    if temp_gi:
-                        self.index.append(temp_gi)
-                        rendered_body = "\x1b[34m\x1b[2m " + text + link_id + "\x1b[0m"
-                    else:
-                        #No real link found
-                        rendered_body = text
-            elif element.name == "br":
-                rendered_body = "\n"
-            elif element.string:
-                #print("tag without children:",element.name)
-                #print("string : **%s** "%element.string.strip())
-                #print("########")
-                rendered_body = element.string.strip("\n").strip("\t")
-            else:
-                #print("tag children:",element.name)
-                for child in element.children:
-                    rendered_body += recursive_render(child,indent=indent)
-            #print("body for element %s: %s"%(element.name,rendered_body))
-            return indent + rendered_body
-
-        # the real _handle_html method
-        self.index = []
-        if self.idx_filename:
-            os.unlink(self.idx_filename)
-        tmpf = tempfile.NamedTemporaryFile("w", encoding="UTF-8", delete=False)
-        self.idx_filename = tmpf.name
-        tmpf.write(self._make_terminal_title(gi))
-        readable = Document(gi.get_body())
-        title = readable.short_title()
-        summary = readable.summary()
-        tmpf.write("\x1b[1;34m\x1b[4m" + title + "\x1b[0m""\n")
-        #summary = gi.get_body()
-        soup = BeautifulSoup(summary, 'html.parser')
-        rendered_body = ""
-        if soup and soup.body :
-            for el in soup.body.contents:
-                rendered_body += recursive_render(el)
-            paragraphs = rendered_body.split("\n\n")
-            for par in paragraphs:
-                lines = par.splitlines()
-                for line in lines:
-                    if line.startswith("\t"):
-                        i_indent = "   "
-                        s_indent = i_indent
-                        line = line.strip("\t")
-                    elif line.startswith(" * "):
-                        i_indent = ""  # we keep the initial bullet)
-                        s_indent = "   "
-                    else:
-                        i_indent = ""
-                        s_indent = i_indent
-                    if line.strip() != "":
-                        wrapped = textwrap.fill(line,self.options["width"],\
-                                    initial_indent=i_indent,subsequent_indent=s_indent)
-                        wrapped += "\n"
-                    else:
-                        wrapped = ""
-                    tmpf.write(wrapped)
-                tmpf.write("\n")
-        tmpf.close()
-        self.lookup = self.index
-        self.page_index = 0
-        self.index_index = -1
-        if display:
-            cmd_str = self._get_handler_cmd("text/gemini")
-            subprocess.call(shlex.split(cmd_str % self.idx_filename))
-
-    # Gemtext Rendering Engine
-    # this method renders the original Gemtext then call the handler to display it.
-    def _handle_gemtext(self, menu_gi, display=True):
-        self.index = []
-        preformatted = False
-        if self.idx_filename:
-            os.unlink(self.idx_filename)
-        # this tempfile will contains a parsed version of the gemtext
-        # to display it. This is the output, not native gemtext.
-        tmpf = tempfile.NamedTemporaryFile("w", encoding="UTF-8", delete=False)
-        self.idx_filename = tmpf.name
-        tmpf.write(self._make_terminal_title(menu_gi))
-        #This local method takes a line and apply the ansi code given as "color"
-        #The whole line is then wrapped and ansi code are ended.
-        def wrap_line(line,color=None,i_indent="",s_indent=""):
-            wrapped = textwrap.wrap(line,self.options["width"],\
-                                    initial_indent=i_indent,subsequent_indent=s_indent)
-            final = ""
-            for l in wrapped:
-                if color:
-                    l = color + l + "\x1b[0m"
-                if l.strip() != "":
-                    final += l + "\n"
-            return final
-                    
-        for line in menu_gi.get_body().splitlines():
-            if line.startswith("```"):
-                preformatted = not preformatted
-            elif preformatted:
-                tmpf.write(line + "\n")
-            elif line.startswith("=>"):
-                try:
-                    gi = GeminiItem.from_map_line(line, menu_gi)
-                    if gi:
-                        self.index.append(gi)
-                        #tmpf.write(self._format_geminiitem(len(self.index), gi) + "\n")
-                        #tentative to wrapp long links. Not sure it worth the trouble
-                        link = self._format_geminiitem(len(self.index), gi)
-                        startpos = link.find("] ") + 2
-                        wrapped = wrap_line(link,s_indent=startpos*" ")
-                        tmpf.write(wrapped)
-                    else:
-                        self._debug("Skipping possible link: %s" % line)
-                except:
-                    self._debug("Skipping possible link: %s" % line)
-            elif line.startswith("* "):
-                line = line[1:].lstrip("\t ")
-                tmpf.write(textwrap.fill(line, self.options["width"],
-                    initial_indent = "• ", subsequent_indent="  ") + "\n")
-            elif line.startswith(">"):
-                line = line[1:].lstrip("\t ")
-                tmpf.write(textwrap.fill(line, self.options["width"],
-                    initial_indent = "> ", subsequent_indent="> ") + "\n")
-            elif line.startswith("###"):
-                line = line[3:].lstrip("\t ")
-                #tmpf.write("\x1b[4m" + line + "\x1b[0m""\n")
-                tmpf.write(wrap_line(line, color="\x1b[34m\x1b[2m"))
-            elif line.startswith("##"):
-                line = line[2:].lstrip("\t ")
-                tmpf.write(wrap_line(line, color="\x1b[34m"))
-            elif line.startswith("#"):
-                line = line[1:].lstrip("\t ")
-                tmpf.write(wrap_line(line,color="\x1b[1;34m\x1b[4m"))
-            else:
-                tmpf.write(wrap_line(line).rstrip() + "\n")
-        tmpf.close()
-
-        self.lookup = self.index
-        self.page_index = 0
-        self.index_index = -1
-
-        if display:
-            cmd_str = self._get_handler_cmd("text/gemini")
-            subprocess.call(shlex.split(cmd_str % self.idx_filename))
-
+    #TODO: remove format_geminiitem
    def _format_geminiitem(self, index, gi, url=False):
        protocol = "" if gi.scheme == "gemini" else " %s" % gi.scheme
        line = "[%d%s] %s" % (index, protocol, gi.name or gi.url)
@ -1958,18 +2008,14 @@ Bookmarks are stored using the 'add' command."""
        if len(args.split()) > 1 or (args and not args.isnumeric()):
            print("bookmarks command takes a single integer argument!")
            return
-        with open(bm_file, "r") as fp:
-        #else:
-            gi = GeminiItem("localhost:/" + bm_file,"Bookmarks")
-            gi.body = fp.read()
-            # We don’t display bookmarks if accessing directly one
-            # or if in sync_only
-            display = not ( args or self.sync_only) 
-            self._handle_gemtext(gi, display = display)
-            #self._go_to_gi(gi,handle=display)
-            if args:
-                # Use argument as a numeric index
-                self.default(line)
+        gi = GeminiItem("localhost:/" + bm_file,"Bookmarks")
+        # We don’t display bookmarks if accessing directly one
+        # or if in sync_only
+        display = not ( args or self.sync_only) 
+        self._go_to_gi(gi,handle=display)
+        if args:
+            # Use argument as a numeric index
+            self.default(line)
    
    def list_add_line(self,line,list):
        list_path = os.path.join(self.config_dir, "lists/%s.gmi"%list)