offpunk/ansirenderer.py

1389 lines
53 KiB
Python
Raw Permalink Normal View History

2023-07-03 09:43:06 +00:00
#!/bin/python
import os
import sys
2023-07-03 21:48:55 +00:00
import shutil
import tempfile
import subprocess
import textwrap
import time
import html
2023-07-04 11:55:01 +00:00
import urllib
2023-07-18 10:33:30 +00:00
import argparse
import mimetypes
import fnmatch
2023-07-30 21:12:01 +00:00
import netcache
2023-07-04 11:55:01 +00:00
from offutils import run,term_width
2023-07-03 21:48:55 +00:00
try:
from readability import Document
_HAS_READABILITY = True
except ModuleNotFoundError:
_HAS_READABILITY = False
try:
from bs4 import BeautifulSoup
from bs4 import Comment
_HAS_SOUP = True
except ModuleNotFoundError:
_HAS_SOUP = False
_DO_HTML = _HAS_SOUP #and _HAS_READABILITY
if _DO_HTML and not _HAS_READABILITY:
print("To improve your web experience (less cruft in webpages),")
print("please install python3-readability or readability-lxml")
try:
import feedparser
_DO_FEED = True
except ModuleNotFoundError:
_DO_FEED = False
# Detect the installed version of less and build the base command lines.
less_version = 0
if not shutil.which("less"):
    print("Please install the pager \"less\" to run Offpunk.")
    print("If you wish to use another pager, send me an email!")
    print("(Im really curious to hear about people not having \"less\" on their system.)")
    sys.exit()
output = run("less --version")
# The version number is the only integer on the first line of `less --version`.
for word in output.split("\n")[0].split():
    if word.isdigit():
        less_version = int(word)
# Restoring the reading position only works for less >= 572.
_LESS_RESTORE_POSITION = less_version >= 572
#_DEFAULT_LESS = "less -EXFRfM -PMurl\ lines\ \%lt-\%lb/\%L\ \%Pb\%$ %s"
# -E : quit when reaching end of file (to behave like "cat")
# -F : quit if content fits the screen (behave like "cat")
# -X : does not clear the screen
# -R : interpret ANSI colors correctly
# -f : suppress warning for some contents
# -M : long prompt (to have info about where you are in the file)
# -W : hilite the new first line after a page skip (space)
# -i : ignore case in search
# -S : do not wrap long lines. Wrapping is done by offpunk, longlines
#      are there on purpose (such as in asciiart)
#--incsearch : incremental search starting rev581
if less_version >= 581:
    less_base = "less --incsearch --save-marks -~ -XRfMWiS"
elif less_version >= 572:
    less_base = "less --save-marks -XRfMWiS"
else:
    less_base = "less -XRfMWiS"
_DEFAULT_LESS = less_base + " \"+''\" %s"
_DEFAULT_CAT = less_base + " -EF %s"
# Page `file` through less (or dump/grep it).
# histfile : dedicated LESSHISTFILE so each page keeps its own search history
# cat      : if True, behave like `cat` (quit at EOF, don't page)
# grep     : if set, pipe the content through grep with this pattern instead
#            of paging (case-insensitive when the pattern is all-lowercase)
def less_cmd(file, histfile=None,cat=False,grep=None):
    if histfile:
        env = {"LESSHISTFILE": histfile}
    else:
        env = {}
    if cat:
        cmd_str = _DEFAULT_CAT
    elif grep:
        grep_cmd = _GREP
        #case insensitive for lowercase search
        if grep.islower():
            grep_cmd += " -i"
        # NOTE(review): `grep` is interpolated unquoted into a shell pipeline;
        # patterns containing spaces or shell metacharacters will break or be
        # interpreted by the shell — confirm callers sanitize the pattern.
        cmd_str = _DEFAULT_CAT + "|" + grep_cmd + " %s"%grep
    else:
        cmd_str = _DEFAULT_LESS
    run(cmd_str, parameter=file, direct_output=True, env=env)
# Detect which tools are available to render images inline.
try:
    from PIL import Image
    _HAS_PIL = True
except ModuleNotFoundError:
    _HAS_PIL = False
_HAS_TIMG = shutil.which('timg')
_HAS_CHAFA = shutil.which('chafa')
_NEW_CHAFA = False
_NEW_TIMG = False
_RENDER_IMAGE = False
# All this code to know if we render image inline or not
if _HAS_CHAFA:
    # starting with 1.10, chafa can return only one frame
    # which allows us to drop dependancy for PIL
    output = run("chafa --version")
    # output is "Chafa version M.m.p"
    try:
        version = output.split("\n")[0].split(" ")[-1]
        major, minor = (int(x) for x in version.split(".")[:2])
        # Compare as a tuple: the previous `major >= 1 and minor >= 10`
        # wrongly rejected releases like 2.0 whose minor is below 10.
        if (major, minor) >= (1, 10):
            _NEW_CHAFA = True
    except Exception:
        pass
if _NEW_CHAFA :
    _RENDER_IMAGE = True
if _HAS_TIMG :
    try:
        output = run("timg --version")
    except subprocess.CalledProcessError:
        output = False
    # We dont deal with timg before 1.3.2 (looping options).
    # Output looks like "timg <version> …"; compare numerically because the
    # old lexical comparison (output[5:10] > "1.3.2") broke on e.g. 1.10.0.
    if output:
        try:
            version = []
            for part in output.split()[1].split(".")[:3]:
                digits = ""
                for ch in part:
                    if ch.isdigit():
                        digits += ch
                    else:
                        break
                if not digits:
                    break
                version.append(int(digits))
            if tuple(version) > (1, 3, 2):
                _NEW_TIMG = True
                _RENDER_IMAGE = True
        except IndexError:
            pass
elif _HAS_CHAFA and _HAS_PIL:
    _RENDER_IMAGE = True
if not _RENDER_IMAGE:
    print("To render images inline, you need either chafa or timg.")
    if not _NEW_CHAFA and not _NEW_TIMG:
        print("Before Chafa 1.10, you also need python-pil")
# This method return the image URL or invent it if its a base64 inline image
# It returns [url,image_data] where image_data is None for normal image
def looks_like_base64(src,baseurl):
    """Resolve an image "src" attribute into (absolute_url, inline_data).

    Regular images return (absolutized URL, None). Inline base64 data URIs
    ("data:image/…;base64,…") get a cache name invented from the payload
    itself, and the raw base64 payload is returned as inline_data.
    Non-base64 data URIs (e.g. inline svg) return (None, None).
    """
    imgdata = None
    imgname = src
    if src and src.startswith("data:image/"):
        if ";base64," in src:
            splitted = src.split(";base64,")
            # Take the media subtype ("png" in "data:image/png").
            # The previous .strip("data:image/") removed *characters* from
            # both ends, not the prefix, mangling "png" into "pn".
            extension = splitted[0].split("/")[-1][:3]
            imgdata = splitted[1]
            imgname = imgdata[:20] + "." + extension
            imgurl = urllib.parse.urljoin(baseurl, imgname)
        else:
            #We cant handle other data:image such as svg for now
            imgurl = None
    else:
        imgurl = urllib.parse.urljoin(baseurl, imgname)
    return imgurl,imgdata
#return ANSItext that can be show by less
def inline_image(img_file,width):
    """Render img_file as ANSI art of `width` columns, returned as a string.

    Returns "" when the file is not an image or no renderer is available.
    """
    #Chafa is faster than timg inline. Let use that one by default
    inline = None
    ansi_img = ""
    #We avoid errors by not trying to render non-image files
    if shutil.which("file"):
        mime = run("file -b --mime-type %s", parameter=img_file).strip()
        if not "image" in mime:
            return ansi_img
    if _HAS_CHAFA:
        if _HAS_PIL and not _NEW_CHAFA:
            # this code is a hack to remove frames from animated gif
            # (old chafa would render every frame as separate text)
            img_obj = Image.open(img_file)
            if hasattr(img_obj,"n_frames") and img_obj.n_frames > 1:
                # we remove all frames but the first one
                # NOTE(review): this rewrites the cached file in place
                img_obj.save(img_file,format="gif",save_all=False)
            inline = "chafa --bg white -s %s -f symbols"
        elif _NEW_CHAFA:
            inline = "chafa --bg white -t 1 -s %s -f symbols --animate=off"
    if not inline and _NEW_TIMG:
        inline = "timg --frames=1 -p q -g %sx1000"
    if inline:
        # first %s is the width; the trailing %s is filled with the file path by run()
        cmd = inline%width + " %s"
        try:
            ansi_img = run(cmd, parameter=img_file)
        except Exception as err:
            ansi_img = "***image failed : %s***\n" %err
    return ansi_img
def terminal_image(img_file):
    """Display an image directly on the terminal with the best tool available."""
    # timg output is nicer than old chafa's (it is also centered), prefer it.
    if _NEW_TIMG:
        viewer = "timg --loops=1 -C"
    elif _HAS_CHAFA:
        viewer = "chafa -d 0 --bg white -t 1 -w 1"
    else:
        viewer = None
    if viewer:
        run(viewer + " %s", parameter=img_file, direct_output=True)
# First, we define the different content->text renderers, outside of the rest
# (They could later be factorized in other files or replaced)
class AbstractRenderer():
    """Base class of every content renderer (gemtext, gopher, feed, HTML…).

    A renderer wraps the raw body of one resource and lazily builds, for each
    rendering mode ("readable" by default, "full" or "links_only"), the
    rendered ANSI text, its list of links and its list of images.
    Subclasses must implement render(body, width=..., mode=...) returning a
    (rendered_text, links) tuple, plus get_mime() and get_title().
    """
    def __init__(self,content,url,center=True):
        self.url = url
        self.body = str(content)
        #theres one rendered text and one links table per mode
        self.rendered_text = {}
        self.links = {}
        self.images = {}
        self.title = None
        self.validity = True
        self.temp_files = {}
        self.less_histfile = {}
        self.center = center
        self.last_mode = "readable"

    #This class holds an internal representation of the rendered text
    class representation:
        """Accumulates text line by line, handling ANSI colors, wrapping,
        indentation and centering; get_final() returns the whole string."""
        def __init__(self,width,title=None,center=True):
            self.title=title
            self.center = center
            self.final_text = ""
            self.opened = []
            self.width = width
            self.last_line = ""
            self.last_line_colors = {}
            self.last_line_center = False
            self.new_paragraph = True
            self.i_indent = ""
            self.s_indent = ""
            self.r_indent = ""
            self.current_indent = ""
            self.disabled_indents = None
            # each color is an [open,close] pair code
            self.colors = {
                "bold" : ["1","22"],
                "faint" : ["2","22"],
                "italic" : ["3","23"],
                "underline": ["4","24"],
                "red" : ["31","39"],
                "yellow" : ["33","39"],
                "blue" : ["34","39"],
            }
        def _insert(self,color,open=True):
            # Remember at which position of last_line an ANSI code must be
            # inserted when the line is finalized by _endline().
            if open: o = 0
            else: o = 1
            pos = len(self.last_line)
            #we remember the position where to insert color codes
            if not pos in self.last_line_colors:
                self.last_line_colors[pos] = []
            #Two inverse code cancel each other
            if [color,int(not o)] in self.last_line_colors[pos]:
                self.last_line_colors[pos].remove([color,int(not o)])
            else:
                self.last_line_colors[pos].append([color,o])
        # Take self.last_line and add ANSI codes to it before adding it to
        # self.final_text.
        def _endline(self):
            if len(self.last_line.strip()) > 0:
                for c in self.opened:
                    self._insert(c,open=False)
                nextline = ""
                added_char = 0
                #we insert the color code at the saved positions
                while len(self.last_line_colors) > 0:
                    pos,colors = self.last_line_colors.popitem()
                    #popitem iterates LIFO.
                    #So we go, backward, to the pos (starting at the end of last_line)
                    nextline = self.last_line[pos:] + nextline
                    ansicol = "\x1b["
                    for c,o in colors:
                        ansicol += self.colors[c][o] + ";"
                    ansicol = ansicol[:-1]+"m"
                    nextline = ansicol + nextline
                    added_char += len(ansicol)
                    self.last_line = self.last_line[:pos]
                nextline = self.last_line + nextline
                if self.last_line_center:
                    #we have to care about the ansi char while centering
                    width = term_width() + added_char
                    nextline = nextline.strip().center(width)
                    self.last_line_center = False
                else:
                    #should we lstrip the nextline in the addition ?
                    nextline = self.current_indent + nextline.lstrip() + self.r_indent
                    self.current_indent = self.s_indent
                self.final_text += nextline
                self.last_line = ""
                self.final_text += "\n"
                for c in self.opened:
                    self._insert(c,open=True)
            else:
                self.last_line = ""
        def center_line(self):
            self.last_line_center = True
        def open_color(self,color):
            if color in self.colors and color not in self.opened:
                self._insert(color,open=True)
                self.opened.append(color)
        def close_color(self,color):
            if color in self.colors and color in self.opened:
                self._insert(color,open=False)
                self.opened.remove(color)
        def close_all(self):
            if len(self.colors) > 0:
                self.last_line += "\x1b[0m"
                self.opened.clear()
        def startindent(self,indent,sub=None,reverse=None):
            # indent : prefix of the first line; sub : prefix of the
            # following lines; reverse : suffix appended to every line.
            self._endline()
            self.i_indent = indent
            self.current_indent = indent
            if sub:
                self.s_indent = sub
            else:
                self.s_indent = indent
            if reverse:
                self.r_indent = reverse
            else:
                self.r_indent = ""
        def endindent(self):
            self._endline()
            self.i_indent = ""
            self.s_indent = ""
            self.r_indent = ""
            self.current_indent = ""
        def _disable_indents(self):
            # Save current indents so a block can be emitted without them.
            self.disabled_indents = []
            self.disabled_indents.append(self.current_indent)
            self.disabled_indents.append(self.i_indent)
            self.disabled_indents.append(self.s_indent)
            self.disabled_indents.append(self.r_indent)
            self.endindent()
        def _enable_indents(self):
            if self.disabled_indents:
                self.current_indent = self.disabled_indents[0]
                self.i_indent = self.disabled_indents[1]
                self.s_indent = self.disabled_indents[2]
                self.r_indent = self.disabled_indents[3]
            self.disabled_indents = None
        def newline(self):
            self._endline()
        #A new paragraph implies 2 newlines (1 blank line between paragraphs)
        #But it is only used if we didnt already start one to avoid plenty
        #of blank lines. force=True allows to bypass that limit.
        #new_paragraph becomes false as soon as text is entered into it
        def newparagraph(self,force=False):
            if force or not self.new_paragraph:
                self._endline()
                self.final_text += "\n"
                self.new_paragraph = True
        def add_space(self):
            if len(self.last_line) > 0 and self.last_line[-1] != " ":
                self.last_line += " "
        def _title_first(self,intext=None):
            # Emit the page title (once) before the first piece of content,
            # unless the content itself is the title.
            if self.title:
                if not self.title == intext:
                    self._disable_indents()
                    self.open_color("blue")
                    self.open_color("bold")
                    self.open_color("underline")
                    self.add_text(self.title)
                    self.close_all()
                    self.newparagraph()
                    self._enable_indents()
                self.title = None
        # Beware, blocks are not wrapped nor indented and left untouched!
        # They are mostly useful for pictures and preformatted text.
        def add_block(self,intext):
            # If necessary, we add the title before a block
            self._title_first()
            # we dont want to indent blocks
            self._endline()
            self._disable_indents()
            self.final_text += self.current_indent + intext
            self.new_paragraph = False
            self._endline()
            self._enable_indents()
        def add_text(self,intext):
            # Add wrapped, indented text to the current line.
            self._title_first(intext=intext)
            lines = []
            last = (self.last_line + intext)
            self.last_line = ""
            # With the following, we basically cancel adding only spaces
            # on an empty line
            if len(last.strip()) > 0:
                self.new_paragraph = False
            else:
                last = last.strip()
            if len(last) > self.width:
                width = self.width - len(self.current_indent) - len(self.r_indent)
                spaces_left = len(last) - len(last.lstrip())
                spaces_right = len(last) - len(last.rstrip())
                lines = textwrap.wrap(last,width,drop_whitespace=True)
                self.last_line += spaces_left*" "
                while len(lines) > 1:
                    l = lines.pop(0)
                    self.last_line += l
                    self._endline()
                if len(lines) == 1:
                    li = lines[0]
                    self.last_line += li + spaces_right*" "
            else:
                self.last_line = last
        def get_final(self):
            # Flush everything and return the rendered text, horizontally
            # centered in the terminal if requested.
            self.close_all()
            self._endline()
            #if no content, we still add the title
            self._title_first()
            lines = self.final_text.splitlines()
            lines2 = []
            termspace = shutil.get_terminal_size()[0]
            #Following code inserts blank spaces to center the content
            if self.center and termspace > term_width():
                margin = int((termspace - term_width())//2)
            else:
                margin = 0
            for l in lines :
                lines2.append(margin*" "+l)
            return "\n".join(lines2)

    def get_subscribe_links(self):
        # The page itself is always the first subscribable link.
        return [[self.url,self.get_mime(),self.get_title()]]
    def is_valid(self):
        return self.validity
    def is_local(self):
        #TODO with self.url
        return False
    def set_mode(self,mode):
        self.last_mode = mode
    # This method is used to load once the list of links in a page.
    # Links can be followed, after a space, by a description/title.
    def get_links(self,mode=None):
        if not mode: mode = self.last_mode
        if mode not in self.links :
            prepared_body = self.prepare(self.body,mode=mode)
            results = self.render(prepared_body,mode=mode)
            if results:
                #we should absolutize all URLs here
                self.links[mode] = []
                for l in results[1]:
                    abs_l = urllib.parse.urljoin(self.url,l.split()[0])
                    self.links[mode].append(abs_l)
                for l in self.get_subscribe_links()[1:]:
                    self.links[mode].append(l[0])
        # .get() avoids a KeyError when render() returned nothing
        return self.links.get(mode,[])
    def get_link(self,nb):
        # nb is the 1-based link index as displayed in the rendered page
        links = self.get_links()
        if nb < 1 or len(links) < nb:
            print("Index too high! No link %s for %s" %(nb,self.url))
            return None
        else:
            return links[nb-1]
    #get_title is about the "content title", so the title in the page itself
    def get_title(self):
        return "Abstract title"
    def get_page_title(self):
        # Content title followed by the URL-derived title, for window titles.
        title = self.get_title()
        if not title or len(title) == 0:
            title = self.get_url_title()
        else:
            title += " (%s)" %self.get_url_title()
        return title
    #this function is about creating a title derived from the URL
    def get_url_title(self):
        #small intelligence to try to find a good name for a capsule
        #we try to find either ~username or /users/username
        #else we fallback to hostname
        #TODO: handle local name
        #TODO: handle host and path separation
        red_title = "TODO:host" #self.host
        path = self.url
        if "user" in path:
            i = 0
            splitted = path.split("/")
            while i < (len(splitted)-1):
                if splitted[i].startswith("user"):
                    red_title = splitted[i+1]
                i += 1
        if "~" in path:
            for pp in path.split("/"):
                if pp.startswith("~"):
                    red_title = pp[1:]
        return red_title
    # This function returns a list of URLs which should be downloaded
    # before displaying the page (images in HTML pages, typically)
    def get_images(self,mode=None):
        if not mode: mode = self.last_mode
        if not mode in self.images:
            self.get_body(mode=mode)
            # we also invalidate the body that was done without images
            self.rendered_text.pop(mode)
        if mode in self.images:
            return self.images[mode]
        else:
            return []
    #This function will give gemtext to the gemtext renderer
    def prepare(self,body,mode=None):
        return body
    def get_body(self,width=None,mode=None):
        # Return the rendered ANSI text for the given mode, rendering and
        # caching it on first call.
        if not mode: mode = self.last_mode
        if not width:
            width = term_width()
        if mode not in self.rendered_text:
            prepared_body = self.prepare(self.body,mode=mode)
            result = self.render(prepared_body,width=width,mode=mode)
            if result:
                self.rendered_text[mode] = result[0]
                #The following is there to prepopulate self.links
                #but it seems to slow down a lot the loading
                #self.links[mode] = []
                #we should absolutize all URLs here
                #for l in result[1]:
                #    abs_l = urllib.parse.urljoin(self.url,l.split()[0])
                #    self.links[mode].append(abs_l)
        return self.rendered_text[mode]
    def _window_title(self,title,info=None):
        # Build the red/bold header line shown above the content.
        title_r = self.representation(term_width())
        title_r.open_color("red")
        title_r.open_color("bold")
        title_r.add_text(title)
        title_r.close_color("bold")
        if info:
            title_r.add_text(" (%s)"%info)
        title_r.close_color("red")
        return title_r.get_final()
    def display(self,mode=None,window_title="",window_info=None,grep=None):
        # Render the body, cache it in a temp file and page it with less.
        # The first display of a mode behaves like cat (no paging).
        if mode: self.last_mode = mode
        else: mode = self.last_mode
        wtitle = self._window_title(window_title,info=window_info)
        body = wtitle + "\n" + self.get_body(mode=mode)
        if not body:
            return False
        # We actually put the body in a tmpfile before giving it to less
        if mode not in self.temp_files:
            tmpf = tempfile.NamedTemporaryFile("w", encoding="UTF-8", delete=False)
            self.temp_files[mode] = tmpf.name
            tmpf.write(body)
            tmpf.close()
        if mode not in self.less_histfile:
            firsttime = True
            tmpf = tempfile.NamedTemporaryFile("w", encoding="UTF-8", delete=False)
            self.less_histfile[mode] = tmpf.name
        else:
            firsttime = False
        less_cmd(self.temp_files[mode], histfile=self.less_histfile[mode],cat=firsttime,grep=grep)
        return True
    def get_temp_file(self,mode=None):
        if not mode: mode = self.last_mode
        if mode in self.temp_files:
            return self.temp_files[mode]
        else:
            return None

# An instance of AbstractRenderer should have a self.render(body,width,mode) method.
# 3 modes are used :readable (by default), full and links_only (the fastest, when
# rendered content is not used, only the links are needed)
# The prepare() function is called before the rendering. It is useful if
# your renderer output in a format suitable for another existing renderer (such as gemtext)
# Gemtext Rendering Engine
class GemtextRenderer(AbstractRenderer):
    """Renders text/gemini (gemtext) content to ANSI text."""
    def get_mime(self):
        return "text/gemini"
    def get_title(self):
        """Return the first heading, else the first line truncated to 50
        chars, else "Empty Page"."""
        if self.title:
            return self.title
        elif self.body:
            lines = self.body.splitlines()
            for line in lines:
                if line.startswith("#"):
                    self.title = line.strip("#").strip()
                    return self.title
            if len(lines) > 0:
                # If no title found, we take the first 50 char
                # of the first line
                title_line = lines[0].strip()
                if len(title_line) > 50:
                    title_line = title_line[:49] + "…"
                self.title = title_line
                return self.title
            else:
                self.title = "Empty Page"
                return self.title
        else:
            return "Unknown Gopher Page"
    #render_gemtext
    def render(self,gemtext, width=None,mode=None):
        """Render gemtext line by line; return (ansi_text, links)."""
        if not width:
            width = term_width()
        r = self.representation(width)
        links = []
        hidden_links = []
        preformatted = False
        def format_link(url,index,name=None):
            """Build the displayed "[index protocol] name" string of a link."""
            if "://" in url:
                protocol,adress = url.split("://",maxsplit=1)
                protocol = " %s" %protocol
            else:
                adress = url
                protocol = ""
            if "gemini" in protocol or "list" in protocol:
                # gemini/list are the default schemes: don't display them
                protocol = ""
            if not name:
                name = adress
            line = "[%d%s] %s" % (index, protocol, name)
            return line
        for line in gemtext.splitlines():
            r.newline()
            if line.startswith("```"):
                # toggle preformatted mode
                preformatted = not preformatted
            elif preformatted:
                # infinite line to not wrap preformated
                r.add_block(line+"\n")
            elif len(line.strip()) == 0:
                r.newparagraph(force=True)
            elif line.startswith("=>"):
                strippedline = line[2:].strip()
                if strippedline:
                    links.append(strippedline)
                    splitted = strippedline.split(maxsplit=1)
                    url = splitted[0]
                    name = None
                    if len(splitted) > 1:
                        name = splitted[1]
                    link = format_link(url,len(links),name=name)
                    # continuation lines are aligned after the "[n] " prefix
                    startpos = link.find("] ") + 2
                    r.startindent("",sub=startpos*" ")
                    r.add_text(link)
                    r.endindent()
            elif line.startswith("* "):
                line = line[1:].lstrip("\t ")
                r.startindent("• ",sub="  ")
                r.add_text(line)
                r.endindent()
            elif line.startswith(">"):
                line = line[1:].lstrip("\t ")
                r.startindent("> ")
                r.add_text(line)
                r.endindent()
            elif line.startswith("###"):
                line = line[3:].lstrip("\t ")
                r.open_color("blue")
                r.add_text(line)
                r.close_color("blue")
            elif line.startswith("##"):
                line = line[2:].lstrip("\t ")
                r.open_color("blue")
                r.add_text(line)
                r.close_color("blue")
            elif line.startswith("#"):
                line = line[1:].lstrip("\t ")
                # the first level-1 heading becomes the page title
                if not self.title:
                    self.title = line
                r.open_color("bold")
                r.open_color("blue")
                r.open_color("underline")
                r.add_text(line)
                r.close_color("underline")
                r.close_color("bold")
                r.close_color("blue")
            else:
                # plain text: URLs found in it become "hidden" links,
                # appended after the explicit "=>" links
                if "://" in line:
                    words = line.split()
                    for w in words:
                        if "://" in w:
                            hidden_links.append(w)
                r.add_text(line.rstrip())
        links += hidden_links
        return r.get_final(), links
class GopherRenderer(AbstractRenderer):
    """Renders gopher menus and gopher text files to ANSI text."""
    def get_mime(self):
        return "text/gopher"
    def get_title(self):
        # The title is the text of the first line of the menu (stripping the
        # leading "i" informational item type if present).
        if not self.title:
            self.title = ""
            if self.body:
                firstline = self.body.splitlines()[0]
                firstline = firstline.split("\t")[0]
                if firstline.startswith("i"):
                    firstline = firstline[1:]
                self.title = firstline
        return self.title
    #menu_or_text
    def render(self,body,width=None,mode=None):
        """Try to render as a gopher menu; on failure, fall back to showing
        the raw body as one preformatted block. Returns (ansi_text, links)."""
        if not width:
            width = term_width()
        try:
            render,links = self._render_goph(body,width=width,mode=mode)
        except Exception as err:
            print("Error rendering Gopher ",err)
            r = self.representation(width)
            r.add_block(body)
            render = r.get_final()
            links = []
        return render,links
    def _render_goph(self,body,width=None,mode=None):
        # Parse the tab-separated gopher menu format:
        # <type+name>\t<path>\t<host>\t<port>
        if not width:
            width = term_width()
        # This was copied straight from Agena (then later adapted)
        links = []
        r = self.representation(width)
        # NOTE(review): iterates self.body, not the `body` parameter — confirm
        # callers never pass a transformed body.
        for line in self.body.split("\n"):
            r.newline()
            if line.startswith("i"):
                # "i" items are pure informational text
                towrap = line[1:].split("\t")[0]
                if len(towrap.strip()) > 0:
                    r.add_text(towrap)
                else:
                    r.newparagraph()
            elif not line.strip() in [".",""]:
                parts = line.split("\t")
                parts[-1] = parts[-1].strip()
                if parts[-1] == "+":
                    # trailing "+" marks gopher+ items; drop it
                    parts = parts[:-1]
                if len(parts) == 4:
                    name,path,host,port = parts
                    itemtype = name[0]
                    name = name[1:]
                    if port == "70":
                        # default gopher port is left implicit
                        port = ""
                    else:
                        port = ":%s"%port
                    if itemtype == "h" and path.startswith("URL:"):
                        # "h" items with a URL: path point outside gopherspace
                        url = path[4:]
                    else:
                        url = "gopher://%s%s/%s%s" %(host,port,itemtype,path)
                    url = url.replace(" ","%20")
                    linkline = url + " " + name
                    links.append(linkline)
                    towrap = "[%s] "%len(links)+ name
                    r.add_text(towrap)
                else:
                    r.add_text(line)
        return r.get_final(),links
class FolderRenderer(GemtextRenderer):
    """Renders the local "lists" directory as a generated gemtext page.

    It was initialized with:
    FolderRenderer("",self.get_cache_path(),datadir=_DATA_DIR)
    """
    def __init__(self,content,url,center=True,datadir=None):
        GemtextRenderer.__init__(self,content,url,center)
        self.datadir = datadir
    def get_mime(self):
        return "Directory"
    def prepare(self,body,mode=None):
        """Build a gemtext body listing every list in datadir/lists, grouped
        as bookmarks, subscriptions, frozen and system lists."""
        def get_first_line(l):
            # Return the "#…" first line of a list file, or None
            path = os.path.join(listdir,l+".gmi")
            with open(path) as f:
                first_line = f.readline().strip()
            if first_line.startswith("#"):
                return first_line
            else:
                return None
        def write_list(l):
            # One "=> list:///name" line per list
            body = ""
            for li in l:
                path = "list:///%s"%li
                #TODO : size of lists
                #gi = GeminiItem(path)
                #size = len(gi.get_links())
                size = "TODO"
                body += "=> %s %s (%s items)\n" %(str(path),li,size)
            return body
        listdir = os.path.join(self.datadir,"lists")
        if self.url != listdir:
            return "This is folder %s" %self.url
        else:
            self.title = "My lists"
            lists = []
            if os.path.exists(listdir):
                listfiles = os.listdir(listdir)
                for l in listfiles:
                    #removing the .gmi at the end of the name
                    lists.append(l[:-4])
            # always return a string: the previous code returned None when no
            # list existed, which crashed the gemtext renderer downstream
            body = ""
            if len(lists) > 0:
                my_lists = []
                system_lists = []
                subscriptions = []
                frozen = []
                lists.sort()
                for l in lists:
                    if l in ["history","to_fetch","archives","tour"]:
                        system_lists.append(l)
                    else:
                        first_line = get_first_line(l)
                        if first_line and "#subscribed" in first_line:
                            subscriptions.append(l)
                        elif first_line and "#frozen" in first_line:
                            frozen.append(l)
                        else:
                            my_lists.append(l)
                if len(my_lists) > 0:
                    body += "\n## Bookmarks Lists (updated during sync)\n"
                    body += write_list(my_lists)
                if len(subscriptions) > 0:
                    body += "\n## Subscriptions (new links in those are added to tour)\n"
                    body += write_list(subscriptions)
                if len(frozen) > 0:
                    body += "\n## Frozen (fetched but never updated)\n"
                    body += write_list(frozen)
                if len(system_lists) > 0:
                    body += "\n## System Lists\n"
                    body += write_list(system_lists)
            return body
class FeedRenderer(GemtextRenderer):
    """Renders RSS/Atom feeds (through feedparser) as gemtext."""
    def get_mime(self):
        return "application/rss+xml"
    def is_valid(self):
        """A feed is valid if feedparser is available, parses without error
        and has at least one entry (otherwise we fall back to HTML)."""
        if not _DO_FEED:
            return False
        parsed = feedparser.parse(self.body)
        if parsed.bozo:
            return False
        else:
            #If no content, then fallback to HTML
            return len(parsed.entries) > 0
    def get_title(self):
        if not self.title:
            # the title is set as a side effect of preparing the body
            self.get_body()
        return self.title
    def prepare(self,content,mode=None,width=None):
        """Translate the XML feed into a gemtext page: one "=>" line per
        entry; in "full" mode the entry summaries are rendered inline."""
        if not mode: mode = self.last_mode
        if not width:
            width = term_width()
        self.title = "RSS/Atom feed"
        page = ""
        if not _DO_FEED:
            page += "Please install python-feedparser to handle RSS/Atom feeds\n"
            self.validity = False
            return page
        parsed = feedparser.parse(content)
        if parsed.bozo:
            page += "Invalid RSS feed\n\n"
            page += str(parsed.bozo_exception)
            self.validity = False
        else:
            if "title" in parsed.feed:
                t = parsed.feed.title
            else:
                t = "Unknown"
            self.title = "%s (XML feed)" %t
            title = "# %s"%self.title
            page += title + "\n"
            if "updated" in parsed.feed:
                page += "Last updated on %s\n\n" %parsed.feed.updated
            if "subtitle" in parsed.feed:
                page += parsed.feed.subtitle + "\n"
            if "link" in parsed.feed:
                page += "=> %s\n" %parsed.feed.link
            page += "\n## Entries\n"
            if len(parsed.entries) < 1:
                self.validity = False
            for i in parsed.entries:
                line = "=> %s " %i.link
                if "published" in i:
                    pub_date = time.strftime("%Y-%m-%d",i.published_parsed)
                    line += pub_date + " : "
                if "title" in i:
                    line += "%s" %(i.title)
                if "author" in i:
                    line += " (by %s)"%i.author
                page += line + "\n"
                if mode == "full":
                    if "summary" in i:
                        # renamed from `html` to avoid shadowing the
                        # module-level `import html`
                        html_renderer = HtmlRenderer(i.summary,self.url,center=False)
                        rendered = html_renderer.get_body(width=None,mode="full")
                        page += "\n"
                        page += rendered
                        page += "\n------------\n\n"
        return page
class ImageRenderer(AbstractRenderer):
    """Renders an image file, inline as ANSI art or directly on the terminal."""
    def get_mime(self):
        return "image/*"
    def is_valid(self):
        # Only valid when an inline renderer (chafa/timg) was detected
        if _RENDER_IMAGE:
            return True
        else:
            return False
    def get_links(self,mode=None):
        # An image contains no links
        return []
    def get_title(self):
        return "Picture file"
    def render(self,img,width=None,mode=None):
        """Return (ansi_art, []) for the image, horizontally centered."""
        #with inline, we use symbols to be rendered with less.
        #else we use the best possible renderer.
        if mode == "links_only":
            return "", []
        if not width:
            width = term_width()
            spaces = 0
        else:
            spaces = int((term_width() - width)//2)
        ansi_img = inline_image(img,width)
        #Now centering the image
        lines = ansi_img.splitlines()
        new_img = ""
        for l in lines:
            new_img += spaces*" " + l + "\n"
        return new_img, []
    def display(self,mode=None,window_title=None,window_info=None,grep=None):
        # Images are displayed directly on the terminal, not through less
        if window_title:
            print(self._window_title(window_title,info=window_info))
        terminal_image(self.body)
        return True
class HtmlRenderer(AbstractRenderer):
    def get_mime(self):
        # Mime type handled by this renderer
        return "text/html"
    def is_valid(self):
        # HTML can only be rendered when BeautifulSoup is installed
        if not _DO_HTML:
            print("HTML document detected. Please install python-bs4 and python-readability.")
        return _DO_HTML and self.validity
def get_subscribe_links(self):
subs = [[self.url,self.get_mime(),self.get_title()]]
soup = BeautifulSoup(self.body, 'html.parser')
links = soup.find_all("link",rel="alternate",recursive=True)
for l in links:
ty = l.get("type")
if ty :
if "rss" in ty or "atom" in ty or "feed" in ty:
2023-08-03 14:54:29 +00:00
# some rss links are relatives: we absolutise_url
sublink = urllib.parse.urljoin(self.url, l.get("href"))
2023-08-03 21:17:12 +00:00
subs.append([sublink,ty,l.get("title")])
2023-07-03 09:43:06 +00:00
return subs
def get_title(self):
if self.title:
return self.title
elif self.body:
if _HAS_READABILITY:
try:
readable = Document(self.body)
self.title = readable.short_title()
return self.title
except Exception as err:
pass
soup = BeautifulSoup(self.body,"html.parser")
self.title = str(soup.title.string)
else:
return ""
# Our own HTML engine (crazy, isnt it?)
# Return [rendered_body, list_of_links]
# mode is either links_only, readable or full
def render(self,body,mode=None,width=None,add_title=True):
if not mode: mode = self.last_mode
2023-07-03 09:43:06 +00:00
if not width:
width = term_width()
if not _DO_HTML:
print("HTML document detected. Please install python-bs4 and python-readability.")
return
# This method recursively parse the HTML
r = self.representation(width,title=self.get_title(),center=self.center)
links = []
# You know how bad html is when you realize that space sometimes meaningful, somtimes not.
# CR are not meaniningful. Except that, somethimes, they should be interpreted as spaces.
# HTMLis real crap. At least the one people are generating.
def render_image(src,width=40,mode=None):
ansi_img = ""
imgurl,imgdata = looks_like_base64(src,self.url)
if _RENDER_IMAGE and mode != "links_only" and imgurl:
try:
#4 followings line are there to translate the URL into cache path
2023-07-30 21:12:01 +00:00
img = netcache.get_cache_path(imgurl)
2023-07-03 09:43:06 +00:00
if imgdata:
with open(img,"wb") as cached:
cached.write(base64.b64decode(imgdata))
cached.close()
2023-07-30 21:12:01 +00:00
if netcache.is_cache_valid(img):
2023-07-03 09:43:06 +00:00
renderer = ImageRenderer(img,imgurl)
# Image are 40px wide except if terminal is smaller
if width > 40:
size = 40
else:
size = width
ansi_img = "\n" + renderer.get_body(width=size,mode="inline")
except Exception as err:
#we sometimes encounter really bad formatted files or URL
ansi_img = textwrap.fill("[BAD IMG] %s - %s"%(err,src),width) + "\n"
return ansi_img
def sanitize_string(string):
#never start with a "\n"
#string = string.lstrip("\n")
string = string.replace("\r","").replace("\n", " ").replace("\t"," ")
endspace = string.endswith(" ") or string.endswith("\xa0")
startspace = string.startswith(" ") or string.startswith("\xa0")
toreturn = string.replace("\n", " ").replace("\t"," ").strip()
while " " in toreturn:
toreturn = toreturn.replace(" "," ")
toreturn = html.unescape(toreturn)
if endspace and not toreturn.endswith(" ") and not toreturn.endswith("\xa0"):
toreturn += " "
if startspace and not toreturn.startswith(" ") and not toreturn.startswith("\xa0"):
toreturn = " " + toreturn
return toreturn
def recursive_render(element,indent="",preformatted=False):
    # Recursively translate a BeautifulSoup node into the ANSI
    # representation "r" (closure variable), appending every discovered
    # hyperlink/image to "links" (closure). "preformatted" keeps the
    # node's text verbatim instead of sanitizing its whitespace.
    if element.name == "blockquote":
        r.newparagraph()
        r.startindent("   ",reverse="   ")
        for child in element.children:
            r.open_color("italic")
            recursive_render(child,indent="\t")
            r.close_color("italic")
        r.endindent()
    elif element.name in ["div","p"]:
        # Block elements: isolated in their own paragraph
        r.newparagraph()
        for child in element.children:
            recursive_render(child,indent=indent)
        r.newparagraph()
    elif element.name in ["span"]:
        # Inline element: padded with a space on each side
        r.add_space()
        for child in element.children:
            recursive_render(child,indent=indent)
        r.add_space()
    elif element.name in ["h1","h2","h3","h4","h5","h6"]:
        # Headers are blue, with decreasing emphasis from h1 to h6
        r.open_color("blue")
        if element.name in ["h1"]:
            r.open_color("bold")
            r.open_color("underline")
        elif element.name in ["h2"]:
            r.open_color("bold")
        elif element.name in ["h5","h6"]:
            r.open_color("faint")
        for child in element.children:
            r.newparagraph()
            recursive_render(child)
            r.newparagraph()
        r.close_all()
    elif element.name in ["code","tt"]:
        # Inline code: children rendered verbatim (preformatted=True)
        for child in element.children:
            recursive_render(child,indent=indent,preformatted=True)
    elif element.name in ["pre"]:
        # Preformatted block: raw text added as-is, no sanitization
        r.newparagraph()
        r.add_block(element.text)
        r.newparagraph()
    elif element.name in ["li"]:
        r.startindent("",sub="  ")
        for child in element.children:
            recursive_render(child,indent=indent)
        r.endindent()
    elif element.name in ["tr"]:
        # Table row: drawn between "|" borders
        r.startindent("|",reverse="|")
        for child in element.children:
            recursive_render(child,indent=indent)
        r.endindent()
    elif element.name in ["td","th"]:
        r.add_text("| ")
        for child in element.children:
            recursive_render(child)
        r.add_text(" |")
    # italics
    elif element.name in ["em","i"]:
        r.open_color("italic")
        for child in element.children:
            recursive_render(child,indent=indent,preformatted=preformatted)
        r.close_color("italic")
    #bold
    elif element.name in ["b","strong"]:
        r.open_color("bold")
        for child in element.children:
            recursive_render(child,indent=indent,preformatted=preformatted)
        r.close_color("bold")
    elif element.name == "a":
        link = element.get('href')
        # support for images nested in links
        if link:
            text = ""
            imgtext = ""
            #we display images first in a link
            for child in element.children:
                if child.name == "img":
                    recursive_render(child)
                    imgtext = "[IMG LINK %s]"
            # NOTE(review): "text" is never filled before this append, so
            # each link entry is just the URL plus a trailing space —
            # confirm whether the child text was meant to be collected.
            links.append(link+" "+text)
            link_id = str(len(links))
            r.open_color("blue")
            r.open_color("faint")
            for child in element.children:
                if child.name != "img":
                    recursive_render(child,preformatted=preformatted)
            if imgtext != "":
                r.center_line()
                r.add_text(imgtext%link_id)
            else:
                r.add_text(" [%s]"%link_id)
            r.close_color("blue")
            r.close_color("faint")
        else:
            #No real link found
            for child in element.children:
                recursive_render(child,preformatted=preformatted)
    elif element.name == "img":
        src = element.get("src")
        text = ""
        ansi_img = render_image(src,width=width,mode=mode)
        alt = element.get("alt")
        if alt:
            alt = sanitize_string(alt)
            text += "[IMG] %s"%alt
        else:
            text += "[IMG]"
        if src:
            # Images become numbered links AND are recorded per-mode as
            # absolute URLs in self.images (used elsewhere for fetching)
            links.append(src+" "+text)
            if not mode in self.images:
                self.images[mode] = []
            abs_url = urllib.parse.urljoin(self.url, src)
            self.images[mode].append(abs_url)
            link_id = " [%s]"%(len(links))
            r.add_block(ansi_img)
            r.open_color("faint")
            r.open_color("yellow")
            r.center_line()
            r.add_text(text + link_id)
            r.close_color("faint")
            r.close_color("yellow")
            r.newline()
    elif element.name == "br":
        r.newline()
    elif element.name not in ["script","style","template"] and type(element) != Comment:
        # Any other tag (or a plain text node): render its string if it
        # has one, otherwise recurse into its children
        if element.string:
            if preformatted :
                r.open_color("faint")
                r.add_text(element.string)
                r.close_color("faint")
            else:
                s = sanitize_string(element.string)
                if len(s.strip()) > 0:
                    r.add_text(s)
        else:
            for child in element.children:
                recursive_render(child,indent=indent)
# the real render_html hearth
if mode == "full":
summary = body
elif _HAS_READABILITY:
try:
readable = Document(body)
summary = readable.summary()
except Exception as err:
summary = body
else:
summary = body
soup = BeautifulSoup(summary, 'html.parser')
#soup = BeautifulSoup(summary, 'html5lib')
if soup :
if soup.body :
recursive_render(soup.body)
else:
recursive_render(soup)
return r.get_final(),links
# Mapping mimetypes with renderers
# (any content with a mimetype text/* not listed here will be rendered with as GemText)
# Keys may contain fnmatch wildcards (e.g. "image/*"); set_renderer picks
# the first key that matches the content's mimetype.
_FORMAT_RENDERERS = {
    "text/gemini": GemtextRenderer,
    "text/html" : HtmlRenderer,
    "text/xml" : FeedRenderer,
    "application/xml" : FeedRenderer,
    "application/rss+xml" : FeedRenderer,
    "application/atom+xml" : FeedRenderer,
    "text/gopher": GopherRenderer,
    "image/*": ImageRenderer
}
def get_mime(path):
    """Guess the mimetype of path.

    Beware, this one is really a shaddy ad-hoc function mixing the
    "file" utility, the mimetypes module and extension checks.
    Always returns a string (falls back to "text/gemini").
    """
    # mime2 is the secondary guess from mimetypes; it must exist on every
    # path because it is read again at the bottom (was a NameError before).
    mime2 = None
    if path.startswith("mailto:"):
        mime = "mailto"
    elif os.path.isdir(path):
        mime = "Local Folder"
    elif path.endswith(".gmi"):
        # gmi mimetype is not recognized by the mimetypes database yet
        mime = "text/gemini"
    elif shutil.which("file") :
        mime = run("file -b --mime-type %s", parameter=path).strip()
        mime2,encoding = mimetypes.guess_type(path,strict=False)
        #If we hesitate between html and xml, takes the xml one
        #because the FeedRendered fallback to HtmlRenderer
        if mime2 and mime != mime2 and "html" in mime and "xml" in mime2:
            mime = "text/xml"
        # If its a xml file, consider it as such, regardless of what file thinks
        elif path.endswith(".xml"):
            mime = "text/xml"
        #Some xml/html document are considered as octet-stream
        if mime == "application/octet-stream":
            mime = "text/xml"
    else:
        mime,encoding = mimetypes.guess_type(path,strict=False)
    if not mime:
        if not shutil.which("file"):
            print("Cannot guess the mime type of the file. Please install \"file\".")
            print("(and send me an email, Im curious of systems without \"file\" installed!")
        # Fall back to gemtext instead of crashing on mime.startswith below
        mime = "text/gemini"
    if mime.startswith("text") and mime not in _FORMAT_RENDERERS:
        if mime2 and mime2 in _FORMAT_RENDERERS:
            mime = mime2
        else:
            #by default, we consider its gemini except for html
            mime = "text/gemini"
    return mime
def renderer_from_file(path,url=None):
    """Build a renderer for the file at path.

    Returns None if the path does not exist. Text files are read and
    their content handed to the renderer; other files (images…) are
    passed to the renderer by path. url defaults to the path itself.
    """
    if not os.path.exists(path):
        return None
    mime = get_mime(path)
    if not url:
        url = path
    if mime.startswith("text/"):
        # Text content is read here so the renderer only deals with strings
        with open(path) as f:
            content = f.read()
    else:
        # Binary content: the renderer receives the path itself
        content = path
    return set_renderer(content,url,mime)
def set_renderer(content,url,mime):
    """Instantiate the renderer matching mime for content.

    Returns None when no renderer matches or when the content turns
    out to be invalid for a non-text renderer.
    """
    if mime == "Local Folder":
        return FolderRenderer("",url,datadir=_DATA_DIR)
    # Mimetype keys may contain wildcards; the first match wins.
    matching = [pattern for pattern in _FORMAT_RENDERERS
                if fnmatch.fnmatch(mime, pattern)]
    if not matching:
        return None
    chosen = matching[0]
    renderer_class = _FORMAT_RENDERERS[chosen]
    renderer = renderer_class(content,url)
    if chosen.startswith("text"):
        # Double-check the renderer fits; if not, fall back to HTML
        # (currently only for XHTML, often mislabelled as xml/RSS feeds).
        if not renderer.is_valid():
            renderer = _FORMAT_RENDERERS["text/html"](content,url)
    elif not renderer.is_valid():
        # Non-text content (we gave the renderer a file, not parsed text):
        # an invalid renderer means we cannot render at all.
        renderer = None
    return renderer
def render(input,path=None,format="auto",mime=None,url=None):
    """Render input (or the file at path) to the terminal.

    format explicitly selects a renderer; "auto" guesses from path or
    mime. Prints an error when nothing can render the content.
    (NOTE: "input" and "format" shadow builtins but are part of the
    public keyword interface, so they are kept.)
    """
    if format == "gemtext":
        r = GemtextRenderer(input,url)
    elif format == "html":
        r = HtmlRenderer(input,url)
    elif format == "feed":
        r = FeedRenderer(input,url)
    elif format == "gopher":
        r = GopherRenderer(input,url)
    elif format == "image":
        r = ImageRenderer(input,url)
    elif format == "folder":
        r = FolderRenderer(input,url)
    else:
        # Auto-detection: prefer the file path when no mime was given
        if not mime and path:
            r = renderer_from_file(path,url)
        else:
            r = set_renderer(input,url,mime)
    if r:
        r.display()
    else:
        print("Could not render %s"%input)
def main():
    """Command-line entry point: render files or piped stdin to ANSI."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--format", choices=["auto","gemtext","html","feed","gopher","image","folder"],
                        help="Renderer to use. Available: auto, gemtext, html, feed, gopher, image, folder")
    parser.add_argument("--mime", help="Mime of the content to parse")
    ## The argument needs to be a path to a file. If none, then stdin is used which allows
    ## to pipe text directly into ansirenderer
    parser.add_argument("--url",metavar="URL", nargs="*",
                        help="Original URL of the content")
    parser.add_argument("content",metavar="INPUT", nargs="*", type=argparse.FileType("r"),
                        default=sys.stdin, help="Path to the text to render (default to stdin)")
    args = parser.parse_args()
    # --url is declared with nargs="*" so argparse yields a list, but the
    # renderers expect a single URL string (or None).
    url = args.url[0] if args.url else None
    # Detect if we are running interactively or in a pipe
    if sys.stdin.isatty():
        # we are interactive, not in stdin, we can have multiple files as input
        for f in args.content:
            path = os.path.abspath(f.name)
            try:
                content = f.read()
            except UnicodeDecodeError:
                # Binary file: hand the open file object to the renderer
                content = f
            render(content,path=path,format=args.format,url=url,mime=args.mime)
    else:
        # we are in stdin: without a path, the type cannot be guessed
        if not args.format and not args.mime:
            print("Format or mime should be specified when running with stdin")
        elif args.content is sys.stdin:
            render(sys.stdin.read(),path=None,format=args.format,url=url,mime=args.mime)
        else:
            # File paths were given even though stdin is a pipe: render the
            # files (args.content is then a list, which has no .read()).
            for f in args.content:
                render(f.read(),path=os.path.abspath(f.name),format=args.format,url=url,mime=args.mime)
if __name__ == '__main__':
main()