NewHTML: we have now switched to the new HTML rendering engine

This commit is contained in:
Lionel Dricot 2022-03-21 11:57:35 +01:00
parent 0499527da5
commit d2a0b38e5e
1 changed files with 21 additions and 88 deletions

View File

@ -1020,7 +1020,6 @@ class HtmlRenderer(AbstractRenderer):
print("HTML document detected. Please install python-bs4 and python-readability.") print("HTML document detected. Please install python-bs4 and python-readability.")
return return
# This method recursively parses the HTML # This method recursively parses the HTML
r_body = ""
r = self.representation(width,title=self.get_title()) r = self.representation(width,title=self.get_title())
links = [] links = []
# You know how bad HTML is when you realize that space is sometimes meaningful, sometimes not. # You know how bad HTML is when you realize that space is sometimes meaningful, sometimes not.
@ -1059,92 +1058,64 @@ class HtmlRenderer(AbstractRenderer):
toreturn = " " + toreturn toreturn = " " + toreturn
return toreturn return toreturn
def recursive_render(element,indent="",preformatted=False): def recursive_render(element,indent="",preformatted=False):
rendered_body = ""
if element.name == "blockquote": if element.name == "blockquote":
r.newparagraph() r.newparagraph()
r.startindent(" ",reverse=" ") r.startindent(" ",reverse=" ")
for child in element.children: for child in element.children:
rendered_body += "\x1b[3m"
r.open_color("italic") r.open_color("italic")
rendered_body += recursive_render(child,indent="\t").rstrip("\t") recursive_render(child,indent="\t")
rendered_body += "\x1b[23m"
r.close_color("italic") r.close_color("italic")
r.endindent() r.endindent()
elif element.name in ["div","p"]: elif element.name in ["div","p"]:
rendered_body += "\n"
r.newparagraph() r.newparagraph()
div = ""
for child in element.children: for child in element.children:
div += recursive_render(child,indent=indent) recursive_render(child,indent=indent)
rendered_body += div
rendered_body += "\n\n"
elif element.name in ["h1","h2","h3","h4","h5","h6"]: elif element.name in ["h1","h2","h3","h4","h5","h6"]:
if element.name in ["h1","h2"]: r.open_color("blue")
title_tag = "\x1b[1;34m\x1b[4m" if element.name in ["h1"]:
r.open_color("bold") r.open_color("bold")
r.open_color("blue")
r.open_color("underline") r.open_color("underline")
elif element.name in ["h3","h4"]: elif element.name in ["h2"]:
title_tag = "\x1b[34m" r.open_color("bold")
r.open_color("blue") elif element.name in ["h5","h6"]:
else:
title_tag = "\x1b[34m\x1b[2m"
r.open_color("blue")
r.open_color("faint") r.open_color("faint")
for child in element.children: for child in element.children:
r.newparagraph() r.newparagraph()
rendered_body += "\n" + title_tag + recursive_render(child) + "\x1b[0m" + "\n" recursive_render(child)
r.close_all() r.close_all()
elif element.name in ["code","tt"]: elif element.name in ["code","tt"]:
rendered_body += "\n"
for child in element.children: for child in element.children:
rendered_body += recursive_render(child,indent=indent,preformatted=True) recursive_render(child,indent=indent,preformatted=True)
rendered_body += "\n\n"
elif element.name in ["pre"]: elif element.name in ["pre"]:
rendered_body += "\n"
rendered_body += element.text
r.add_block(element.text) r.add_block(element.text)
rendered_body += "\n\n"
elif element.name in ["li"]: elif element.name in ["li"]:
line = ""
r.startindent("",sub=" ") r.startindent("",sub=" ")
for child in element.children: for child in element.children:
line += recursive_render(child,indent=indent).strip("\n") recursive_render(child,indent=indent)
rendered_body += " * " + line.strip() + "\n"
r.endindent() r.endindent()
elif element.name in ["tr"]: elif element.name in ["tr"]:
line = ""
r.startindent("|",reverse="|") r.startindent("|",reverse="|")
for child in element.children: for child in element.children:
line += recursive_render(child,indent=indent).strip("\n") recursive_render(child,indent=indent)
rendered_body += " " + line.strip() + "\n"
r.endindent() r.endindent()
elif element.name in ["td","th"]: elif element.name in ["td","th"]:
line = "| "
r.add_text("| ") r.add_text("| ")
for child in element.children: for child in element.children:
line += recursive_render(child) recursive_render(child)
line += " |"
r.add_text(" |") r.add_text(" |")
rendered_body += line
# italics # italics
elif element.name in ["em","i"]: elif element.name in ["em","i"]:
rendered_body += "\x1b[3m"
r.open_color("italic") r.open_color("italic")
for child in element.children: for child in element.children:
rendered_body += recursive_render(child,indent=indent,preformatted=preformatted) recursive_render(child,indent=indent,preformatted=preformatted)
rendered_body += "\x1b[23m"
r.close_color("italic") r.close_color("italic")
#bold #bold
elif element.name in ["b","strong"]: elif element.name in ["b","strong"]:
rendered_body += "\x1b[1m"
r.open_color("bold") r.open_color("bold")
for child in element.children: for child in element.children:
rendered_body += recursive_render(child,indent=indent,preformatted=preformatted) recursive_render(child,indent=indent,preformatted=preformatted)
rendered_body += "\x1b[22m"
r.close_color("bold") r.close_color("bold")
elif element.name == "a": elif element.name == "a":
text = ""
link = element.get('href') link = element.get('href')
# support for images nested in links # support for images nested in links
if link: if link:
@ -1153,8 +1124,7 @@ class HtmlRenderer(AbstractRenderer):
#we display images first in a link #we display images first in a link
for child in element.children: for child in element.children:
if child.name == "img": if child.name == "img":
# recursive rendering seems to display some images twice recursive_render(child)
rendered_body += recursive_render(child)
imgtext = "[IMG LINK %s]" imgtext = "[IMG LINK %s]"
links.append(link+" "+text) links.append(link+" "+text)
link_id = str(len(links)) link_id = str(len(links))
@ -1162,21 +1132,18 @@ class HtmlRenderer(AbstractRenderer):
r.open_color("faint") r.open_color("faint")
for child in element.children: for child in element.children:
if child.name != "img": if child.name != "img":
text += recursive_render(child,preformatted=preformatted) recursive_render(child,preformatted=preformatted)
if text == "" and imgtext != "": if imgtext != "":
text = imgtext%link_id
r.center_line() r.center_line()
r.add_text(imgtext%link_id) r.add_text(imgtext%link_id)
else: else:
r.add_text(" [%s]"%link_id) r.add_text(" [%s]"%link_id)
text += " [%s]"%link_id
rendered_body += "\x1b[2;34m" + text + "\x1b[0m"
r.close_color("blue") r.close_color("blue")
r.close_color("faint") r.close_color("faint")
else: else:
#No real link found #No real link found
for child in element.children: for child in element.children:
rendered_body += recursive_render(child,preformatted=preformatted) recursive_render(child,preformatted=preformatted)
elif element.name == "img": elif element.name == "img":
src = element.get("src") src = element.get("src")
text = "" text = ""
@ -1190,36 +1157,29 @@ class HtmlRenderer(AbstractRenderer):
if src: if src:
links.append(src+" "+text) links.append(src+" "+text)
link_id = " [%s]"%(len(links)) link_id = " [%s]"%(len(links))
alttext = text + link_id
alttext2 = alttext.center(term_width())
r.add_block(ansi_img) r.add_block(ansi_img)
r.open_color("faint") r.open_color("faint")
r.open_color("yellow") r.open_color("yellow")
r.center_line() r.center_line()
rendered_body = ansi_img + "\x1b[2;33m" + alttext2 + "\x1b[0m\n\n" r.add_text(text + link_id)
r.add_text(alttext)
r.close_color("faint") r.close_color("faint")
r.close_color("yellow") r.close_color("yellow")
r.newline() r.newline()
elif element.name == "br": elif element.name == "br":
rendered_body = "\n"
r.newline() r.newline()
elif element.name not in ["script","style","template"] and type(element) != Comment: elif element.name not in ["script","style","template"] and type(element) != Comment:
if element.string: if element.string:
if preformatted : if preformatted :
rendered_body = element.string
r.open_color("faint") r.open_color("faint")
r.add_text(element.string) r.add_text(element.string)
r.close_color("faint") r.close_color("faint")
else: else:
s = sanitize_string(element.string) s = sanitize_string(element.string)
rendered_body = s
if len(s.strip()) > 0: if len(s.strip()) > 0:
r.add_text(s) r.add_text(s)
else: else:
for child in element.children: for child in element.children:
rendered_body += recursive_render(child,indent=indent) recursive_render(child,indent=indent)
return indent + rendered_body
# the real render_html heart # the real render_html heart
if mode == "full": if mode == "full":
summary = body summary = body
@ -1228,40 +1188,13 @@ class HtmlRenderer(AbstractRenderer):
summary = readable.summary() summary = readable.summary()
soup = BeautifulSoup(summary, 'html.parser') soup = BeautifulSoup(summary, 'html.parser')
#soup = BeautifulSoup(summary, 'html5lib') #soup = BeautifulSoup(summary, 'html5lib')
rendered_body = ""
if soup : if soup :
if soup.body : if soup.body :
contents = soup.body.contents contents = soup.body.contents
else: else:
contents = soup.contents contents = soup.contents
for el in contents: for el in contents:
rendered_body += recursive_render(el) recursive_render(el)
paragraphs = rendered_body.split("\n\n")
for par in paragraphs:
lines = par.splitlines()
for line in lines:
if line.startswith("\t"):
i_indent = " "
s_indent = i_indent
line = line.strip("\t")
elif line.lstrip().startswith("* "):
line = line.lstrip()
i_indent = " " # we keep the initial bullet)
s_indent = " "
else:
i_indent = ""
s_indent = i_indent
if line.strip() != "":
try:
wrapped = wrapparagraph(line,width,initial_indent=i_indent,
subsequent_indent=s_indent,center=self.center)
except Exception as err:
wrapped = line
wrapped += "\n"
else:
wrapped = ""
r_body += wrapped
r_body += "\n"
return r.get_final(),links return r.get_final(),links
# Mapping mimetypes with renderers # Mapping mimetypes with renderers