Add --depth option for sync and support for img tags in HTML rendering

This commit is contained in:
Lionel Dricot 2022-02-04 01:14:22 +01:00
parent db991b2fd2
commit 5d19662e3e
2 changed files with 43 additions and 12 deletions

View File

@ -1,9 +1,10 @@
# Offpunk History # Offpunk History
## 0.2.1 or 0.3 - Unreleased ## 0.2.1 or 0.3 - Unreleased
New Features:
- "less full" lets you see the full HTML page instead of only the article view - "less full" lets you see the full HTML page instead of only the article view
Small improvements: - Option --depth to customize your sync. Be warned, more than 1 is crazy.
Other Small Improvements:
- Disabled https_everywhere by default (caching problems and some websites not supporting it) - Disabled https_everywhere by default (caching problems and some websites not supporting it)
- Modified --sync logic to make it more intuitive (thanks Bjorn Westergard) - Modified --sync logic to make it more intuitive (thanks Bjorn Westergard)
- Caching more problems to avoid refetch - Caching more problems to avoid refetch

View File

@ -335,6 +335,13 @@ class HtmlRenderer():
# This method recursively parse the HTML # This method recursively parse the HTML
r_body = "" r_body = ""
links = [] links = []
def sanitize_string(string):
    """Return *string* flattened to a single, trimmed line of text.

    Newlines, tabs and literal ``&nbsp`` / ``&nbsp;`` entities are turned
    into spaces, every run of spaces is collapsed to one space, and
    leading/trailing whitespace is removed.
    """
    # Replace the entity first (with and without its trailing ";") so the
    # spaces it introduces are collapsed and trimmed like any other.
    flattened = string.replace("&nbsp;", " ").replace("&nbsp", " ")
    flattened = flattened.replace("\n", " ").replace("\t", " ")
    # Splitting on a single space yields empty chunks for each extra space
    # in a run; dropping them collapses the run in one pass and always
    # terminates, unlike repeatedly replacing until no match remains.
    words = [chunk for chunk in flattened.split(" ") if chunk]
    # Final strip removes any other whitespace left at the edges.
    return " ".join(words).strip()
def recursive_render(element,indent=""): def recursive_render(element,indent=""):
rendered_body = "" rendered_body = ""
#print("rendering %s - %s with indent %s" %(element.name,element.string,indent)) #print("rendering %s - %s with indent %s" %(element.name,element.string,indent))
@ -374,7 +381,7 @@ class HtmlRenderer():
temp_str += recursive_render(child,indent=indent) temp_str += recursive_render(child,indent=indent)
rendered_body = temp_str + "\n\n" rendered_body = temp_str + "\n\n"
elif element.name == "a": elif element.name == "a":
text = element.get_text().strip() text = sanitize_string(element.get_text())
link = element.get('href') link = element.get('href')
if link: if link:
links.append(link+" "+text) links.append(link+" "+text)
@ -383,13 +390,25 @@ class HtmlRenderer():
else: else:
#No real link found #No real link found
rendered_body = text rendered_body = text
elif element.name == "img":
alt = element.get("alt")
if alt:
alt = sanitize_string(alt)
text = "[IMG] %s"%alt
else:
text = "[IMG]"
src = element.get("src")
if src:
links.append(src+" "+text)
link_id = " [%s]"%(len(links))
rendered_body = "\x1b[33m\x1b[2m " + text + link_id + "\x1b[0m\n"
elif element.name == "br": elif element.name == "br":
rendered_body = "\n" rendered_body = "\n"
elif element.string: elif element.string:
#print("tag without children:",element.name) #print("tag without children:",element.name)
#print("string : **%s** "%element.string.strip()) #print("string : **%s** "%element.string.strip())
#print("########") #print("########")
rendered_body = element.string.strip("\n").strip("\t") rendered_body = sanitize_string(element.string)
else: else:
#print("tag children:",element.name) #print("tag children:",element.name)
for child in element.children: for child in element.children:
@ -435,8 +454,10 @@ class HtmlRenderer():
first_line = "" first_line = ""
while first_line == "" and len(lines) > 0: while first_line == "" and len(lines) > 0:
first_line = lines.pop(0) first_line = lines.pop(0)
if self.get_title() not in first_line: if self.get_title()[:79] not in first_line:
r_body = "\x1b[1;34m\x1b[4m" + self.get_title() + "\x1b[0m""\n" + r_body title = "\x1b[1;34m\x1b[4m" + self.get_title() + "\x1b[0m""\n"
title = textwrap.fill(title,80)
r_body = title + "\n" + r_body
return r_body,links return r_body,links
# Mapping mimetypes with renderers # Mapping mimetypes with renderers
@ -2530,6 +2551,8 @@ def main():
help='assume-yes when asked questions about certificates/redirections during sync') help='assume-yes when asked questions about certificates/redirections during sync')
parser.add_argument('--fetch-later', action='store_true', parser.add_argument('--fetch-later', action='store_true',
help='run non-interactively with an URL as argument to fetch it later') help='run non-interactively with an URL as argument to fetch it later')
parser.add_argument('--depth',
help='depth of the cache to build. Default is 1. More is crazy. Use at your own risks!')
parser.add_argument('--cache-validity', parser.add_argument('--cache-validity',
help='duration for which a cache is valid before sync (seconds)') help='duration for which a cache is valid before sync (seconds)')
parser.add_argument('--version', action='store_true', parser.add_argument('--version', action='store_true',
@ -2635,9 +2658,12 @@ def main():
add_to_tour(gitem) add_to_tour(gitem)
#Now, recursive call, even if we didnt refresh the cache #Now, recursive call, even if we didnt refresh the cache
if depth > 0: if depth > 0:
d = depth - 1 #we only savetotour at the first level of recursion
if depth > 1:
savetotour=False
links = gitem.get_links() links = gitem.get_links()
subcount = [0,len(links)] subcount = [0,len(links)]
d = depth - 1
for k in links: for k in links:
#recursive call (validity is always 0 in recursion) #recursive call (validity is always 0 in recursion)
substri = strin + " -->" substri = strin + " -->"
@ -2645,14 +2671,14 @@ def main():
fetch_gitem(k,depth=d,validity=0,savetotour=savetotour,\ fetch_gitem(k,depth=d,validity=0,savetotour=savetotour,\
count=subcount,strin=substri) count=subcount,strin=substri)
def fetch_list(list,validity=0,tourandremove=False,tourchildren=False): def fetch_list(list,validity=0,depth=1,tourandremove=False,tourchildren=False):
links = gc.list_get_links(list) links = gc.list_get_links(list)
end = len(links) end = len(links)
counter = 0 counter = 0
print(" * * * %s to fetch in %s * * *" %(end,list)) print(" * * * %s to fetch in %s * * *" %(end,list))
for l in links: for l in links:
counter += 1 counter += 1
fetch_gitem(l,depth=1,validity=validity,savetotour=tourchildren,count=[counter,end]) fetch_gitem(l,depth=depth,validity=validity,savetotour=tourchildren,count=[counter,end])
if tourandremove: if tourandremove:
add_to_tour(l) add_to_tour(l)
gc.list_rm_url(l.url,list) gc.list_rm_url(l.url,list)
@ -2662,6 +2688,10 @@ def main():
else: else:
# if no refresh time, a default of 0 is used (which means "infinite") # if no refresh time, a default of 0 is used (which means "infinite")
refresh_time = 0 refresh_time = 0
if args.depth:
depth = int(args.depth)
else:
depth = 1
gc.sync_only = True gc.sync_only = True
lists = gc.list_lists() lists = gc.list_lists()
# We will fetch all the lists except "archives" and "history" # We will fetch all the lists except "archives" and "history"
@ -2672,14 +2702,14 @@ def main():
# We start with the "subscribed" as we need to find new items # We start with the "subscribed" as we need to find new items
if "subscribed" in lists: if "subscribed" in lists:
lists.remove("subscribed") lists.remove("subscribed")
fetch_list("subscribed",validity=refresh_time,tourchildren=True) fetch_list("subscribed",validity=refresh_time,depth=depth,tourchildren=True)
#Then the fetch list (item are removed from the list after fetch) #Then the fetch list (item are removed from the list after fetch)
if "to_fetch" in lists: if "to_fetch" in lists:
lists.remove("to_fetch") lists.remove("to_fetch")
fetch_list("to_fetch",validity=refresh_time,tourandremove=True) fetch_list("to_fetch",validity=refresh_time,depth=depth,tourandremove=True)
#then we fetch all the rest (including bookmarks and tour) #then we fetch all the rest (including bookmarks and tour)
for l in lists: for l in lists:
fetch_list(l,validity=refresh_time) fetch_list(l,validity=refresh_time,depth=depth)
gc.onecmd("blackbox") gc.onecmd("blackbox")
else: else: