adding depth and support for img in html
This commit is contained in:
parent
db991b2fd2
commit
5d19662e3e
|
@ -1,9 +1,10 @@
|
||||||
# Offpunk History
|
# Offpunk History
|
||||||
|
|
||||||
## 0.2.1 or 0.3 - Unreleased
|
## 0.2.1 or 0.3 - Unreleased
|
||||||
|
New Features:
|
||||||
- "less full" allows to see the full html page instead of only the article view
|
- "less full" allows to see the full html page instead of only the article view
|
||||||
Small improvements:
|
- Option --depth to customize your sync. Be warned, more than 1 is crazy.
|
||||||
|
Other Small Improvements:
|
||||||
- Disabled https_everywhere by default (caching problems and some websites not supporting it)
|
- Disabled https_everywhere by default (caching problems and some websites not supporting it)
|
||||||
- Modified --sync logic to make it more intuitive (thanks Bjorn Westergard)
|
- Modified --sync logic to make it more intuitive (thanks Bjorn Westergard)
|
||||||
- Caching more problems to avoid refetch
|
- Caching more problems to avoid refetch
|
50
offpunk.py
50
offpunk.py
|
@ -335,6 +335,13 @@ class HtmlRenderer():
|
||||||
# This method recursively parses the HTML
|
# This method recursively parses the HTML
|
||||||
r_body = ""
|
r_body = ""
|
||||||
links = []
|
links = []
|
||||||
|
def sanitize_string(string):
|
||||||
|
toreturn = string.replace("\n", " ").replace("\t"," ")
|
||||||
|
while "  " in toreturn:
|
||||||
|
toreturn = toreturn.replace("  "," ")
|
||||||
|
toreturn = toreturn.strip("\n").strip("\t").strip()
|
||||||
|
toreturn = toreturn.replace(" "," ")
|
||||||
|
return toreturn
|
||||||
def recursive_render(element,indent=""):
|
def recursive_render(element,indent=""):
|
||||||
rendered_body = ""
|
rendered_body = ""
|
||||||
#print("rendering %s - %s with indent %s" %(element.name,element.string,indent))
|
#print("rendering %s - %s with indent %s" %(element.name,element.string,indent))
|
||||||
|
@ -374,7 +381,7 @@ class HtmlRenderer():
|
||||||
temp_str += recursive_render(child,indent=indent)
|
temp_str += recursive_render(child,indent=indent)
|
||||||
rendered_body = temp_str + "\n\n"
|
rendered_body = temp_str + "\n\n"
|
||||||
elif element.name == "a":
|
elif element.name == "a":
|
||||||
text = element.get_text().strip()
|
text = sanitize_string(element.get_text())
|
||||||
link = element.get('href')
|
link = element.get('href')
|
||||||
if link:
|
if link:
|
||||||
links.append(link+" "+text)
|
links.append(link+" "+text)
|
||||||
|
@ -383,13 +390,25 @@ class HtmlRenderer():
|
||||||
else:
|
else:
|
||||||
#No real link found
|
#No real link found
|
||||||
rendered_body = text
|
rendered_body = text
|
||||||
|
elif element.name == "img":
|
||||||
|
alt = element.get("alt")
|
||||||
|
if alt:
|
||||||
|
alt = sanitize_string(alt)
|
||||||
|
text = "[IMG] %s"%alt
|
||||||
|
else:
|
||||||
|
text = "[IMG]"
|
||||||
|
src = element.get("src")
|
||||||
|
if src:
|
||||||
|
links.append(src+" "+text)
|
||||||
|
link_id = " [%s]"%(len(links))
|
||||||
|
rendered_body = "\x1b[33m\x1b[2m " + text + link_id + "\x1b[0m\n"
|
||||||
elif element.name == "br":
|
elif element.name == "br":
|
||||||
rendered_body = "\n"
|
rendered_body = "\n"
|
||||||
elif element.string:
|
elif element.string:
|
||||||
#print("tag without children:",element.name)
|
#print("tag without children:",element.name)
|
||||||
#print("string : **%s** "%element.string.strip())
|
#print("string : **%s** "%element.string.strip())
|
||||||
#print("########")
|
#print("########")
|
||||||
rendered_body = element.string.strip("\n").strip("\t")
|
rendered_body = sanitize_string(element.string)
|
||||||
else:
|
else:
|
||||||
#print("tag children:",element.name)
|
#print("tag children:",element.name)
|
||||||
for child in element.children:
|
for child in element.children:
|
||||||
|
@ -435,8 +454,10 @@ class HtmlRenderer():
|
||||||
first_line = ""
|
first_line = ""
|
||||||
while first_line == "" and len(lines) > 0:
|
while first_line == "" and len(lines) > 0:
|
||||||
first_line = lines.pop(0)
|
first_line = lines.pop(0)
|
||||||
if self.get_title() not in first_line:
|
if self.get_title()[:79] not in first_line:
|
||||||
r_body = "\x1b[1;34m\x1b[4m" + self.get_title() + "\x1b[0m""\n" + r_body
|
title = "\x1b[1;34m\x1b[4m" + self.get_title() + "\x1b[0m""\n"
|
||||||
|
title = textwrap.fill(title,80)
|
||||||
|
r_body = title + "\n" + r_body
|
||||||
return r_body,links
|
return r_body,links
|
||||||
|
|
||||||
# Mapping mimetypes with renderers
|
# Mapping mimetypes with renderers
|
||||||
|
@ -2530,6 +2551,8 @@ def main():
|
||||||
help='assume-yes when asked questions about certificates/redirections during sync')
|
help='assume-yes when asked questions about certificates/redirections during sync')
|
||||||
parser.add_argument('--fetch-later', action='store_true',
|
parser.add_argument('--fetch-later', action='store_true',
|
||||||
help='run non-interactively with an URL as argument to fetch it later')
|
help='run non-interactively with an URL as argument to fetch it later')
|
||||||
|
parser.add_argument('--depth',
|
||||||
|
help='depth of the cache to build. Default is 1. More is crazy. Use at your own risks!')
|
||||||
parser.add_argument('--cache-validity',
|
parser.add_argument('--cache-validity',
|
||||||
help='duration for which a cache is valid before sync (seconds)')
|
help='duration for which a cache is valid before sync (seconds)')
|
||||||
parser.add_argument('--version', action='store_true',
|
parser.add_argument('--version', action='store_true',
|
||||||
|
@ -2635,9 +2658,12 @@ def main():
|
||||||
add_to_tour(gitem)
|
add_to_tour(gitem)
|
||||||
#Now, recursive call, even if we didn’t refresh the cache
|
#Now, recursive call, even if we didn’t refresh the cache
|
||||||
if depth > 0:
|
if depth > 0:
|
||||||
d = depth - 1
|
#we only savetotour at the first level of recursion
|
||||||
|
if depth > 1:
|
||||||
|
savetotour=False
|
||||||
links = gitem.get_links()
|
links = gitem.get_links()
|
||||||
subcount = [0,len(links)]
|
subcount = [0,len(links)]
|
||||||
|
d = depth - 1
|
||||||
for k in links:
|
for k in links:
|
||||||
#recursive call (validity is always 0 in recursion)
|
#recursive call (validity is always 0 in recursion)
|
||||||
substri = strin + " -->"
|
substri = strin + " -->"
|
||||||
|
@ -2645,14 +2671,14 @@ def main():
|
||||||
fetch_gitem(k,depth=d,validity=0,savetotour=savetotour,\
|
fetch_gitem(k,depth=d,validity=0,savetotour=savetotour,\
|
||||||
count=subcount,strin=substri)
|
count=subcount,strin=substri)
|
||||||
|
|
||||||
def fetch_list(list,validity=0,tourandremove=False,tourchildren=False):
|
def fetch_list(list,validity=0,depth=1,tourandremove=False,tourchildren=False):
|
||||||
links = gc.list_get_links(list)
|
links = gc.list_get_links(list)
|
||||||
end = len(links)
|
end = len(links)
|
||||||
counter = 0
|
counter = 0
|
||||||
print(" * * * %s to fetch in %s * * *" %(end,list))
|
print(" * * * %s to fetch in %s * * *" %(end,list))
|
||||||
for l in links:
|
for l in links:
|
||||||
counter += 1
|
counter += 1
|
||||||
fetch_gitem(l,depth=1,validity=validity,savetotour=tourchildren,count=[counter,end])
|
fetch_gitem(l,depth=depth,validity=validity,savetotour=tourchildren,count=[counter,end])
|
||||||
if tourandremove:
|
if tourandremove:
|
||||||
add_to_tour(l)
|
add_to_tour(l)
|
||||||
gc.list_rm_url(l.url,list)
|
gc.list_rm_url(l.url,list)
|
||||||
|
@ -2662,6 +2688,10 @@ def main():
|
||||||
else:
|
else:
|
||||||
# if no refresh time, a default of 0 is used (which means "infinite")
|
# if no refresh time, a default of 0 is used (which means "infinite")
|
||||||
refresh_time = 0
|
refresh_time = 0
|
||||||
|
if args.depth:
|
||||||
|
depth = int(args.depth)
|
||||||
|
else:
|
||||||
|
depth = 1
|
||||||
gc.sync_only = True
|
gc.sync_only = True
|
||||||
lists = gc.list_lists()
|
lists = gc.list_lists()
|
||||||
# We will fetch all the lists except "archives" and "history"
|
# We will fetch all the lists except "archives" and "history"
|
||||||
|
@ -2672,14 +2702,14 @@ def main():
|
||||||
# We start with the "subscribed" as we need to find new items
|
# We start with the "subscribed" as we need to find new items
|
||||||
if "subscribed" in lists:
|
if "subscribed" in lists:
|
||||||
lists.remove("subscribed")
|
lists.remove("subscribed")
|
||||||
fetch_list("subscribed",validity=refresh_time,tourchildren=True)
|
fetch_list("subscribed",validity=refresh_time,depth=depth,tourchildren=True)
|
||||||
#Then the fetch list (items are removed from the list after fetch)
|
#Then the fetch list (items are removed from the list after fetch)
|
||||||
if "to_fetch" in lists:
|
if "to_fetch" in lists:
|
||||||
lists.remove("to_fetch")
|
lists.remove("to_fetch")
|
||||||
fetch_list("to_fetch",validity=refresh_time,tourandremove=True)
|
fetch_list("to_fetch",validity=refresh_time,depth=depth,tourandremove=True)
|
||||||
#then we fetch all the rest (including bookmarks and tour)
|
#then we fetch all the rest (including bookmarks and tour)
|
||||||
for l in lists:
|
for l in lists:
|
||||||
fetch_list(l,validity=refresh_time)
|
fetch_list(l,validity=refresh_time,depth=depth)
|
||||||
|
|
||||||
gc.onecmd("blackbox")
|
gc.onecmd("blackbox")
|
||||||
else:
|
else:
|
||||||
|
|
Loading…
Reference in New Issue