From ef808afe1dd98b2fe401f03856c909f46a4ff7af Mon Sep 17 00:00:00 2001 From: Lionel Dricot Date: Sat, 8 Jul 2023 00:43:56 +0200 Subject: [PATCH] netcache now downloads http --- netcache.py | 156 ++++++++++++++++++++++++++++++++++++++++++++++++++-- offpunk.py | 108 ------------------------------------ 2 files changed, 150 insertions(+), 114 deletions(-) mode change 100644 => 100755 netcache.py diff --git a/netcache.py b/netcache.py old mode 100644 new mode 100755 index 96b65c8..19b008c --- a/netcache.py +++ b/netcache.py @@ -1,19 +1,51 @@ #!/bin/python import os -import urllib +import urllib.parse import argparse +import requests _home = os.path.expanduser('~') cache_home = os.environ.get('XDG_CACHE_HOME') or\ os.path.join(_home,'.cache') -_CACHE_PATH = os.path.join(cache_home,"offpunk/") +#_CACHE_PATH = os.path.join(cache_home,"offpunk/") +#Debug: +_CACHE_PATH = "/home/ploum/dev/netcache/" if not os.path.exists(_CACHE_PATH): print("Creating cache directory {}".format(_CACHE_PATH)) os.makedirs(_CACHE_PATH) +# This list is also used as a list of supported protocols +standard_ports = { + "gemini" : 1965, + "gopher" : 70, + "finger" : 79, + "http" : 80, + "https" : 443, + "spartan": 300, +} +default_protocol = "gemini" + +def parse_mime(mime): + options = {} + if mime: + if ";" in mime: + splited = mime.split(";",maxsplit=1) + mime = splited[0] + if len(splited) >= 1: + options_list = splited[1].split() + for o in options_list: + spl = o.split("=",maxsplit=1) + if len(spl) > 0: + options[spl[0]] = spl[1] + return mime, options + +def normalize_url(url): + if "://" not in url and ("./" not in url and url[0] != "/"): + if not url.startswith("mailto:"): + url = "gemini://" + url + return url -#def get(url,max_size_download=None,timeout=None): def cache_last_modified(url): path = get_cache_path(url) @@ -62,8 +94,10 @@ def get_cache_path(url): parsed = urllib.parse.urlparse(url) if url[0] == "/" or url.startswith("./"): scheme = "file" - else: + elif parsed.scheme: scheme = parsed.scheme + else: + scheme = default_protocol if scheme in ["file","mailto","list"]: local = True host = "" @@ -151,6 +185,113 @@ def get_cache_path(url): cache_path += "/" + index return cache_path +def write_body(url,body,mime=None): + ## body is a copy of the raw gemtext + ## Write_body() also create the cache ! + # DEFAULT GEMINI MIME + mime, options = parse_mime(mime) + cache_path = get_cache_path(url) + if cache_path: + if mime and mime.startswith("text/"): + mode = "w" + else: + mode = "wb" + cache_dir = os.path.dirname(cache_path) + # If the subdirectory already exists as a file (not a folder) + # We remove it (happens when accessing URL/subfolder before + # URL/subfolder/file.gmi. + # This causes loss of data in the cache + # proper solution would be to save "sufolder" as "sufolder/index.gmi" + # If the subdirectory doesn’t exist, we recursively try to find one + # until it exists to avoid a file blocking the creation of folders + root_dir = cache_dir + while not os.path.exists(root_dir): + root_dir = os.path.dirname(root_dir) + if os.path.isfile(root_dir): + os.remove(root_dir) + os.makedirs(cache_dir,exist_ok=True) + with open(cache_path, mode=mode) as f: + f.write(body) + f.close() + return cache_path + +def _fetch_http(url,max_length=None): + def set_error(item,length,max_length): + err = "Size of %s is %s Mo\n"%(item.url,length) + err += "Offpunk only download automatically content under %s Mo\n" %(max_length/1000000) + err += "To retrieve this content anyway, type 'reload'." + item.set_error(err) + return item + header = {} + header["User-Agent"] = "Netcache" + parsed = urllib.parse.urlparse(url) + # Code to translate URLs to better frontends (think twitter.com -> nitter) + #if options["redirects"]: + # netloc = parsed.netloc + # if netloc.startswith("www."): +# netloc = netloc[4:] +# if netloc in self.redirects: +# if self.redirects[netloc] == "blocked": +# text = "This website has been blocked.\n" +# text += "Use the redirect command to unblock it." +# gi.write_body(text,"text/gemini") +# return gi +# else: +# parsed = parsed._replace(netloc = self.redirects[netloc]) + url = urllib.parse.urlunparse(parsed) + with requests.get(url,headers=header, stream=True,timeout=5) as response: + #print("This is header for %s"%gi.url) + #print(response.headers) + if "content-type" in response.headers: + mime = response.headers['content-type'] + else: + mime = None + if "content-length" in response.headers: + length = int(response.headers['content-length']) + else: + length = 0 + if max_length and length > max_length: + response.close() + return set_error(gi,str(length/1000000),max_length) + elif max_length and length == 0: + body = b'' + downloaded = 0 + for r in response.iter_content(): + body += r + #We divide max_size for streamed content + #in order to catch them faster + size = sys.getsizeof(body) + max = max_length/2 + current = round(size*100/max,1) + if current > downloaded: + downloaded = current + print(" -> Receiving stream: %s%% of allowed data"%downloaded,end='\r') + #print("size: %s (%s\% of maxlenght)"%(size,size/max_length)) + if size > max_length/2: + response.close() + return set_error(gi,"streaming",max_length) + response.close() + else: + body = response.content + response.close() + if mime and "text/" in mime: + body = body.decode("UTF-8","replace") + cache = write_body(url,body,mime) + return cache + + +def fetch(url): + url = normalize_url(url) + if "://" in url: + scheme = url.split("://")[0] + if scheme in ("http","https"): + path=_fetch_http(url) + print("Path = %s"%path) + else: + print("scheme %s not implemented yet") + else: + print("Not a supproted URL") + def main(): @@ -165,9 +306,12 @@ def main(): # --offline : do not attempt to download, return Null if no cached version # --validity : returns the date of the cached version, Null if no version # --force-download : download and replace cache, even if valid + # --max-size-download : cancel download of items above that size. Returns Null. args = parser.parse_args() - - print("Download URL: %s" %args.url) + + for u in args.url: + print("Download URL: %s" %u) + fetch(u) diff --git a/offpunk.py b/offpunk.py index a166669..87353ba 100755 --- a/offpunk.py +++ b/offpunk.py @@ -56,19 +56,6 @@ try: except ModuleNotFoundError: _HAS_SETPROCTITLE = False -def parse_mime(mime): - options = {} - if mime: - if ";" in mime: - splited = mime.split(";",maxsplit=1) - mime = splited[0] - if len(splited) >= 1: - options_list = splited[1].split() - for o in options_list: - spl = o.split("=",maxsplit=1) - if len(spl) > 0: - options[spl[0]] = spl[1] - return mime, options _HAS_XSEL = shutil.which('xsel') _HAS_XDGOPEN = shutil.which('xdg-open') @@ -213,9 +200,6 @@ _FORMAT_RENDERERS = { class GeminiItem(): def __init__(self, url, name=""): - if "://" not in url and ("./" not in url and url[0] != "/"): - if not url.startswith("mailto:"): - url = "gemini://" + url self.last_mode = None findmode = url.split("##offpunk_mode=") if len(findmode) > 1: @@ -447,35 +431,6 @@ class GeminiItem(): tmpf = cache_path return tmpf - def write_body(self,body,mime=None): - ## body is a copy of the raw gemtext - ## Write_body() also create the cache ! - # DEFAULT GEMINI MIME - self.body = body - self.mime, options = parse_mime(mime) - cache_path = self.get_cache_path() - if not self.local and cache_path: - if self.mime and self.mime.startswith("text/"): - mode = "w" - else: - mode = "wb" - cache_dir = os.path.dirname(self.get_cache_path()) - # If the subdirectory already exists as a file (not a folder) - # We remove it (happens when accessing URL/subfolder before - # URL/subfolder/file.gmi. - # This causes loss of data in the cache - # proper solution would be to save "sufolder" as "sufolder/index.gmi" - # If the subdirectory doesn’t exist, we recursively try to find one - # until it exists to avoid a file blocking the creation of folders - root_dir = cache_dir - while not os.path.exists(root_dir): - root_dir = os.path.dirname(root_dir) - if os.path.isfile(root_dir): - os.remove(root_dir) - os.makedirs(cache_dir,exist_ok=True) - with open(self.get_cache_path(), mode=mode) as f: - f.write(body) - f.close() def get_mime(self): #Beware, this one is really a shaddy ad-hoc function @@ -931,69 +886,6 @@ class GeminiClient(cmd.Cmd): print("Handler program %s not found!" % shlex.split(cmd_str)[0]) print("You can use the ! command to specify another handler program or pipeline.") - def _fetch_http(self,gi,max_length=None): - def set_error(item,length,max_length): - err = "Size of %s is %s Mo\n"%(item.url,length) - err += "Offpunk only download automatically content under %s Mo\n" %(max_length/1000000) - err += "To retrieve this content anyway, type 'reload'." - item.set_error(err) - return item - header = {} - header["User-Agent"] = "Offpunk browser v%s"%__version__ - parsed = urllib.parse.urlparse(gi.url) - # Code to translate URLs to better frontends (think twitter.com -> nitter) - if self.options["redirects"]: - netloc = parsed.netloc - if netloc.startswith("www."): - netloc = netloc[4:] - if netloc in self.redirects: - if self.redirects[netloc] == "blocked": - text = "This website has been blocked.\n" - text += "Use the redirect command to unblock it." - gi.write_body(text,"text/gemini") - return gi - else: - parsed = parsed._replace(netloc = self.redirects[netloc]) - url = urllib.parse.urlunparse(parsed) - with requests.get(url,headers=header, stream=True,timeout=5) as response: - #print("This is header for %s"%gi.url) - #print(response.headers) - if "content-type" in response.headers: - mime = response.headers['content-type'] - else: - mime = None - if "content-length" in response.headers: - length = int(response.headers['content-length']) - else: - length = 0 - if max_length and length > max_length: - response.close() - return set_error(gi,str(length/1000000),max_length) - elif max_length and length == 0: - body = b'' - downloaded = 0 - for r in response.iter_content(): - body += r - #We divide max_size for streamed content - #in order to catch them faster - size = sys.getsizeof(body) - max = max_length/2 - current = round(size*100/max,1) - if current > downloaded: - downloaded = current - print(" -> Receiving stream: %s%% of allowed data"%downloaded,end='\r') - #print("size: %s (%s\% of maxlenght)"%(size,size/max_length)) - if size > max_length/2: - response.close() - return set_error(gi,"streaming",max_length) - response.close() - else: - body = response.content - response.close() - if mime and "text/" in mime: - body = body.decode("UTF-8","replace") - gi.write_body(body,mime) - return gi def _fetch_gopher(self,gi,timeout=10): if not looks_like_url(gi.url):