netcache now downloads http

Lionel Dricot 2023-07-08 00:43:56 +02:00
parent 3c760c914a
commit ef808afe1d
2 changed files with 150 additions and 114 deletions

netcache.py (Normal file → Executable file)

@@ -1,19 +1,51 @@
#!/bin/python
import os
import sys   # needed by _fetch_http (sys.getsizeof)
import urllib
import urllib.parse
import argparse
import requests

_home = os.path.expanduser('~')
cache_home = os.environ.get('XDG_CACHE_HOME') or \
             os.path.join(_home, '.cache')
#_CACHE_PATH = os.path.join(cache_home, "offpunk/")
#Debug:
_CACHE_PATH = "/home/ploum/dev/netcache/"
if not os.path.exists(_CACHE_PATH):
    print("Creating cache directory {}".format(_CACHE_PATH))
    os.makedirs(_CACHE_PATH)

# This list is also used as a list of supported protocols
standard_ports = {
    "gemini":  1965,
    "gopher":  70,
    "finger":  79,
    "http":    80,
    "https":   443,
    "spartan": 300,
}
default_protocol = "gemini"
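# Illustrative sketch (not part of this commit): standard_ports doubles as
# the supported-protocol list and as a default-port lookup, e.g.:
#   parsed = urllib.parse.urlparse("gemini://example.org/")
#   parsed.scheme in standard_ports     # True -> protocol is supported
#   standard_ports.get(parsed.scheme)   # 1965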
def parse_mime(mime):
    options = {}
    if mime:
        if ";" in mime:
            splited = mime.split(";", maxsplit=1)
            mime = splited[0]
            if len(splited) > 1:
                options_list = splited[1].split()
                for o in options_list:
                    spl = o.split("=", maxsplit=1)
                    if len(spl) > 1:
                        options[spl[0]] = spl[1]
    return mime, options
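# Hypothetical usage of parse_mime (options are whitespace-separated
# key=value pairs after the first ";"):
#   parse_mime("text/gemini;lang=en charset=utf-8")
#   -> ("text/gemini", {"lang": "en", "charset": "utf-8"})
#   parse_mime(None)
#   -> (None, {})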
def normalize_url(url):
    if "://" not in url and ("./" not in url and url[0] != "/"):
        if not url.startswith("mailto:"):
            url = "gemini://" + url
    return url
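# Hypothetical examples of normalize_url's defaulting rules:
#   normalize_url("ploum.net")           -> "gemini://ploum.net"
#   normalize_url("./page.gmi")          -> "./page.gmi"  (relative path, kept)
#   normalize_url("mailto:bob@example")  -> "mailto:bob@example"
#   normalize_url("https://example.com") -> "https://example.com"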
#def get(url,max_size_download=None,timeout=None):

def cache_last_modified(url):
    path = get_cache_path(url)
@@ -62,8 +94,10 @@ def get_cache_path(url):
    parsed = urllib.parse.urlparse(url)
    if url[0] == "/" or url.startswith("./"):
        scheme = "file"
    elif parsed.scheme:
        scheme = parsed.scheme
    else:
        scheme = default_protocol
    if scheme in ["file","mailto","list"]:
        local = True
        host = ""
@@ -151,6 +185,113 @@ def get_cache_path(url):
        cache_path += "/" + index
    return cache_path
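# Sketch of the intended mapping, assuming the elided middle of
# get_cache_path builds _CACHE_PATH/<scheme>/<host>/<path> and appends an
# index filename to directory-like URLs:
#   get_cache_path("gemini://ploum.net/")
#   -> _CACHE_PATH + "gemini/ploum.net/index.gmi"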
def write_body(url, body, mime=None):
    ## body is a copy of the raw gemtext
    ## write_body() also creates the cache!
    #DEFAULT GEMINIMIME
    mime, options = parse_mime(mime)
    cache_path = get_cache_path(url)
    if cache_path:
        if mime and mime.startswith("text/"):
            mode = "w"
        else:
            mode = "wb"
        cache_dir = os.path.dirname(cache_path)
        # If the subdirectory already exists as a file (not a folder),
        # we remove it (happens when accessing URL/subfolder before
        # URL/subfolder/file.gmi). This causes loss of data in the cache;
        # the proper solution would be to save "subfolder" as "subfolder/index.gmi".
        # If the subdirectory doesn't exist, we recursively walk up
        # until an existing path is found, to avoid a file blocking the
        # creation of folders.
        root_dir = cache_dir
        while not os.path.exists(root_dir):
            root_dir = os.path.dirname(root_dir)
        if os.path.isfile(root_dir):
            os.remove(root_dir)
        os.makedirs(cache_dir, exist_ok=True)
        with open(cache_path, mode=mode) as f:
            f.write(body)
    return cache_path
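# Hypothetical round-trip: a "text/..." mime selects text mode, everything
# else is written as bytes, and the cache path is returned:
#   path = write_body("gemini://example.org/", "# Hello\n", "text/gemini;lang=en")
#   # -> _CACHE_PATH + "gemini/example.org/index.gmi" (approximately)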
def _fetch_http(url, max_length=None):
    # Unlike offpunk.py, netcache has no GeminiItem to attach errors to,
    # so we report on stdout and return None instead.
    def set_error(url, length, max_length):
        err = "Size of %s is %s Mo\n" % (url, length)
        err += "Offpunk only automatically downloads content under %s Mo\n" % (max_length/1000000)
        err += "To retrieve this content anyway, type 'reload'."
        print(err)
        return None
    header = {}
    header["User-Agent"] = "Netcache"
    parsed = urllib.parse.urlparse(url)
    # Code to translate URLs to better frontends (think twitter.com -> nitter)
    #if options["redirects"]:
    #    netloc = parsed.netloc
    #    if netloc.startswith("www."):
    #        netloc = netloc[4:]
    #    if netloc in self.redirects:
    #        if self.redirects[netloc] == "blocked":
    #            text = "This website has been blocked.\n"
    #            text += "Use the redirect command to unblock it."
    #            gi.write_body(text,"text/gemini")
    #            return gi
    #        else:
    #            parsed = parsed._replace(netloc = self.redirects[netloc])
    url = urllib.parse.urlunparse(parsed)
    with requests.get(url, headers=header, stream=True, timeout=5) as response:
        #print("This is header for %s" % url)
        #print(response.headers)
        if "content-type" in response.headers:
            mime = response.headers['content-type']
        else:
            mime = None
        if "content-length" in response.headers:
            length = int(response.headers['content-length'])
        else:
            length = 0
        if max_length and length > max_length:
            response.close()
            return set_error(url, str(length/1000000), max_length)
        elif max_length and length == 0:
            body = b''
            downloaded = 0
            for r in response.iter_content():
                body += r
                # We divide max_size for streamed content
                # in order to catch them faster
                size = sys.getsizeof(body)
                max_download = max_length/2
                current = round(size*100/max_download, 1)
                if current > downloaded:
                    downloaded = current
                    print(" -> Receiving stream: %s%% of allowed data" % downloaded, end='\r')
                #print("size: %s (%s%% of max_length)" % (size, size/max_length))
                if size > max_length/2:
                    response.close()
                    return set_error(url, "streaming", max_length)
            response.close()
        else:
            body = response.content
            response.close()
        if mime and "text/" in mime:
            body = body.decode("UTF-8", "replace")
        cache = write_body(url, body, mime)
        return cache
def fetch(url):
    url = normalize_url(url)
    if "://" in url:
        scheme = url.split("://")[0]
        if scheme in ("http","https"):
            path = _fetch_http(url)
            print("Path = %s" % path)
        else:
            print("scheme %s not implemented yet" % scheme)
    else:
        print("Not a supported URL")
def main():

@@ -165,9 +306,12 @@ def main():
    # --offline : do not attempt to download, return Null if no cached version
    # --validity : returns the date of the cached version, Null if no version
    # --force-download : download and replace cache, even if valid
    # --max-size-download : cancel download of items above that size. Returns Null.
    args = parser.parse_args()
    for u in args.url:
        print("Download URL: %s" % u)
        fetch(u)
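A minimal invocation sketch of the new entry point (assuming the positional url argument accepts several values, as the loop above suggests; the flags listed in the comments are not implemented yet):

    ./netcache.py https://example.com/ https://example.org/page.html

Each URL is fetched over HTTP, written to the cache, and its local path is printed.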

offpunk.py

@@ -56,19 +56,6 @@ try:
except ModuleNotFoundError:
    _HAS_SETPROCTITLE = False
def parse_mime(mime):
    options = {}
    if mime:
        if ";" in mime:
            splited = mime.split(";", maxsplit=1)
            mime = splited[0]
            if len(splited) > 1:
                options_list = splited[1].split()
                for o in options_list:
                    spl = o.split("=", maxsplit=1)
                    if len(spl) > 1:
                        options[spl[0]] = spl[1]
    return mime, options
_HAS_XSEL = shutil.which('xsel')
_HAS_XDGOPEN = shutil.which('xdg-open')
@@ -213,9 +200,6 @@ _FORMAT_RENDERERS = {
class GeminiItem():
    def __init__(self, url, name=""):
        if "://" not in url and ("./" not in url and url[0] != "/"):
            if not url.startswith("mailto:"):
                url = "gemini://" + url
        self.last_mode = None
        findmode = url.split("##offpunk_mode=")
        if len(findmode) > 1:
@@ -447,35 +431,6 @@ class GeminiItem():
            tmpf = cache_path
        return tmpf
    def write_body(self, body, mime=None):
        ## body is a copy of the raw gemtext
        ## write_body() also creates the cache!
        #DEFAULT GEMINIMIME
        self.body = body
        self.mime, options = parse_mime(mime)
        cache_path = self.get_cache_path()
        if not self.local and cache_path:
            if self.mime and self.mime.startswith("text/"):
                mode = "w"
            else:
                mode = "wb"
            cache_dir = os.path.dirname(self.get_cache_path())
            # If the subdirectory already exists as a file (not a folder),
            # we remove it (happens when accessing URL/subfolder before
            # URL/subfolder/file.gmi). This causes loss of data in the cache;
            # the proper solution would be to save "subfolder" as "subfolder/index.gmi".
            # If the subdirectory doesn't exist, we recursively walk up
            # until an existing path is found, to avoid a file blocking the
            # creation of folders.
            root_dir = cache_dir
            while not os.path.exists(root_dir):
                root_dir = os.path.dirname(root_dir)
            if os.path.isfile(root_dir):
                os.remove(root_dir)
            os.makedirs(cache_dir, exist_ok=True)
            with open(self.get_cache_path(), mode=mode) as f:
                f.write(body)
    def get_mime(self):
        #Beware, this one is really a shady ad-hoc function
@@ -931,69 +886,6 @@ class GeminiClient(cmd.Cmd):
            print("Handler program %s not found!" % shlex.split(cmd_str)[0])
            print("You can use the ! command to specify another handler program or pipeline.")
    def _fetch_http(self, gi, max_length=None):
        def set_error(item, length, max_length):
            err = "Size of %s is %s Mo\n" % (item.url, length)
            err += "Offpunk only automatically downloads content under %s Mo\n" % (max_length/1000000)
            err += "To retrieve this content anyway, type 'reload'."
            item.set_error(err)
            return item
        header = {}
        header["User-Agent"] = "Offpunk browser v%s" % __version__
        parsed = urllib.parse.urlparse(gi.url)
        # Code to translate URLs to better frontends (think twitter.com -> nitter)
        if self.options["redirects"]:
            netloc = parsed.netloc
            if netloc.startswith("www."):
                netloc = netloc[4:]
            if netloc in self.redirects:
                if self.redirects[netloc] == "blocked":
                    text = "This website has been blocked.\n"
                    text += "Use the redirect command to unblock it."
                    gi.write_body(text, "text/gemini")
                    return gi
                else:
                    parsed = parsed._replace(netloc=self.redirects[netloc])
        url = urllib.parse.urlunparse(parsed)
        with requests.get(url, headers=header, stream=True, timeout=5) as response:
            #print("This is header for %s" % gi.url)
            #print(response.headers)
            if "content-type" in response.headers:
                mime = response.headers['content-type']
            else:
                mime = None
            if "content-length" in response.headers:
                length = int(response.headers['content-length'])
            else:
                length = 0
            if max_length and length > max_length:
                response.close()
                return set_error(gi, str(length/1000000), max_length)
            elif max_length and length == 0:
                body = b''
                downloaded = 0
                for r in response.iter_content():
                    body += r
                    # We divide max_size for streamed content
                    # in order to catch them faster
                    size = sys.getsizeof(body)
                    max_download = max_length/2
                    current = round(size*100/max_download, 1)
                    if current > downloaded:
                        downloaded = current
                        print(" -> Receiving stream: %s%% of allowed data" % downloaded, end='\r')
                    #print("size: %s (%s%% of max_length)" % (size, size/max_length))
                    if size > max_length/2:
                        response.close()
                        return set_error(gi, "streaming", max_length)
                response.close()
            else:
                body = response.content
                response.close()
            if mime and "text/" in mime:
                body = body.decode("UTF-8", "replace")
            gi.write_body(body, mime)
            return gi
    def _fetch_gopher(self, gi, timeout=10):
        if not looks_like_url(gi.url):