From ef808afe1dd98b2fe401f03856c909f46a4ff7af Mon Sep 17 00:00:00 2001
From: Lionel Dricot <git@ploum.eu>
Date: Sat, 8 Jul 2023 00:43:56 +0200
Subject: [PATCH] netcache now downloads http

---
 netcache.py | 156 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 offpunk.py  | 108 ------------------------------------
 2 files changed, 150 insertions(+), 114 deletions(-)
 mode change 100644 => 100755 netcache.py

diff --git a/netcache.py b/netcache.py
old mode 100644
new mode 100755
index 96b65c8..19b008c
--- a/netcache.py
+++ b/netcache.py
@@ -1,19 +1,51 @@
 #!/bin/python
 import os
-import urllib
+import urllib.parse
 import argparse
+import requests
 
 _home = os.path.expanduser('~')
 cache_home = os.environ.get('XDG_CACHE_HOME') or\
                 os.path.join(_home,'.cache')
-_CACHE_PATH = os.path.join(cache_home,"offpunk/")
+# Cache root: $XDG_CACHE_HOME/offpunk/ (cache_home falls back to ~/.cache).
+# A developer-specific absolute path must not be hard-coded here.
+_CACHE_PATH = os.path.join(cache_home,"offpunk/")
 
 if not os.path.exists(_CACHE_PATH):
     print("Creating cache directory {}".format(_CACHE_PATH))
     os.makedirs(_CACHE_PATH)
 
+# Default port of each scheme; the keys double as the supported protocols.
+standard_ports = dict(
+    gemini=1965,
+    gopher=70,
+    finger=79,
+    http=80,
+    https=443,
+    spartan=300,
+)
+default_protocol = "gemini"
+
+def parse_mime(mime):
+    """Split "type/sub; k=v; ..." into (type, {k: v}); a falsy mime is returned as is.
+
+    Options lacking "=" are skipped (they used to raise IndexError).
+    """
+    options = {}
+    if mime and ";" in mime:
+        mime, params = mime.split(";",maxsplit=1)
+        for o in params.replace(";"," ").split():
+            key, sep, value = o.partition("=")
+            if sep:
+                options[key] = value
+    return mime, options
+
+def normalize_url(url):
+    """Add gemini:// to bare URLs (no scheme, path or mailto:); "" is left as is."""
+    if url and "://" not in url and "./" not in url and not url.startswith(("/","mailto:")):
+        url = "gemini://" + url
+    return url
 
-#def get(url,max_size_download=None,timeout=None):
 
 def cache_last_modified(url):
     path = get_cache_path(url)
@@ -62,8 +94,10 @@ def get_cache_path(url):
     parsed = urllib.parse.urlparse(url)
     if url[0] == "/" or url.startswith("./"):
         scheme = "file"
-    else:
+    elif parsed.scheme:
         scheme = parsed.scheme
+    else:
+        scheme = default_protocol
     if scheme in ["file","mailto","list"]:
         local = True
         host = ""
@@ -151,6 +185,113 @@ def get_cache_path(url):
             cache_path += "/" + index
     return cache_path
 
+def write_body(url,body,mime=None):
+    """Write body into the cache file of url and return that file's path.
+
+    The file is opened in text mode for a str body and in binary mode
+    otherwise, so a decoded body whose mime does not start with "text/"
+    (e.g. 'multipart/related; type=text/html') no longer raises TypeError.
+    mime is kept for interface compatibility.
+    Returns None when get_cache_path() yields no path for url.
+    """
+    cache_path = get_cache_path(url)
+    if not cache_path:
+        return None
+    mode = "w" if isinstance(body,str) else "wb"
+    cache_dir = os.path.dirname(cache_path)
+    # An ancestor of cache_dir may already exist as a regular file (happens
+    # when URL/subfolder was cached before URL/subfolder/file.gmi). Find the
+    # first existing ancestor and remove it if it is a file, otherwise
+    # makedirs() fails. This loses that cached entry; the proper solution
+    # would be to save "subfolder" as "subfolder/index.gmi".
+    root_dir = cache_dir
+    while root_dir and not os.path.exists(root_dir):
+        root_dir = os.path.dirname(root_dir)
+    if os.path.isfile(root_dir):
+        os.remove(root_dir)
+    os.makedirs(cache_dir,exist_ok=True)
+    # The with block closes the file; no explicit close() is needed.
+    with open(cache_path, mode=mode) as f:
+        f.write(body)
+    return cache_path
+
+def _fetch_http(url,max_length=None):
+    """Download url over http(s) into the cache.
+
+    url: the http:// or https:// URL to retrieve.
+    max_length: maximum accepted size in bytes (None or 0: no limit).
+
+    Returns the path returned by write_body(), or None when the download
+    was cancelled: either the announced content-length exceeds max_length,
+    or a response without content-length streamed more than max_length/2
+    bytes. A message is printed in both cases.
+    Bodies whose content-type contains "text/" are decoded as UTF-8 (invalid
+    bytes replaced) before being written.
+    Exceptions raised by requests (timeout, connection error) propagate.
+    """
+    def too_big(length):
+        # Report a cancelled download; the caller then returns None.
+        err = "Size of %s is %s Mo\n"%(url,length)
+        err += "Offpunk only download automatically content under %s Mo\n" %(max_length/1000000)
+        err += "To retrieve this content anyway, type 'reload'."
+        print(err)
+    header = {}
+    header["User-Agent"] = "Netcache"
+    # TODO: translate URLs to better frontends (think twitter.com -> nitter).
+    # The previous implementation relied on the client's options and redirect
+    # table, which this function does not receive.
+    # Kept from the previous code: round-trip the URL through urlparse/urlunparse.
+    parsed = urllib.parse.urlparse(url)
+    url = urllib.parse.urlunparse(parsed)
+    # Leaving the with block closes the response, including on early return.
+    with requests.get(url,headers=header, stream=True,timeout=5) as response:
+        mime = response.headers.get("content-type")
+        # A missing content-length is treated as an unknown size (0).
+        length = int(response.headers.get("content-length",0))
+        if max_length and length > max_length:
+            too_big(str(length/1000000))
+            return None
+        elif max_length and length == 0:
+            # Unknown size: stream it and stop at half of max_length in
+            # order to catch endless streams faster.
+            limit = max_length/2
+            chunks = []
+            size = 0
+            downloaded = 0
+            for chunk in response.iter_content(chunk_size=8192):
+                chunks.append(chunk)
+                # len() counts payload bytes; sys.getsizeof() also counted the
+                # object header, and sys is not among this file's top imports.
+                size += len(chunk)
+                current = round(size*100/limit,1)
+                if current > downloaded:
+                    downloaded = current
+                    print("  -> Receiving stream: %s%% of allowed data"%downloaded,end='\r')
+                if size > limit:
+                    too_big("streaming")
+                    return None
+            body = b"".join(chunks)
+        else:
+            body = response.content
+    # Only textual bodies are decoded; everything else stays bytes.
+    if mime and "text/" in mime:
+        body = body.decode("UTF-8","replace")
+    cache = write_body(url,body,mime)
+    return cache
+
+
+def fetch(url):
+    """Download url into the cache; return _fetch_http()'s result, or None if unsupported."""
+    url = normalize_url(url)
+    scheme, sep, _ = url.partition("://")
+    if sep and scheme in ("http","https"):
+        path = _fetch_http(url)
+        print("Path = %s"%path)
+        return path
+    # No separator means normalize_url() left a local path or mailto: untouched.
+    print(("scheme %s not implemented yet"%scheme) if sep else "Not a supported URL")
+    return None
+
 
 def main():
     
@@ -165,9 +306,12 @@ def main():
     # --offline : do not attempt to download, return Null if no cached version
     # --validity : returns the date of the cached version, Null if no version
     # --force-download : download and replace cache, even if valid
+    # --max-size-download : cancel download of items above that size. Returns Null.
     args = parser.parse_args()
-
-    print("Download URL: %s" %args.url)
+    
+    for u in args.url:
+        print("Download URL: %s" %u)
+        fetch(u)
 
 
 
diff --git a/offpunk.py b/offpunk.py
index a166669..87353ba 100755
--- a/offpunk.py
+++ b/offpunk.py
@@ -56,19 +56,6 @@ try:
 except ModuleNotFoundError:
     _HAS_SETPROCTITLE = False
 
-def parse_mime(mime):
-    options = {}
-    if mime:
-        if ";" in mime:
-            splited = mime.split(";",maxsplit=1)
-            mime = splited[0]
-            if len(splited) >= 1:
-                options_list = splited[1].split()
-                for o in options_list:
-                    spl = o.split("=",maxsplit=1)
-                    if len(spl) > 0:
-                        options[spl[0]] = spl[1]
-    return mime, options
 
 _HAS_XSEL = shutil.which('xsel')
 _HAS_XDGOPEN = shutil.which('xdg-open')
@@ -213,9 +200,6 @@ _FORMAT_RENDERERS = {
 class GeminiItem():
 
     def __init__(self, url, name=""):
-        if "://" not in url and ("./" not in url and url[0] != "/"):
-            if not url.startswith("mailto:"):
-                url = "gemini://" + url
         self.last_mode = None
         findmode = url.split("##offpunk_mode=")
         if len(findmode) > 1:
@@ -447,35 +431,6 @@ class GeminiItem():
             tmpf = cache_path
         return tmpf
 
-    def write_body(self,body,mime=None):
-        ## body is a copy of the raw gemtext
-        ## Write_body() also create the cache !
-        # DEFAULT GEMINI MIME
-        self.body = body
-        self.mime, options = parse_mime(mime)
-        cache_path = self.get_cache_path()
-        if not self.local and cache_path:
-            if self.mime and self.mime.startswith("text/"):
-                mode = "w"
-            else:
-                mode = "wb"
-            cache_dir = os.path.dirname(self.get_cache_path())
-            # If the subdirectory already exists as a file (not a folder)
-            # We remove it (happens when accessing URL/subfolder before
-            # URL/subfolder/file.gmi.
-            # This causes loss of data in the cache
-            # proper solution would be to save "sufolder" as "sufolder/index.gmi"
-            # If the subdirectory doesn’t exist, we recursively try to find one
-            # until it exists to avoid a file blocking the creation of folders
-            root_dir = cache_dir
-            while not os.path.exists(root_dir):
-                root_dir = os.path.dirname(root_dir)
-            if os.path.isfile(root_dir):
-                os.remove(root_dir)
-            os.makedirs(cache_dir,exist_ok=True)
-            with open(self.get_cache_path(), mode=mode) as f:
-                f.write(body)
-                f.close()
 
     def get_mime(self):
         #Beware, this one is really a shaddy ad-hoc function
@@ -931,69 +886,6 @@ class GeminiClient(cmd.Cmd):
                     print("Handler program %s not found!" % shlex.split(cmd_str)[0])
                     print("You can use the ! command to specify another handler program or pipeline.")
 
-    def _fetch_http(self,gi,max_length=None):
-        def set_error(item,length,max_length):
-            err = "Size of %s is %s Mo\n"%(item.url,length)
-            err += "Offpunk only download automatically content under %s Mo\n" %(max_length/1000000)
-            err += "To retrieve this content anyway, type 'reload'."
-            item.set_error(err)
-            return item
-        header = {}
-        header["User-Agent"] = "Offpunk browser v%s"%__version__
-        parsed = urllib.parse.urlparse(gi.url)
-        # Code to translate URLs to better frontends (think twitter.com -> nitter)
-        if self.options["redirects"]:
-            netloc = parsed.netloc
-            if netloc.startswith("www."):
-                netloc = netloc[4:]
-            if netloc in self.redirects:
-                if self.redirects[netloc] == "blocked":
-                    text = "This website has been blocked.\n"
-                    text += "Use the redirect command to unblock it."
-                    gi.write_body(text,"text/gemini")
-                    return gi
-                else:
-                    parsed = parsed._replace(netloc = self.redirects[netloc])
-        url = urllib.parse.urlunparse(parsed)
-        with requests.get(url,headers=header, stream=True,timeout=5) as response:
-            #print("This is header for %s"%gi.url)
-            #print(response.headers)
-            if "content-type" in response.headers:
-                mime = response.headers['content-type']
-            else:
-                mime = None
-            if "content-length" in response.headers:
-                length = int(response.headers['content-length'])
-            else:
-                length = 0
-            if max_length and length > max_length:
-                response.close()
-                return set_error(gi,str(length/1000000),max_length)
-            elif max_length and length == 0:
-                body = b''
-                downloaded = 0
-                for r in response.iter_content():
-                    body += r
-                    #We divide max_size for streamed content
-                    #in order to catch them faster
-                    size = sys.getsizeof(body)
-                    max = max_length/2
-                    current = round(size*100/max,1)
-                    if current > downloaded:
-                        downloaded = current
-                        print("  -> Receiving stream: %s%% of allowed data"%downloaded,end='\r')
-                    #print("size: %s (%s\% of maxlenght)"%(size,size/max_length))
-                    if size > max_length/2:
-                        response.close()
-                        return set_error(gi,"streaming",max_length)
-                response.close()
-            else:
-                body = response.content
-                response.close()
-        if mime and "text/" in mime:
-            body = body.decode("UTF-8","replace")
-        gi.write_body(body,mime)
-        return gi
 
     def _fetch_gopher(self,gi,timeout=10):
         if not looks_like_url(gi.url):