netcache now downloads http

Lionel Dricot 2023-07-08 00:43:56 +02:00
parent 3c760c914a
commit ef808afe1d
2 changed files with 150 additions and 114 deletions

netcache.py (Normal file → Executable file)

@@ -1,19 +1,51 @@
#!/bin/python
import os
import sys   # needed by _fetch_http (sys.getsizeof)
import urllib
import urllib.parse
import argparse
import requests

_home = os.path.expanduser('~')
cache_home = os.environ.get('XDG_CACHE_HOME') or \
             os.path.join(_home, '.cache')
#_CACHE_PATH = os.path.join(cache_home, "offpunk/")
#Debug:
_CACHE_PATH = "/home/ploum/dev/netcache/"
if not os.path.exists(_CACHE_PATH):
    print("Creating cache directory {}".format(_CACHE_PATH))
    os.makedirs(_CACHE_PATH)

# This list is also used as a list of supported protocols
standard_ports = {
    "gemini":  1965,
    "gopher":  70,
    "finger":  79,
    "http":    80,
    "https":   443,
    "spartan": 300,
}
default_protocol = "gemini"
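# Illustrative sketch (not part of this commit): standard_ports doubles as
# the supported-protocol list and as a default-port lookup, e.g.:
#   parsed = urllib.parse.urlparse("gemini://example.org/")
#   parsed.scheme in standard_ports     # True -> protocol is supported
#   standard_ports.get(parsed.scheme)   # 1965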
def parse_mime(mime):
    options = {}
    if mime:
        if ";" in mime:
            splited = mime.split(";", maxsplit=1)
            mime = splited[0]
            if len(splited) > 1:
                options_list = splited[1].split()
                for o in options_list:
                    spl = o.split("=", maxsplit=1)
                    if len(spl) > 1:
                        options[spl[0]] = spl[1]
    return mime, options
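# Hypothetical usage of parse_mime (options are whitespace-separated
# key=value pairs after the first ";"):
#   parse_mime("text/gemini;lang=en charset=utf-8")
#   -> ("text/gemini", {"lang": "en", "charset": "utf-8"})
#   parse_mime(None)
#   -> (None, {})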
def normalize_url(url):
    if "://" not in url and ("./" not in url and url[0] != "/"):
        if not url.startswith("mailto:"):
            url = "gemini://" + url
    return url
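# Hypothetical examples of normalize_url's defaulting rules:
#   normalize_url("ploum.net")           -> "gemini://ploum.net"
#   normalize_url("./page.gmi")          -> "./page.gmi"  (relative path, kept)
#   normalize_url("mailto:bob@example")  -> "mailto:bob@example"
#   normalize_url("https://example.com") -> "https://example.com"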
#def get(url,max_size_download=None,timeout=None):

def cache_last_modified(url):
    path = get_cache_path(url)
@@ -62,8 +94,10 @@ def get_cache_path(url):
    parsed = urllib.parse.urlparse(url)
    if url[0] == "/" or url.startswith("./"):
        scheme = "file"
    elif parsed.scheme:
        scheme = parsed.scheme
    else:
        scheme = default_protocol
    if scheme in ["file","mailto","list"]:
        local = True
        host = ""
@@ -151,6 +185,113 @@ def get_cache_path(url):
        cache_path += "/" + index
    return cache_path
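# Sketch of the intended mapping, assuming the elided middle of
# get_cache_path builds _CACHE_PATH/<scheme>/<host>/<path> and appends an
# index filename to directory-like URLs:
#   get_cache_path("gemini://ploum.net/")
#   -> _CACHE_PATH + "gemini/ploum.net/index.gmi"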
def write_body(url, body, mime=None):
    ## body is a copy of the raw gemtext
    ## write_body() also creates the cache!
    #DEFAULT GEMINIMIME
    mime, options = parse_mime(mime)
    cache_path = get_cache_path(url)
    if cache_path:
        if mime and mime.startswith("text/"):
            mode = "w"
        else:
            mode = "wb"
        cache_dir = os.path.dirname(cache_path)
        # If the subdirectory already exists as a file (not a folder),
        # we remove it (happens when accessing URL/subfolder before
        # URL/subfolder/file.gmi). This causes loss of data in the cache;
        # the proper solution would be to save "subfolder" as "subfolder/index.gmi".
        # If the subdirectory doesn't exist, we recursively walk up
        # until an existing path is found, to avoid a file blocking the
        # creation of folders.
        root_dir = cache_dir
        while not os.path.exists(root_dir):
            root_dir = os.path.dirname(root_dir)
        if os.path.isfile(root_dir):
            os.remove(root_dir)
        os.makedirs(cache_dir, exist_ok=True)
        with open(cache_path, mode=mode) as f:
            f.write(body)
    return cache_path
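# Hypothetical round-trip: a "text/..." mime selects text mode, everything
# else is written as bytes, and the cache path is returned:
#   path = write_body("gemini://example.org/", "# Hello\n", "text/gemini;lang=en")
#   # -> _CACHE_PATH + "gemini/example.org/index.gmi" (approximately)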
def _fetch_http(url, max_length=None):
    # Unlike offpunk.py, netcache has no GeminiItem to attach errors to,
    # so we report on stdout and return None instead.
    def set_error(url, length, max_length):
        err = "Size of %s is %s Mo\n" % (url, length)
        err += "Offpunk only automatically downloads content under %s Mo\n" % (max_length/1000000)
        err += "To retrieve this content anyway, type 'reload'."
        print(err)
        return None
    header = {}
    header["User-Agent"] = "Netcache"
    parsed = urllib.parse.urlparse(url)
    # Code to translate URLs to better frontends (think twitter.com -> nitter)
    #if options["redirects"]:
    #    netloc = parsed.netloc
    #    if netloc.startswith("www."):
    #        netloc = netloc[4:]
    #    if netloc in self.redirects:
    #        if self.redirects[netloc] == "blocked":
    #            text = "This website has been blocked.\n"
    #            text += "Use the redirect command to unblock it."
    #            gi.write_body(text,"text/gemini")
    #            return gi
    #        else:
    #            parsed = parsed._replace(netloc = self.redirects[netloc])
    url = urllib.parse.urlunparse(parsed)
    with requests.get(url, headers=header, stream=True, timeout=5) as response:
        #print("This is header for %s" % url)
        #print(response.headers)
        if "content-type" in response.headers:
            mime = response.headers['content-type']
        else:
            mime = None
        if "content-length" in response.headers:
            length = int(response.headers['content-length'])
        else:
            length = 0
        if max_length and length > max_length:
            response.close()
            return set_error(url, str(length/1000000), max_length)
        elif max_length and length == 0:
            body = b''
            downloaded = 0
            for r in response.iter_content():
                body += r
                # We divide max_size for streamed content
                # in order to catch them faster
                size = sys.getsizeof(body)
                max_download = max_length/2
                current = round(size*100/max_download, 1)
                if current > downloaded:
                    downloaded = current
                    print(" -> Receiving stream: %s%% of allowed data" % downloaded, end='\r')
                #print("size: %s (%s%% of max_length)" % (size, size/max_length))
                if size > max_length/2:
                    response.close()
                    return set_error(url, "streaming", max_length)
            response.close()
        else:
            body = response.content
            response.close()
        if mime and "text/" in mime:
            body = body.decode("UTF-8", "replace")
        cache = write_body(url, body, mime)
        return cache
def fetch(url):
    url = normalize_url(url)
    if "://" in url:
        scheme = url.split("://")[0]
        if scheme in ("http","https"):
            path = _fetch_http(url)
            print("Path = %s" % path)
        else:
            print("scheme %s not implemented yet" % scheme)
    else:
        print("Not a supported URL")
def main():

@@ -165,9 +306,12 @@ def main():
    # --offline : do not attempt to download, return Null if no cached version
    # --validity : returns the date of the cached version, Null if no version
    # --force-download : download and replace cache, even if valid
    # --max-size-download : cancel download of items above that size. Returns Null.
    args = parser.parse_args()
    for u in args.url:
        print("Download URL: %s" % u)
        fetch(u)
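A minimal invocation sketch of the new entry point (assuming the positional url argument accepts several values, as the loop above suggests; the flags listed in the comments are not implemented yet):

    ./netcache.py https://example.com/ https://example.org/page.html

Each URL is fetched over HTTP, written to the cache, and its local path is printed.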

offpunk.py

@@ -56,19 +56,6 @@ try:
except ModuleNotFoundError:
    _HAS_SETPROCTITLE = False
def parse_mime(mime):
    options = {}
    if mime:
        if ";" in mime:
            splited = mime.split(";", maxsplit=1)
            mime = splited[0]
            if len(splited) > 1:
                options_list = splited[1].split()
                for o in options_list:
                    spl = o.split("=", maxsplit=1)
                    if len(spl) > 1:
                        options[spl[0]] = spl[1]
    return mime, options
_HAS_XSEL = shutil.which('xsel')
_HAS_XDGOPEN = shutil.which('xdg-open')
@@ -213,9 +200,6 @@ _FORMAT_RENDERERS = {
class GeminiItem():
    def __init__(self, url, name=""):
        if "://" not in url and ("./" not in url and url[0] != "/"):
            if not url.startswith("mailto:"):
                url = "gemini://" + url
        self.last_mode = None
        findmode = url.split("##offpunk_mode=")
        if len(findmode) > 1:
@@ -447,35 +431,6 @@ class GeminiItem():
            tmpf = cache_path
        return tmpf
    def write_body(self, body, mime=None):
        ## body is a copy of the raw gemtext
        ## write_body() also creates the cache!
        #DEFAULT GEMINIMIME
        self.body = body
        self.mime, options = parse_mime(mime)
        cache_path = self.get_cache_path()
        if not self.local and cache_path:
            if self.mime and self.mime.startswith("text/"):
                mode = "w"
            else:
                mode = "wb"
            cache_dir = os.path.dirname(self.get_cache_path())
            # If the subdirectory already exists as a file (not a folder),
            # we remove it (happens when accessing URL/subfolder before
            # URL/subfolder/file.gmi). This causes loss of data in the cache;
            # the proper solution would be to save "subfolder" as "subfolder/index.gmi".
            # If the subdirectory doesn't exist, we recursively walk up
            # until an existing path is found, to avoid a file blocking the
            # creation of folders.
            root_dir = cache_dir
            while not os.path.exists(root_dir):
                root_dir = os.path.dirname(root_dir)
            if os.path.isfile(root_dir):
                os.remove(root_dir)
            os.makedirs(cache_dir, exist_ok=True)
            with open(self.get_cache_path(), mode=mode) as f:
                f.write(body)
    def get_mime(self):
        #Beware, this one is really a shady ad-hoc function
@@ -931,69 +886,6 @@ class GeminiClient(cmd.Cmd):
            print("Handler program %s not found!" % shlex.split(cmd_str)[0])
            print("You can use the ! command to specify another handler program or pipeline.")
    def _fetch_http(self, gi, max_length=None):
        def set_error(item, length, max_length):
            err = "Size of %s is %s Mo\n" % (item.url, length)
            err += "Offpunk only automatically downloads content under %s Mo\n" % (max_length/1000000)
            err += "To retrieve this content anyway, type 'reload'."
            item.set_error(err)
            return item
        header = {}
        header["User-Agent"] = "Offpunk browser v%s" % __version__
        parsed = urllib.parse.urlparse(gi.url)
        # Code to translate URLs to better frontends (think twitter.com -> nitter)
        if self.options["redirects"]:
            netloc = parsed.netloc
            if netloc.startswith("www."):
                netloc = netloc[4:]
            if netloc in self.redirects:
                if self.redirects[netloc] == "blocked":
                    text = "This website has been blocked.\n"
                    text += "Use the redirect command to unblock it."
                    gi.write_body(text, "text/gemini")
                    return gi
                else:
                    parsed = parsed._replace(netloc=self.redirects[netloc])
        url = urllib.parse.urlunparse(parsed)
        with requests.get(url, headers=header, stream=True, timeout=5) as response:
            #print("This is header for %s" % gi.url)
            #print(response.headers)
            if "content-type" in response.headers:
                mime = response.headers['content-type']
            else:
                mime = None
            if "content-length" in response.headers:
                length = int(response.headers['content-length'])
            else:
                length = 0
            if max_length and length > max_length:
                response.close()
                return set_error(gi, str(length/1000000), max_length)
            elif max_length and length == 0:
                body = b''
                downloaded = 0
                for r in response.iter_content():
                    body += r
                    # We divide max_size for streamed content
                    # in order to catch them faster
                    size = sys.getsizeof(body)
                    max_download = max_length/2
                    current = round(size*100/max_download, 1)
                    if current > downloaded:
                        downloaded = current
                        print(" -> Receiving stream: %s%% of allowed data" % downloaded, end='\r')
                    #print("size: %s (%s%% of max_length)" % (size, size/max_length))
                    if size > max_length/2:
                        response.close()
                        return set_error(gi, "streaming", max_length)
                response.close()
            else:
                body = response.content
                response.close()
            if mime and "text/" in mime:
                body = body.decode("UTF-8", "replace")
            gi.write_body(body, mime)
            return gi
    def _fetch_gopher(self, gi, timeout=10):
        if not looks_like_url(gi.url):