forked from solderpunk/AV-98
netcache now downloads http
This commit is contained in:
parent
3c760c914a
commit
ef808afe1d
|
@ -1,19 +1,51 @@
|
|||
#!/bin/python
import argparse
import os
import sys
import urllib
import urllib.parse

import requests
|
||||
|
||||
_home = os.path.expanduser('~')
cache_home = os.environ.get('XDG_CACHE_HOME') or\
            os.path.join(_home, '.cache')
# Cache lives under $XDG_CACHE_HOME/offpunk/ (default: ~/.cache/offpunk/).
# Fix: a hard-coded developer debug path ("/home/ploum/dev/netcache/") was
# left enabled after the debug line, silently overriding the XDG location
# for every user. The debug override is removed.
_CACHE_PATH = os.path.join(cache_home, "offpunk/")

if not os.path.exists(_CACHE_PATH):
    print("Creating cache directory {}".format(_CACHE_PATH))
    os.makedirs(_CACHE_PATH)
|
||||
|
||||
# Default port per scheme. The keys double as the list of supported
# protocols.
standard_ports = dict(
    gemini=1965,
    gopher=70,
    finger=79,
    http=80,
    https=443,
    spartan=300,
)
default_protocol = "gemini"
|
||||
|
||||
def parse_mime(mime):
    """Split a MIME string like "text/gemini; charset=utf-8" into its parts.

    Returns (mime, options): the bare MIME type and a dict built from the
    space-separated "key=value" options after the first ";".

    Fix: an option token without "=" used to raise IndexError (the guard
    checked len(spl) > 0 before reading spl[1]); such tokens are now ignored.
    A falsy mime (None, "") is returned unchanged with empty options.
    """
    options = {}
    if mime and ";" in mime:
        mime, params = mime.split(";", maxsplit=1)
        for opt in params.split():
            key, sep, value = opt.partition("=")
            # Only keep well-formed key=value pairs.
            if sep:
                options[key] = value
    return mime, options
|
||||
|
||||
def normalize_url(url):
    """Prefix a bare host/path URL with the default "gemini://" scheme.

    URLs that already carry a scheme, relative paths ("./..."), absolute
    paths ("/..."), and "mailto:" links are returned unchanged.

    Fix: indexing url[0] raised IndexError on an empty string; an empty
    url is now returned as-is.
    """
    if url and "://" not in url and ("./" not in url and url[0] != "/"):
        if not url.startswith("mailto:"):
            url = "gemini://" + url
    return url
|
||||
|
||||
#def get(url,max_size_download=None,timeout=None):
|
||||
|
||||
def cache_last_modified(url):
|
||||
path = get_cache_path(url)
|
||||
|
@ -62,8 +94,10 @@ def get_cache_path(url):
|
|||
parsed = urllib.parse.urlparse(url)
|
||||
if url[0] == "/" or url.startswith("./"):
|
||||
scheme = "file"
|
||||
else:
|
||||
elif parsed.scheme:
|
||||
scheme = parsed.scheme
|
||||
else:
|
||||
scheme = default_protocol
|
||||
if scheme in ["file","mailto","list"]:
|
||||
local = True
|
||||
host = ""
|
||||
|
@ -151,6 +185,113 @@ def get_cache_path(url):
|
|||
cache_path += "/" + index
|
||||
return cache_path
|
||||
|
||||
def write_body(url, body, mime=None):
    """Write body into the cache file for url and return the cache path.

    body is the raw content (str for text/* MIME types, bytes otherwise);
    this function also creates the cache directory hierarchy as needed.
    Returns the cache path, or the falsy value from get_cache_path() when
    no cache location could be computed.
    """
    # parse_mime strips the ";options" so the bare type drives the file mode.
    mime, options = parse_mime(mime)
    cache_path = get_cache_path(url)
    if cache_path:
        # Text content is written as str, everything else as raw bytes.
        if mime and mime.startswith("text/"):
            mode = "w"
        else:
            mode = "wb"
        cache_dir = os.path.dirname(cache_path)
        # If a path component already exists as a *file* (happens when
        # URL/subfolder was fetched before URL/subfolder/file.gmi), remove
        # it so the directories can be created. This loses that cached data;
        # the proper solution would be to save "subfolder" as
        # "subfolder/index.gmi". We walk up until an existing path is found
        # so a blocking file at any level is detected.
        root_dir = cache_dir
        while not os.path.exists(root_dir):
            root_dir = os.path.dirname(root_dir)
        if os.path.isfile(root_dir):
            os.remove(root_dir)
        os.makedirs(cache_dir, exist_ok=True)
        # Fix: the explicit f.close() inside the with-block was redundant —
        # the context manager already closes the file.
        with open(cache_path, mode=mode) as f:
            f.write(body)
    return cache_path
|
||||
|
||||
def _fetch_http(url, max_length=None):
    """Download url over HTTP(S) into the cache and return the cache path.

    max_length: when set, abort downloads larger than this many bytes.
    Responses without a content-length are streamed and aborted once more
    than max_length/2 bytes have been buffered (half the limit, so endless
    streams are caught early). Returns None when the download was aborted.

    Fixes: the error paths referenced an undefined name "gi" (NameError);
    in this standalone script there is no GeminiItem to attach an error to,
    so we print the message and return None. The builtin "max" is no longer
    shadowed, and sys (needed for getsizeof) is imported at file level.
    """
    def too_big_error(item_url, length, max_length):
        # Report the oversized download; None signals "no cache written".
        err = "Size of %s is %s Mo\n" % (item_url, length)
        err += "Offpunk only download automatically content under %s Mo\n" % (max_length / 1000000)
        err += "To retrieve this content anyway, type 'reload'."
        print(err)
        return None

    header = {}
    header["User-Agent"] = "Netcache"
    parsed = urllib.parse.urlparse(url)
    # TODO: translate URLs to privacy-friendly frontends here
    # (think twitter.com -> nitter), as offpunk's _fetch_http does.
    url = urllib.parse.urlunparse(parsed)
    with requests.get(url, headers=header, stream=True, timeout=5) as response:
        if "content-type" in response.headers:
            mime = response.headers['content-type']
        else:
            mime = None
        if "content-length" in response.headers:
            length = int(response.headers['content-length'])
        else:
            length = 0
        if max_length and length > max_length:
            response.close()
            return too_big_error(url, str(length / 1000000), max_length)
        elif max_length and length == 0:
            # No content-length: stream chunk by chunk, reporting progress
            # against half the allowed size.
            body = b''
            downloaded = 0
            for r in response.iter_content():
                body += r
                size = sys.getsizeof(body)
                limit = max_length / 2
                current = round(size * 100 / limit, 1)
                if current > downloaded:
                    downloaded = current
                    print(" -> Receiving stream: %s%% of allowed data" % downloaded, end='\r')
                if size > limit:
                    response.close()
                    return too_big_error(url, "streaming", max_length)
            response.close()
        else:
            body = response.content
            response.close()
        if mime and "text/" in mime:
            body = body.decode("UTF-8", "replace")
        cache = write_body(url, body, mime)
        return cache
|
||||
|
||||
|
||||
def fetch(url):
    """Normalize url and dispatch it to the protocol-specific fetcher.

    Only http/https are implemented so far; other schemes print a
    not-implemented notice.
    """
    url = normalize_url(url)
    if "://" in url:
        scheme = url.split("://")[0]
        if scheme in ("http", "https"):
            path = _fetch_http(url)
            print("Path = %s" % path)
        else:
            # Fix: the original omitted the % argument and printed the
            # literal "%s" placeholder (also fixed "supproted" typo below).
            print("scheme %s not implemented yet" % scheme)
    else:
        print("Not a supported URL")
|
||||
|
||||
|
||||
def main():
|
||||
|
||||
|
@ -165,9 +306,12 @@ def main():
|
|||
# --offline : do not attempt to download, return Null if no cached version
|
||||
# --validity : returns the date of the cached version, Null if no version
|
||||
# --force-download : download and replace cache, even if valid
|
||||
# --max-size-download : cancel download of items above that size. Returns Null.
|
||||
args = parser.parse_args()
|
||||
|
||||
print("Download URL: %s" %args.url)
|
||||
|
||||
for u in args.url:
|
||||
print("Download URL: %s" %u)
|
||||
fetch(u)
|
||||
|
||||
|
||||
|
||||
|
|
108
offpunk.py
108
offpunk.py
|
@ -56,19 +56,6 @@ try:
|
|||
except ModuleNotFoundError:
|
||||
_HAS_SETPROCTITLE = False
|
||||
|
||||
def parse_mime(mime):
    """Split a MIME string like "text/gemini; charset=utf-8" into its parts.

    Returns (mime, options): the bare MIME type and a dict built from the
    space-separated "key=value" options after the first ";".

    Fix: an option token without "=" used to raise IndexError (the guard
    checked len(spl) > 0 before reading spl[1]); such tokens are now ignored.
    A falsy mime (None, "") is returned unchanged with empty options.
    """
    options = {}
    if mime and ";" in mime:
        mime, params = mime.split(";", maxsplit=1)
        for opt in params.split():
            key, sep, value = opt.partition("=")
            # Only keep well-formed key=value pairs.
            if sep:
                options[key] = value
    return mime, options
|
||||
|
||||
_HAS_XSEL = shutil.which('xsel')
|
||||
_HAS_XDGOPEN = shutil.which('xdg-open')
|
||||
|
@ -213,9 +200,6 @@ _FORMAT_RENDERERS = {
|
|||
class GeminiItem():
|
||||
|
||||
def __init__(self, url, name=""):
|
||||
if "://" not in url and ("./" not in url and url[0] != "/"):
|
||||
if not url.startswith("mailto:"):
|
||||
url = "gemini://" + url
|
||||
self.last_mode = None
|
||||
findmode = url.split("##offpunk_mode=")
|
||||
if len(findmode) > 1:
|
||||
|
@ -447,35 +431,6 @@ class GeminiItem():
|
|||
tmpf = cache_path
|
||||
return tmpf
|
||||
|
||||
def write_body(self,body,mime=None):
|
||||
## body is a copy of the raw gemtext
|
||||
## Write_body() also create the cache !
|
||||
# DEFAULT GEMINI MIME
|
||||
self.body = body
|
||||
self.mime, options = parse_mime(mime)
|
||||
cache_path = self.get_cache_path()
|
||||
if not self.local and cache_path:
|
||||
if self.mime and self.mime.startswith("text/"):
|
||||
mode = "w"
|
||||
else:
|
||||
mode = "wb"
|
||||
cache_dir = os.path.dirname(self.get_cache_path())
|
||||
# If the subdirectory already exists as a file (not a folder)
|
||||
# We remove it (happens when accessing URL/subfolder before
|
||||
# URL/subfolder/file.gmi.
|
||||
# This causes loss of data in the cache
|
||||
# proper solution would be to save "sufolder" as "sufolder/index.gmi"
|
||||
# If the subdirectory doesn’t exist, we recursively try to find one
|
||||
# until it exists to avoid a file blocking the creation of folders
|
||||
root_dir = cache_dir
|
||||
while not os.path.exists(root_dir):
|
||||
root_dir = os.path.dirname(root_dir)
|
||||
if os.path.isfile(root_dir):
|
||||
os.remove(root_dir)
|
||||
os.makedirs(cache_dir,exist_ok=True)
|
||||
with open(self.get_cache_path(), mode=mode) as f:
|
||||
f.write(body)
|
||||
f.close()
|
||||
|
||||
def get_mime(self):
|
||||
#Beware, this one is really a shaddy ad-hoc function
|
||||
|
@ -931,69 +886,6 @@ class GeminiClient(cmd.Cmd):
|
|||
print("Handler program %s not found!" % shlex.split(cmd_str)[0])
|
||||
print("You can use the ! command to specify another handler program or pipeline.")
|
||||
|
||||
def _fetch_http(self,gi,max_length=None):
|
||||
def set_error(item,length,max_length):
|
||||
err = "Size of %s is %s Mo\n"%(item.url,length)
|
||||
err += "Offpunk only download automatically content under %s Mo\n" %(max_length/1000000)
|
||||
err += "To retrieve this content anyway, type 'reload'."
|
||||
item.set_error(err)
|
||||
return item
|
||||
header = {}
|
||||
header["User-Agent"] = "Offpunk browser v%s"%__version__
|
||||
parsed = urllib.parse.urlparse(gi.url)
|
||||
# Code to translate URLs to better frontends (think twitter.com -> nitter)
|
||||
if self.options["redirects"]:
|
||||
netloc = parsed.netloc
|
||||
if netloc.startswith("www."):
|
||||
netloc = netloc[4:]
|
||||
if netloc in self.redirects:
|
||||
if self.redirects[netloc] == "blocked":
|
||||
text = "This website has been blocked.\n"
|
||||
text += "Use the redirect command to unblock it."
|
||||
gi.write_body(text,"text/gemini")
|
||||
return gi
|
||||
else:
|
||||
parsed = parsed._replace(netloc = self.redirects[netloc])
|
||||
url = urllib.parse.urlunparse(parsed)
|
||||
with requests.get(url,headers=header, stream=True,timeout=5) as response:
|
||||
#print("This is header for %s"%gi.url)
|
||||
#print(response.headers)
|
||||
if "content-type" in response.headers:
|
||||
mime = response.headers['content-type']
|
||||
else:
|
||||
mime = None
|
||||
if "content-length" in response.headers:
|
||||
length = int(response.headers['content-length'])
|
||||
else:
|
||||
length = 0
|
||||
if max_length and length > max_length:
|
||||
response.close()
|
||||
return set_error(gi,str(length/1000000),max_length)
|
||||
elif max_length and length == 0:
|
||||
body = b''
|
||||
downloaded = 0
|
||||
for r in response.iter_content():
|
||||
body += r
|
||||
#We divide max_size for streamed content
|
||||
#in order to catch them faster
|
||||
size = sys.getsizeof(body)
|
||||
max = max_length/2
|
||||
current = round(size*100/max,1)
|
||||
if current > downloaded:
|
||||
downloaded = current
|
||||
print(" -> Receiving stream: %s%% of allowed data"%downloaded,end='\r')
|
||||
#print("size: %s (%s\% of maxlenght)"%(size,size/max_length))
|
||||
if size > max_length/2:
|
||||
response.close()
|
||||
return set_error(gi,"streaming",max_length)
|
||||
response.close()
|
||||
else:
|
||||
body = response.content
|
||||
response.close()
|
||||
if mime and "text/" in mime:
|
||||
body = body.decode("UTF-8","replace")
|
||||
gi.write_body(body,mime)
|
||||
return gi
|
||||
|
||||
def _fetch_gopher(self,gi,timeout=10):
|
||||
if not looks_like_url(gi.url):
|
||||
|
|
Loading…
Reference in New Issue