From d28a3423d6a9da75453648103f7fac79a26b9962 Mon Sep 17 00:00:00 2001
From: Lionel Dricot
Date: Thu, 3 Mar 2022 16:16:32 +0100
Subject: [PATCH] HTTP downloads which are not added to the tour are now
 limited to 20MB, to avoid fetching large unwanted files

---
 CHANGELOG  |  2 +-
 offpunk.py | 24 ++++++++++++++----------
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 0d2e9ee..96e5891 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,7 +1,7 @@
 # Offpunk History
 
 ## 0.5 - Unreleased
-- When syncing, deep http links with content above 20Mo are not downloaded.
+- HTTP links with content above a configurable limit (20MB by default) are not downloaded during sync (except when explicitly requested)
 - Improving subscriptions with more feedback and better detection
 - Avoid deprecated SSL methods (thanks Phoebos for the report)
 - Fixed multiple crashes
diff --git a/offpunk.py b/offpunk.py
index c613cf0..f54f159 100755
--- a/offpunk.py
+++ b/offpunk.py
@@ -1455,7 +1455,8 @@ class GeminiClient(cmd.Cmd):
             "http_proxy": None,
             "https_everywhere": False,
             "archives_size" : 100,
-            "history_size" : 100
+            "history_size" : 100,
+            "max_size_download" : 20,
         }
         global TERM_WIDTH
         TERM_WIDTH = self.options["width"]
@@ -1486,7 +1487,8 @@ class GeminiClient(cmd.Cmd):
                     (hostname text, address text, fingerprint text,
                     first_seen date, last_seen date, count integer)""")
 
-    def _go_to_gi(self, gi, update_hist=True, check_cache=True, handle=True,readable=True):
+    def _go_to_gi(self, gi, update_hist=True, check_cache=True, handle=True,\
+                        readable=True,limit_size=False):
         """This method might be considered "the heart of Offpunk".
         Everything involved in fetching a gemini resource happens here:
         sending the request over the network, parsing the response,
@@ -1545,9 +1547,9 @@ class GeminiClient(cmd.Cmd):
         try:
             if gi.scheme in ("http", "https"):
                 if self.support_http:
-                    if self.sync_only:
+                    if limit_size:
                         # Let’s cap automatic downloads to 20Mo
-                        max_download = 20000000
+                        max_download = int(self.options["max_size_download"])*1000000
                     else:
                         max_download = None
                     gi = self._fetch_http(gi,max_length=max_download)
@@ -1648,9 +1650,9 @@
         else:
             length = 0
         if max_length and length > max_length:
             response.close()
-            err = "Size of %s is %s ko\n"%(gi.url,length/1000)
-            err += "Offpunk only download automatically content under %s\n" %max_length
+            err = "Size of %s is %.1f MB\n"%(gi.url,length/1000000)
+            err += "Offpunk only downloads content under %s MB automatically\n" %(max_length//1000000)
             err += "To retrieve this content anyway, type 'reload'."
             gi.set_error(err)
             return gi
@@ -3405,16 +3407,18 @@ Argument : duration of cache validity (in seconds)."""
                 #Did we already had a cache (even an old one) ?
                 isnew = not gitem.is_cache_valid()
                 print("%s [%s/%s] Fetch "%(strin,count[0],count[1]),gitem.url,end=endline)
-                self._go_to_gi(gitem,update_hist=False)
+                #If we are not saving to tour, limit the download size
+                limit = not savetotour
+                self._go_to_gi(gitem,update_hist=False,limit_size=limit)
                 if savetotour and isnew and gitem.is_cache_valid():
                     #we add to the next tour only if we managed to cache
                     #the ressource
                     add_to_tour(gitem)
                 #Now, recursive call, even if we didn’t refresh the cache
                 if depth > 0:
-                    #we only savetotour at the first level of recursion
-                    if depth > 1:
-                        savetotour=False
+                    #We should only savetotour at the first level of recursion,
+                    #but that code was removed: for now, we savetotour at every
+                    #level of recursion.
                     links = gitem.get_links()
                     subcount = [0,len(links)]
                     d = depth - 1
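
---

A note for reviewers: below is a minimal, standalone sketch of the capping
behaviour added above, useful for checking the size arithmetic outside
Offpunk. It only approximates the patched code path: Offpunk's own
_fetch_http inspects the Content-Length of the response it already holds,
while this sketch issues a separate HEAD request with the standard library.
The should_skip helper and the example URL are illustrative, not part of
Offpunk.

    import urllib.request

    def max_download_bytes(options):
        # Same arithmetic as the patch: "max_size_download" is in MB.
        return int(options["max_size_download"]) * 1000000

    def should_skip(url, max_length):
        # Compare the advertised Content-Length against the cap, as
        # _fetch_http does before cancelling an oversized download.
        req = urllib.request.Request(url, method="HEAD")
        with urllib.request.urlopen(req) as resp:
            length = int(resp.headers.get("content-length") or 0)
        return max_length is not None and length > max_length

    options = {"max_size_download": 20}
    cap = max_download_bytes(options)  # 20000000 bytes with the default
    print(should_skip("https://example.com/big.iso", cap))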