Use text encoding declared in the response header, and treat decoding errors as fatal.

2019-06-22 16:29:14 +03:00 · 2019-06-22 16:29:14 +03:00 · e779a45a35
parent 50be14a327
commit e779a45a35
1 changed files with 14 additions and 52 deletions
--- a/av98.py
+++ b/av98.py
@ -10,6 +10,7 @@

 import argparse
 import cmd
+import cgi
 import codecs
 import collections
 import fnmatch
@ -27,13 +28,6 @@ import urllib.parse
 import ssl
 import time

-# Use chardet if it's there, but don't depend on it
-try:
-    import chardet
-    _HAS_CHARDET = True
-except ImportError:
-    _HAS_CHARDET = False
-
 # Command abbreviations
 _ABBREVS = {
    "a":    "add",
@ -237,13 +231,13 @@ class GeminiClient(cmd.Cmd):
            self._debug("Response header: %s." % header)
            body = f.read()
            status, mime = header.split("\t")
-
-#            except UnicodeError:
-#                print("""ERROR: Unknown text encoding!
-#If you know the correct encoding, use e.g. 'set encoding koi8-r' and
-#try again.  Otherwise, install the 'chardet' library for Python 3 to
-#enable automatic encoding detection.""")
-#                return
+            mime, mime_options = cgi.parse_header(mime)
+            if "charset" in mime_options:
+                try:
+                    codecs.lookup(mime_options["charset"])
+                except LookupError:
+                    print("Header declared unknown encoding %s" % value)
+                    return

        # Catch network errors which may be recoverable if a redundant
        # mirror is specified
@ -294,8 +288,12 @@ Slow internet connection?  Use 'set timeout' to be more patient.""")
        ## Set file mode
        if mime.startswith("text/"):
            mode = "w"
-            encoding = "UTF-8"
-            body = body.decode(encoding)
+            encoding = mime_options.get("charset", "UTF-8")
+            try:
+                body = body.decode(encoding)
+            except UnicodeError:
+                print("Could not decode response body using %s encoding declared in header!" % encoding)
+                return
        else:
            mode = "wb"
            encoding = None
@ -390,42 +388,6 @@ Slow internet connection?  Use 'set timeout' to be more patient.""")
        self._debug("Using handler: %s" % cmd_str)
        return cmd_str

-    def _decode_text(self, f):
-        # Attempt to decode some bytes into a Unicode string.
-        # First of all, try UTF-8 as the default.
-        # If this fails, attempt to autodetect the encoding if chardet
-        # library is installed.
-        # If chardet is not installed, or fails to work, fall back on
-        # the user-specified alternate encoding.
-        # If none of this works, this will raise UnicodeError and it's
-        # up to the caller to handle it gracefully.
-        raw_bytes = f.read()
-        # Try UTF-8 first:
-        try:
-            text = raw_bytes.decode("UTF-8")
-        except UnicodeError:
-            # If we have chardet, try the magic
-            self._debug("Could not decode response as UTF-8.")
-            if _HAS_CHARDET:
-                autodetect = chardet.detect(raw_bytes)
-                # Make sure we're vaguely certain
-                if autodetect["confidence"] > 0.5:
-                    self._debug("Trying encoding %s as recommended by chardet." % autodetect["encoding"])
-                    text = raw_bytes.decode(autodetect["encoding"])
-                else:
-                    # Try the user-specified encoding
-                    self._debug("Trying fallback encoding %s." % self.options["encoding"])
-                    text = raw_bytes.decode(self.options["encoding"])
-            else:
-                # Try the user-specified encoding
-                text = raw_bytes.decode(self.options["encoding"])
-        if not text.endswith("\n"):
-            text += CRLF
-        new_f = io.StringIO()
-        new_f.write(text)
-        new_f.seek(0)
-        return new_f
-
    def _handle_index(self, body, menu_gi):
        self.index = []
        if self.idx_filename: