From e779a45a3564b2fa475280b2c51eec33cb7b2e84 Mon Sep 17 00:00:00 2001
From: Solderpunk <solderpunk@sdf.org>
Date: Sat, 22 Jun 2019 16:29:14 +0300
Subject: [PATCH] Use text encoding declared in the response header, and treat
 decoding errors as fatal.

---
 av98.py | 66 ++++++++++++---------------------------------------------
 1 file changed, 14 insertions(+), 52 deletions(-)

diff --git a/av98.py b/av98.py
index 5d855d1..66dfd10 100755
--- a/av98.py
+++ b/av98.py
@@ -10,6 +10,7 @@
 
 import argparse
 import cmd
+import cgi
 import codecs
 import collections
 import fnmatch
@@ -27,13 +28,6 @@ import urllib.parse
 import ssl
 import time
 
-# Use chardet if it's there, but don't depend on it
-try:
-    import chardet
-    _HAS_CHARDET = True
-except ImportError:
-    _HAS_CHARDET = False
-
 # Command abbreviations
 _ABBREVS = {
     "a":    "add",
@@ -237,13 +231,13 @@ class GeminiClient(cmd.Cmd):
             self._debug("Response header: %s." % header)
             body = f.read()
             status, mime = header.split("\t")
-
-#            except UnicodeError:
-#                print("""ERROR: Unknown text encoding!
-#If you know the correct encoding, use e.g. 'set encoding koi8-r' and
-#try again.  Otherwise, install the 'chardet' library for Python 3 to
-#enable automatic encoding detection.""")
-#                return
+            mime, mime_options = cgi.parse_header(mime)
+            if "charset" in mime_options:
+                try:
+                    codecs.lookup(mime_options["charset"])
+                except LookupError:
+                    print("Header declared unknown encoding %s" % value)
+                    return
 
         # Catch network errors which may be recoverable if a redundant
         # mirror is specified
@@ -294,8 +288,12 @@ Slow internet connection?  Use 'set timeout' to be more patient.""")
         ## Set file mode
         if mime.startswith("text/"):
             mode = "w"
-            encoding = "UTF-8"
-            body = body.decode(encoding)
+            encoding = mime_options.get("charset", "UTF-8")
+            try:
+                body = body.decode(encoding)
+            except UnicodeError:
+                print("Could not decode response body using %s encoding declared in header!" % encoding)
+                return
         else:
             mode = "wb"
             encoding = None
@@ -390,42 +388,6 @@ Slow internet connection?  Use 'set timeout' to be more patient.""")
         self._debug("Using handler: %s" % cmd_str)
         return cmd_str
 
-    def _decode_text(self, f):
-        # Attempt to decode some bytes into a Unicode string.
-        # First of all, try UTF-8 as the default.
-        # If this fails, attempt to autodetect the encoding if chardet
-        # library is installed.
-        # If chardet is not installed, or fails to work, fall back on
-        # the user-specified alternate encoding.
-        # If none of this works, this will raise UnicodeError and it's
-        # up to the caller to handle it gracefully.
-        raw_bytes = f.read()
-        # Try UTF-8 first:
-        try:
-            text = raw_bytes.decode("UTF-8")
-        except UnicodeError:
-            # If we have chardet, try the magic
-            self._debug("Could not decode response as UTF-8.")
-            if _HAS_CHARDET:
-                autodetect = chardet.detect(raw_bytes)
-                # Make sure we're vaguely certain
-                if autodetect["confidence"] > 0.5:
-                    self._debug("Trying encoding %s as recommended by chardet." % autodetect["encoding"])
-                    text = raw_bytes.decode(autodetect["encoding"])
-                else:
-                    # Try the user-specified encoding
-                    self._debug("Trying fallback encoding %s." % self.options["encoding"])
-                    text = raw_bytes.decode(self.options["encoding"])
-            else:
-                # Try the user-specified encoding
-                text = raw_bytes.decode(self.options["encoding"])
-        if not text.endswith("\n"):
-            text += CRLF
-        new_f = io.StringIO()
-        new_f.write(text)
-        new_f.seek(0)
-        return new_f
-
     def _handle_index(self, body, menu_gi):
         self.index = []
         if self.idx_filename: