Use text encoding declared in the response header, and treat decoding errors as fatal.
This commit is contained in:
parent
50be14a327
commit
e779a45a35
66
av98.py
66
av98.py
|
@ -10,6 +10,7 @@
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import cmd
|
import cmd
|
||||||
|
import cgi
|
||||||
import codecs
|
import codecs
|
||||||
import collections
|
import collections
|
||||||
import fnmatch
|
import fnmatch
|
||||||
|
@ -27,13 +28,6 @@ import urllib.parse
|
||||||
import ssl
|
import ssl
|
||||||
import time
|
import time
|
||||||
|
|
||||||
# Use chardet if it's there, but don't depend on it
|
|
||||||
try:
|
|
||||||
import chardet
|
|
||||||
_HAS_CHARDET = True
|
|
||||||
except ImportError:
|
|
||||||
_HAS_CHARDET = False
|
|
||||||
|
|
||||||
# Command abbreviations
|
# Command abbreviations
|
||||||
_ABBREVS = {
|
_ABBREVS = {
|
||||||
"a": "add",
|
"a": "add",
|
||||||
|
@ -237,13 +231,13 @@ class GeminiClient(cmd.Cmd):
|
||||||
self._debug("Response header: %s." % header)
|
self._debug("Response header: %s." % header)
|
||||||
body = f.read()
|
body = f.read()
|
||||||
status, mime = header.split("\t")
|
status, mime = header.split("\t")
|
||||||
|
mime, mime_options = cgi.parse_header(mime)
|
||||||
# except UnicodeError:
|
if "charset" in mime_options:
|
||||||
# print("""ERROR: Unknown text encoding!
|
try:
|
||||||
#If you know the correct encoding, use e.g. 'set encoding koi8-r' and
|
codecs.lookup(mime_options["charset"])
|
||||||
#try again. Otherwise, install the 'chardet' library for Python 3 to
|
except LookupError:
|
||||||
#enable automatic encoding detection.""")
|
print("Header declared unknown encoding %s" % value)
|
||||||
# return
|
return
|
||||||
|
|
||||||
# Catch network errors which may be recoverable if a redundant
|
# Catch network errors which may be recoverable if a redundant
|
||||||
# mirror is specified
|
# mirror is specified
|
||||||
|
@ -294,8 +288,12 @@ Slow internet connection? Use 'set timeout' to be more patient.""")
|
||||||
## Set file mode
|
## Set file mode
|
||||||
if mime.startswith("text/"):
|
if mime.startswith("text/"):
|
||||||
mode = "w"
|
mode = "w"
|
||||||
encoding = "UTF-8"
|
encoding = mime_options.get("charset", "UTF-8")
|
||||||
body = body.decode(encoding)
|
try:
|
||||||
|
body = body.decode(encoding)
|
||||||
|
except UnicodeError:
|
||||||
|
print("Could not decode response body using %s encoding declared in header!" % encoding)
|
||||||
|
return
|
||||||
else:
|
else:
|
||||||
mode = "wb"
|
mode = "wb"
|
||||||
encoding = None
|
encoding = None
|
||||||
|
@ -390,42 +388,6 @@ Slow internet connection? Use 'set timeout' to be more patient.""")
|
||||||
self._debug("Using handler: %s" % cmd_str)
|
self._debug("Using handler: %s" % cmd_str)
|
||||||
return cmd_str
|
return cmd_str
|
||||||
|
|
||||||
def _decode_text(self, f):
|
|
||||||
# Attempt to decode some bytes into a Unicode string.
|
|
||||||
# First of all, try UTF-8 as the default.
|
|
||||||
# If this fails, attempt to autodetect the encoding if chardet
|
|
||||||
# library is installed.
|
|
||||||
# If chardet is not installed, or fails to work, fall back on
|
|
||||||
# the user-specified alternate encoding.
|
|
||||||
# If none of this works, this will raise UnicodeError and it's
|
|
||||||
# up to the caller to handle it gracefully.
|
|
||||||
raw_bytes = f.read()
|
|
||||||
# Try UTF-8 first:
|
|
||||||
try:
|
|
||||||
text = raw_bytes.decode("UTF-8")
|
|
||||||
except UnicodeError:
|
|
||||||
# If we have chardet, try the magic
|
|
||||||
self._debug("Could not decode response as UTF-8.")
|
|
||||||
if _HAS_CHARDET:
|
|
||||||
autodetect = chardet.detect(raw_bytes)
|
|
||||||
# Make sure we're vaguely certain
|
|
||||||
if autodetect["confidence"] > 0.5:
|
|
||||||
self._debug("Trying encoding %s as recommended by chardet." % autodetect["encoding"])
|
|
||||||
text = raw_bytes.decode(autodetect["encoding"])
|
|
||||||
else:
|
|
||||||
# Try the user-specified encoding
|
|
||||||
self._debug("Trying fallback encoding %s." % self.options["encoding"])
|
|
||||||
text = raw_bytes.decode(self.options["encoding"])
|
|
||||||
else:
|
|
||||||
# Try the user-specified encoding
|
|
||||||
text = raw_bytes.decode(self.options["encoding"])
|
|
||||||
if not text.endswith("\n"):
|
|
||||||
text += CRLF
|
|
||||||
new_f = io.StringIO()
|
|
||||||
new_f.write(text)
|
|
||||||
new_f.seek(0)
|
|
||||||
return new_f
|
|
||||||
|
|
||||||
def _handle_index(self, body, menu_gi):
|
def _handle_index(self, body, menu_gi):
|
||||||
self.index = []
|
self.index = []
|
||||||
if self.idx_filename:
|
if self.idx_filename:
|
||||||
|
|
Loading…
Reference in New Issue