From 93ac6340de55a4df948abef4fee2df7bf171b3bc Mon Sep 17 00:00:00 2001
From: Ben Harris <ben@tilde.team>
Date: Tue, 22 Sep 2020 13:20:20 -0400
Subject: [PATCH] switch to lxml in order to detect CDATA sections

---
 gen_stats | 52 +++++++++++++++++++++++++++++++---------------------
 1 file changed, 31 insertions(+), 21 deletions(-)

diff --git a/gen_stats b/gen_stats
index 98c4ef6..28d9916 100755
--- a/gen_stats
+++ b/gen_stats
@@ -1,14 +1,15 @@
 #!/usr/bin/env python3
-import requests, json
-import xml.etree.ElementTree as xml
+import base64
+import json
+from lxml import etree
 from urllib.parse import quote
 
 WORK_DIR = "/var/www/tilde.chat"
-r = requests.get("http://localhost:8081/stats")
-r.raise_for_status()
 out = {}
-d = xml.fromstring(r.text)
-assert d.tag == "inspircdstats"
+parser = etree.XMLParser(strip_cdata=False)
+root = etree.parse("http://localhost:8081/stats", parser)
+
+assert root.getroot().tag == "inspircdstats"
 
 with open(f"{WORK_DIR}/blacklist", "r") as f:
     BLACKLIST = f.read().splitlines()
@@ -16,39 +17,48 @@ with open(f"{WORK_DIR}/blacklist", "r") as f:
 
 def define(name, xps, vfilter=lambda x: x):
     global out
-    out[name] = vfilter(d.findall(xps)[0].text)
+    out[name] = vfilter(root.findall(xps)[0].text)
+
+
+def unsanitize(node, default=""):
+    # workaround for insp's xml output: CDATA-wrapped values are base64-encoded and must be decoded
+    # https://github.com/inspircd/inspircd/blob/v3.7.0/src/modules/m_httpd_stats.cpp#L55
+    if node.text is None or node.text == "":
+        return default
+    elif str(etree.tostring(node)).startswith(f"b'<{node.tag}><![CDATA["):
+        missing_padding = len(node.text) % 4
+        if missing_padding:
+            v = node.text + "=" * (4 - missing_padding)
+        else:
+            v = node.text
+        return base64.b64decode(v).decode("utf-8")
+
+    return node.text
 
 
 define("usercount", "./general/usercount", int)
 define("channelcount", "./general/channelcount", int)
-schannels = d.findall("./channellist/channel")
+schannels = root.findall("./channellist/channel")
 
 channels = []
 for schannel in schannels:
-    channel = dict(
-        name=schannel.findall("channelname")[0].text,
-        usercount=int(schannel.findall("usercount")[0].text),
-    )
-    channel["topic"] = (
-        schannel.findall("./channeltopic/topictext")[0].text
-        if schannel.findall("./channeltopic/topictext")[0].text is not None
-        else "No topic set"
-    )
+    channel = {}
+    channel["name"] = unsanitize(schannel.find("channelname"))
+    channel["topic"] = unsanitize(schannel.find("./channeltopic/topictext"))
+    channel["usercount"] = int(schannel.find("usercount").text)
+    channel["webchatlink"] = "https://web.tilde.chat/?join=" + quote(channel["name"])
 
     if (
         # skip channels in the blacklist or with mode +s
-        "s" in schannel.findall("./channelmodes")[0].text.split()[0]
+        "s" in schannel.find("./channelmodes").text.split()[0]
         or channel["name"] in BLACKLIST
     ):
         continue
 
-    channel["webchatlink"] = "https://web.tilde.chat/?join=" + quote(channel["name"])
-
     channels.append(channel)
 
 channels.sort(key=lambda x: x["name"].lower())
 out["channels"] = channels
 
-# print([x.text for x in d.findall("./channellist/channel/channeltopic/topictext")])
 with open(f"{WORK_DIR}/stats.json", "w") as f:
     json.dump(out, f)