fixed vital encoding error in pad urls

2016-01-15 12:06:21 +01:00 · 2016-01-15 12:06:21 +01:00 · 05536c25b3
commit 05536c25b3
parent d41ae5266b
2 changed files with 37 additions and 6 deletions
--- a/etherdump/commands/common.py
+++ b/etherdump/commands/common.py
@ -77,3 +77,34 @@ def progressbar (i, num, label="", file=sys.stderr):
    msg = u"\r{0} {1}/{2} {3}... ".format(bar, (i+1), num, label)
    sys.stderr.write(msg.encode("utf-8"))
    sys.stderr.flush()
+
+
+
+# The unescape() helper below comes from Python developer Fredrik Lundh (author of ElementTree, among other things), who published it on his website; it handles decimal, hex and named entities:
+import re, htmlentitydefs
+##
+# Replaces HTML or XML character references and named entities in a text string with the characters they stand for.
+# References that cannot be resolved (bad number, unknown entity name) are left in the text unchanged.
+# @param text The HTML (or XML) source text.
+# @return The text with references replaced, as a Unicode string, if necessary.
+def unescape(text):
+    def fixup(m):
+        text = m.group(0)
+        if text[:2] == "&#":
+            # numeric character reference: "&#x...;" is parsed as hex, any other "&#...;" as decimal
+            try:
+                if text[:3] == "&#x":  # lowercase "x" only; "&#X..;" goes to the decimal int() below, which raises ValueError
+                    return unichr(int(text[3:-1], 16))
+                else:
+                    return unichr(int(text[2:-1]))
+            except ValueError:
+                pass  # not a usable number: fall through and return the match unchanged
+        else:
+            # named entity: look the name (without "&" and ";") up in htmlentitydefs.name2codepoint
+            try:
+                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
+            except KeyError:
+                pass  # unknown entity name: keep the original match
+        return text # the resolved named entity, or the original match left as is
+    return re.sub("&#?\w+;", fixup, text)
+
--- a/etherdump/commands/pull.py
+++ b/etherdump/commands/pull.py
@ -10,7 +10,8 @@ from time import sleep
 from html5tidy import html5tidy
 import html5lib
 from xml.etree import ElementTree as ET 
-
+# debugging
+# import ElementTree as ET 

 """
 pull(meta):
@ -94,7 +95,7 @@ storing enough information to reconstruct (or understand an error occurred)
                meta['padid'] = padid.encode("utf-8")
                versions = meta["versions"] = []
                versions.append({
-                    "url": padurlbase + padid.encode("utf-8"),
+                    "url": padurlbase + quote(padid.encode("utf-8")), # this quote was really important for dealing with rogue chars like \xa0 in a padid;
                    "type": "pad",
                    "code": 200
                })
@ -174,11 +175,11 @@ storing enough information to reconstruct (or understand an error occurred)
                html = html['data']['html']
                ver["path"] = p+".diff.html"
                ver["url"] = quote(ver["path"])
-                doc = html5lib.parse(html, namespaceHTMLElements=False)
+                doc = html5lib.parse(html.encode("utf-8"), encoding="utf-8", namespaceHTMLElements=False)
                html5tidy(doc, indent=True, title=padid, scripts="../versions.js", links=links)
                with open(ver["path"], "w") as f:
                    # f.write(html.encode("utf-8"))
-                    print (ET.tostring(doc, method="html", encoding="unicode").encode("utf-8"), file=f)
+                    print(ET.tostring(doc, method="html", encoding="utf-8"), file=f)

        # Process text, html, dhtml, all options
        if args.all or args.html:
@ -190,12 +191,11 @@ storing enough information to reconstruct (or understand an error occurred)
                html = html['data']['html']
                ver["path"] = p+".raw.html"
                ver["url"] = quote(ver["path"])
-
                doc = html5lib.parse(html, namespaceHTMLElements=False)
                html5tidy(doc, indent=True, title=padid, scripts="../versions.js", links=links)
                with open(ver["path"], "w") as f:
                    # f.write(html.encode("utf-8"))
-                    print (ET.tostring(doc, method="html", encoding="unicode").encode("utf-8"), file=f)
+                    print (ET.tostring(doc, method="html", encoding="utf-8"), file=f)

        # output meta
        if args.all or args.meta: