# Based on Fredrik Lundh's (author of ElementTree) "unescape" recipe, which
# handles decimal, hexadecimal and named character references.
import re

try:
    import htmlentitydefs  # Python 2
except ImportError:
    from html import entities as htmlentitydefs  # Python 3

try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # Python 3

# "&name;", "&#123;" or "&#x7b;". Raw string so that "\w" reaches the regex
# engine untouched; compiled once because unescape() may be called per pad.
_CHAR_REFERENCE_RE = re.compile(r"&#?\w+;")


def _replace_reference(match):
    """Return the character for one matched reference, or the match itself.

    Anything that cannot be decoded (unknown entity name, malformed number,
    code point outside the Unicode range) is returned unchanged so that the
    surrounding text is never damaged.
    """
    ref = match.group(0)
    if ref.startswith("&#"):
        # Numeric character reference: "&#x..;" is hexadecimal, else decimal.
        try:
            if ref.startswith("&#x"):
                return _unichr(int(ref[3:-1], 16))
            return _unichr(int(ref[2:-1]))
        except (ValueError, OverflowError):
            # ValueError: not a number, or code point out of range.
            # OverflowError: number too large for the C-level conversion.
            return ref
    # Named entity such as "&amp;" -- strip the leading "&" and trailing ";".
    try:
        return _unichr(htmlentitydefs.name2codepoint[ref[1:-1]])
    except KeyError:
        return ref  # unknown name: leave as is


def unescape(text):
    """Replace HTML/XML character references and entities in *text*.

    Decimal (``&#65;``), hexadecimal (``&#x41;``) and named (``&amp;``)
    references are converted to the corresponding character. References that
    cannot be resolved are left in place verbatim. The text is processed in a
    single pass, so ``&amp;lt;`` becomes ``&lt;`` rather than ``<``.

    @param text The HTML (or XML) source text.
    @return The plain text, as a Unicode string, if necessary.
    """
    return _CHAR_REFERENCE_RE.sub(_replace_reference, text)
understand an error occurred) html = html['data']['html'] ver["path"] = p+".diff.html" ver["url"] = quote(ver["path"]) - doc = html5lib.parse(html, namespaceHTMLElements=False) + doc = html5lib.parse(html.encode("utf-8"), encoding="utf-8", namespaceHTMLElements=False) html5tidy(doc, indent=True, title=padid, scripts="../versions.js", links=links) with open(ver["path"], "w") as f: # f.write(html.encode("utf-8")) - print (ET.tostring(doc, method="html", encoding="unicode").encode("utf-8"), file=f) + print(ET.tostring(doc, method="html", encoding="utf-8"), file=f) # Process text, html, dhtml, all options if args.all or args.html: @@ -190,12 +191,11 @@ storing enough information to reconstruct (or understand an error occurred) html = html['data']['html'] ver["path"] = p+".raw.html" ver["url"] = quote(ver["path"]) - doc = html5lib.parse(html, namespaceHTMLElements=False) html5tidy(doc, indent=True, title=padid, scripts="../versions.js", links=links) with open(ver["path"], "w") as f: # f.write(html.encode("utf-8")) - print (ET.tostring(doc, method="html", encoding="unicode").encode("utf-8"), file=f) + print (ET.tostring(doc, method="html", encoding="utf-8"), file=f) # output meta if args.all or args.meta: