fixed vital encoding error in pad urls

This commit is contained in:
Michael Murtaugh 2016-01-15 12:06:21 +01:00
parent d41ae5266b
commit 05536c25b3
2 changed files with 37 additions and 6 deletions

View File

@ -77,3 +77,34 @@ def progressbar (i, num, label="", file=sys.stderr):
msg = u"\r{0} {1}/{2} {3}... ".format(bar, (i+1), num, label)
sys.stderr.write(msg.encode("utf-8"))
sys.stderr.flush()
# unescape() below is adapted from a function published by Fredrik Lundh (author of ElementTree, among other things) on his website; it handles decimal, hex and named entities:
import re, htmlentitydefs
##
# Removes HTML or XML character references and entities from a text string.
#
# @param text The HTML (or XML) source text.
# @return The plain text, as a Unicode string, if necessary.
def unescape(text):
    """Replace HTML/XML character references and named entities in *text*.

    Handles decimal references (``&#65;``), hexadecimal references
    (``&#x41;`` or ``&#X41;``) and named entities (``&amp;``).  Anything
    that cannot be resolved -- an unknown entity name, a malformed number,
    or a code point outside the valid range -- is left in the output
    exactly as it appeared in the input.

    @param text The HTML (or XML) source text.
    @return The text with every resolvable reference replaced by its character.
    """
    # Resolve the Python 2 / Python 3 spellings locally so the function does
    # not depend on which interpreter loaded the module.  The Python 2 names
    # are tried first, so behaviour there is unchanged.
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3
    try:
        from htmlentitydefs import name2codepoint  # Python 2
    except ImportError:
        from html.entities import name2codepoint  # Python 3

    def fixup(m):
        ref = m.group(0)
        if ref[:2] == "&#":
            # Numeric character reference; the "x" marker may be either case.
            try:
                if ref[:3].lower() == "&#x":
                    return to_char(int(ref[3:-1], 16))
                return to_char(int(ref[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: not a number, or outside the valid code point range.
                # OverflowError: number too large to be passed to chr/unichr.
                pass
        else:
            # Named entity.
            try:
                return to_char(name2codepoint[ref[1:-1]])
            except KeyError:
                pass
        return ref  # leave as is

    # Raw string so that "\w" reaches the regex engine as written.
    return re.sub(r"&#?\w+;", fixup, text)

View File

@ -10,7 +10,8 @@ from time import sleep
from html5tidy import html5tidy
import html5lib
from xml.etree import ElementTree as ET
# debugging
# import ElementTree as ET
"""
pull(meta):
@ -94,7 +95,7 @@ storing enough information to reconstruct (or understand an error occurred)
meta['padid'] = padid.encode("utf-8")
versions = meta["versions"] = []
versions.append({
"url": padurlbase + padid.encode("utf-8"),
"url": padurlbase + quote(padid.encode("utf-8")), # this quote was really important for dealing with rogue chars like \xa0 in a padid;
"type": "pad",
"code": 200
})
@ -174,11 +175,11 @@ storing enough information to reconstruct (or understand an error occurred)
html = html['data']['html']
ver["path"] = p+".diff.html"
ver["url"] = quote(ver["path"])
doc = html5lib.parse(html, namespaceHTMLElements=False)
doc = html5lib.parse(html.encode("utf-8"), encoding="utf-8", namespaceHTMLElements=False)
html5tidy(doc, indent=True, title=padid, scripts="../versions.js", links=links)
with open(ver["path"], "w") as f:
# f.write(html.encode("utf-8"))
print (ET.tostring(doc, method="html", encoding="unicode").encode("utf-8"), file=f)
print(ET.tostring(doc, method="html", encoding="utf-8"), file=f)
# Process text, html, dhtml, all options
if args.all or args.html:
@ -190,12 +191,11 @@ storing enough information to reconstruct (or understand an error occurred)
html = html['data']['html']
ver["path"] = p+".raw.html"
ver["url"] = quote(ver["path"])
doc = html5lib.parse(html, namespaceHTMLElements=False)
html5tidy(doc, indent=True, title=padid, scripts="../versions.js", links=links)
with open(ver["path"], "w") as f:
# f.write(html.encode("utf-8"))
print (ET.tostring(doc, method="html", encoding="unicode").encode("utf-8"), file=f)
print (ET.tostring(doc, method="html", encoding="utf-8"), file=f)
# output meta
if args.all or args.meta: