fixed vital encoding error in pad urls
This commit is contained in:
parent
d41ae5266b
commit
05536c25b3
@ -77,3 +77,34 @@ def progressbar (i, num, label="", file=sys.stderr):
|
||||
msg = u"\r{0} {1}/{2} {3}... ".format(bar, (i+1), num, label)
|
||||
sys.stderr.write(msg.encode("utf-8"))
|
||||
sys.stderr.flush()
|
||||
|
||||
|
||||
|
||||
# Fredrik Lundh (author of ElementTree, among other things) published the following function on his website; it handles decimal, hex and named character entities:
|
||||
import re, htmlentitydefs
|
||||
##
|
||||
# Removes HTML or XML character references and entities from a text string.
|
||||
#
|
||||
# @param text The HTML (or XML) source text.
|
||||
# @return The plain text, as a Unicode string, if necessary.
|
||||
def unescape(text):
    """Replace HTML/XML character references and entities in *text*.

    Handles decimal references (``&#65;``), hexadecimal references with
    either prefix case (``&#x41;`` / ``&#X41;``) and named entities
    (``&amp;``). Anything that cannot be decoded -- an unknown entity name,
    malformed digits, or a code point outside the valid range -- is left in
    the output exactly as it appeared.

    :param text: The HTML (or XML) source text.
    :return: The text with every decodable reference replaced by its
        character.
    """
    # Resolve the entity table and the code-point-to-character function
    # under whichever names this interpreter provides (Python 2 vs. 3).
    try:
        from htmlentitydefs import name2codepoint
        to_char = unichr
    except ImportError:
        from html.entities import name2codepoint
        to_char = chr

    def fixup(m):
        ref = m.group(0)
        if ref[:2] == "&#":
            # Numeric character reference.
            try:
                # HTML permits both "&#x" and "&#X" as the hex prefix.
                if ref[:3].lower() == "&#x":
                    return to_char(int(ref[3:-1], 16))
                return to_char(int(ref[2:-1]))
            except (ValueError, OverflowError):
                # Malformed digits or a code point that is out of range
                # (very large values raise OverflowError, not ValueError).
                pass
        else:
            # Named entity.
            try:
                return to_char(name2codepoint[ref[1:-1]])
            except KeyError:
                pass
        return ref  # leave as is

    return re.sub(r"&#?\w+;", fixup, text)
|
||||
|
||||
|
@ -10,7 +10,8 @@ from time import sleep
|
||||
from html5tidy import html5tidy
|
||||
import html5lib
|
||||
from xml.etree import ElementTree as ET
|
||||
|
||||
# debugging
|
||||
# import ElementTree as ET
|
||||
|
||||
"""
|
||||
pull(meta):
|
||||
@ -94,7 +95,7 @@ storing enough information to reconstruct (or understand an error occurred)
|
||||
meta['padid'] = padid.encode("utf-8")
|
||||
versions = meta["versions"] = []
|
||||
versions.append({
|
||||
"url": padurlbase + padid.encode("utf-8"),
|
||||
"url": padurlbase + quote(padid.encode("utf-8")), # this quote was really important for dealing with rogue chars like \xa0 in a padid;
|
||||
"type": "pad",
|
||||
"code": 200
|
||||
})
|
||||
@ -174,11 +175,11 @@ storing enough information to reconstruct (or understand an error occurred)
|
||||
html = html['data']['html']
|
||||
ver["path"] = p+".diff.html"
|
||||
ver["url"] = quote(ver["path"])
|
||||
doc = html5lib.parse(html, namespaceHTMLElements=False)
|
||||
doc = html5lib.parse(html.encode("utf-8"), encoding="utf-8", namespaceHTMLElements=False)
|
||||
html5tidy(doc, indent=True, title=padid, scripts="../versions.js", links=links)
|
||||
with open(ver["path"], "w") as f:
|
||||
# f.write(html.encode("utf-8"))
|
||||
print (ET.tostring(doc, method="html", encoding="unicode").encode("utf-8"), file=f)
|
||||
print(ET.tostring(doc, method="html", encoding="utf-8"), file=f)
|
||||
|
||||
# Process text, html, dhtml, all options
|
||||
if args.all or args.html:
|
||||
@ -190,12 +191,11 @@ storing enough information to reconstruct (or understand an error occurred)
|
||||
html = html['data']['html']
|
||||
ver["path"] = p+".raw.html"
|
||||
ver["url"] = quote(ver["path"])
|
||||
|
||||
doc = html5lib.parse(html, namespaceHTMLElements=False)
|
||||
html5tidy(doc, indent=True, title=padid, scripts="../versions.js", links=links)
|
||||
with open(ver["path"], "w") as f:
|
||||
# f.write(html.encode("utf-8"))
|
||||
print (ET.tostring(doc, method="html", encoding="unicode").encode("utf-8"), file=f)
|
||||
print (ET.tostring(doc, method="html", encoding="utf-8"), file=f)
|
||||
|
||||
# output meta
|
||||
if args.all or args.meta:
|
||||
|
Loading…
Reference in New Issue
Block a user