fixed vital encoding error in pad urls
This commit is contained in:
parent
d41ae5266b
commit
05536c25b3
@ -77,3 +77,34 @@ def progressbar (i, num, label="", file=sys.stderr):
|
|||||||
msg = u"\r{0} {1}/{2} {3}... ".format(bar, (i+1), num, label)
|
msg = u"\r{0} {1}/{2} {3}... ".format(bar, (i+1), num, label)
|
||||||
sys.stderr.write(msg.encode("utf-8"))
|
sys.stderr.write(msg.encode("utf-8"))
|
||||||
sys.stderr.flush()
|
sys.stderr.flush()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Python developer Fredrik Lundh (author of ElementTree, among other things) has such a function on his website; it works with decimal, hex, and named entities:
|
||||||
|
import re, htmlentitydefs
|
||||||
|
##
|
||||||
|
# Removes HTML or XML character references and entities from a text string.
|
||||||
|
#
|
||||||
|
# @param text The HTML (or XML) source text.
|
||||||
|
# @return The plain text, as a Unicode string, if necessary.
|
||||||
|
def unescape(text):
    """Replace HTML/XML character references and entities in *text*.

    Handles decimal (``&#233;``), hexadecimal (``&#xe9;`` or ``&#XE9;``) and
    named (``&eacute;``) references.  A reference that cannot be resolved
    (unknown entity name, malformed number, code point out of range) is
    left in the text unchanged.

    @param text The HTML (or XML) source text.
    @return The text with every resolvable reference replaced by the
            character it denotes.
    """
    # Resolve the Python 2 names first (this file targets Python 2) and fall
    # back to their Python 3 equivalents so the function works on either.
    try:
        from htmlentitydefs import name2codepoint
    except ImportError:
        from html.entities import name2codepoint
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    def fixup(m):
        ref = m.group(0)
        if ref[:2] == "&#":
            # Numeric character reference; HTML accepts both "x" and "X"
            # as the hexadecimal marker.
            try:
                if ref[2:3] in ("x", "X"):
                    return to_char(int(ref[3:-1], 16))
                return to_char(int(ref[2:-1]))
            except (ValueError, OverflowError):
                # Malformed digits raise ValueError; an out-of-range code
                # point raises ValueError or, for huge numbers, OverflowError.
                pass
        else:
            # Named entity.
            try:
                return to_char(name2codepoint[ref[1:-1]])
            except KeyError:
                pass
        return ref  # leave as is

    return re.sub(r"&#?\w+;", fixup, text)
|
||||||
|
|
||||||
|
@ -10,7 +10,8 @@ from time import sleep
|
|||||||
from html5tidy import html5tidy
|
from html5tidy import html5tidy
|
||||||
import html5lib
|
import html5lib
|
||||||
from xml.etree import ElementTree as ET
|
from xml.etree import ElementTree as ET
|
||||||
|
# debugging
|
||||||
|
# import ElementTree as ET
|
||||||
|
|
||||||
"""
|
"""
|
||||||
pull(meta):
|
pull(meta):
|
||||||
@ -94,7 +95,7 @@ storing enough information to reconstruct (or understand an error occurred)
|
|||||||
meta['padid'] = padid.encode("utf-8")
|
meta['padid'] = padid.encode("utf-8")
|
||||||
versions = meta["versions"] = []
|
versions = meta["versions"] = []
|
||||||
versions.append({
|
versions.append({
|
||||||
"url": padurlbase + padid.encode("utf-8"),
|
"url": padurlbase + quote(padid.encode("utf-8")), # this quote was really important for dealing with rogue chars like \xa0 in a padid;
|
||||||
"type": "pad",
|
"type": "pad",
|
||||||
"code": 200
|
"code": 200
|
||||||
})
|
})
|
||||||
@ -174,11 +175,11 @@ storing enough information to reconstruct (or understand an error occurred)
|
|||||||
html = html['data']['html']
|
html = html['data']['html']
|
||||||
ver["path"] = p+".diff.html"
|
ver["path"] = p+".diff.html"
|
||||||
ver["url"] = quote(ver["path"])
|
ver["url"] = quote(ver["path"])
|
||||||
doc = html5lib.parse(html, namespaceHTMLElements=False)
|
doc = html5lib.parse(html.encode("utf-8"), encoding="utf-8", namespaceHTMLElements=False)
|
||||||
html5tidy(doc, indent=True, title=padid, scripts="../versions.js", links=links)
|
html5tidy(doc, indent=True, title=padid, scripts="../versions.js", links=links)
|
||||||
with open(ver["path"], "w") as f:
|
with open(ver["path"], "w") as f:
|
||||||
# f.write(html.encode("utf-8"))
|
# f.write(html.encode("utf-8"))
|
||||||
print (ET.tostring(doc, method="html", encoding="unicode").encode("utf-8"), file=f)
|
print(ET.tostring(doc, method="html", encoding="utf-8"), file=f)
|
||||||
|
|
||||||
# Process text, html, dhtml, all options
|
# Process text, html, dhtml, all options
|
||||||
if args.all or args.html:
|
if args.all or args.html:
|
||||||
@ -190,12 +191,11 @@ storing enough information to reconstruct (or understand an error occurred)
|
|||||||
html = html['data']['html']
|
html = html['data']['html']
|
||||||
ver["path"] = p+".raw.html"
|
ver["path"] = p+".raw.html"
|
||||||
ver["url"] = quote(ver["path"])
|
ver["url"] = quote(ver["path"])
|
||||||
|
|
||||||
doc = html5lib.parse(html, namespaceHTMLElements=False)
|
doc = html5lib.parse(html, namespaceHTMLElements=False)
|
||||||
html5tidy(doc, indent=True, title=padid, scripts="../versions.js", links=links)
|
html5tidy(doc, indent=True, title=padid, scripts="../versions.js", links=links)
|
||||||
with open(ver["path"], "w") as f:
|
with open(ver["path"], "w") as f:
|
||||||
# f.write(html.encode("utf-8"))
|
# f.write(html.encode("utf-8"))
|
||||||
print (ET.tostring(doc, method="html", encoding="unicode").encode("utf-8"), file=f)
|
print (ET.tostring(doc, method="html", encoding="utf-8"), file=f)
|
||||||
|
|
||||||
# output meta
|
# output meta
|
||||||
if args.all or args.meta:
|
if args.all or args.meta:
|
||||||
|
Loading…
Reference in New Issue
Block a user