pull with html5tidy and version links
This commit is contained in:
parent
3ee4b8f77c
commit
40a4a90535
166
etherdump/commands/html5tidy.py
Normal file
166
etherdump/commands/html5tidy.py
Normal file
@ -0,0 +1,166 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
from __future__ import print_function
|
||||||
|
from html5lib import parse
|
||||||
|
import os, sys
|
||||||
|
from argparse import ArgumentParser
|
||||||
|
from xml.etree import ElementTree as ET
|
||||||
|
|
||||||
|
|
||||||
|
def etree_indent(elem, level=0):
    """Pretty-print helper: rewrite .text/.tail whitespace in place so the
    tree serializes with one level of indentation per nesting depth."""
    pad = "\n" + level * " "
    children = list(elem)
    if children:
        if not (elem.text and elem.text.strip()):
            elem.text = pad + " "
        if not (elem.tail and elem.tail.strip()):
            elem.tail = pad
        for child in children:
            etree_indent(child, level + 1)
        # dedent the last child's tail so the closing tag lines up with us
        if not (child.tail and child.tail.strip()):
            child.tail = pad
    else:
        if level and not (elem.tail and elem.tail.strip()):
            elem.tail = pad
||||||
|
def get_link_type (url):
    """Guess a MIME type for *url* from its extension; None when unknown."""
    lurl = url.lower()
    # checked in order; first match wins
    table = (
        ((".html", ".htm"), "text/html"),
        ((".txt",), "text/plain"),
        ((".rss",), "application/rss+xml"),
        ((".atom",), "application/atom+xml"),
        ((".json",), "application/json"),
        ((".js", ".jsonp"), "text/javascript"),
    )
    for suffixes, mimetype in table:
        if lurl.endswith(suffixes):
            return mimetype
    return None
||||||
|
def pluralize (x):
    """Return *x* unchanged if it is already a list or tuple, otherwise
    wrap it in a 1-tuple.

    Lets callers pass either a single value or a sequence of values.
    """
    # isinstance (not type()==) also accepts list/tuple subclasses
    if isinstance(x, (list, tuple)):
        return x
    return (x,)
||||||
|
def html5tidy (doc, charset="utf-8", title=None, scripts=None, links=None, indent=False):
    """Normalize an html5lib-parsed ElementTree document in place; return it.

    Ensures the requested <script>, <link>, <meta charset> and <title>
    elements exist in the document <head>, adding them only when missing.

    doc: ElementTree document; assumed to contain a <head> element
        (html5lib always produces one -- TODO confirm for other parsers).
    charset: add <meta charset=...> unless some meta already declares one.
    title: ensure a <title> with this text; None leaves the title alone.
    scripts: a single src string or a sequence of src strings.
    links: iterable of dicts with keys "href", "rel" and optionally
        "type" and "title"; "type" is guessed from the href extension
        when absent. Existing links with the same href are updated
        rather than duplicated.
    indent: pretty-print the whole tree before returning.
    """
    if scripts:
        script_srcs = [x.attrib.get("src") for x in doc.findall(".//script")]
        for src in pluralize(scripts):
            if src not in script_srcs:
                ET.SubElement(doc.find(".//head"), "script", src=src)
                script_srcs.append(src)

    if links:
        # index existing <link>s by href so we update instead of duplicating
        existinglinks = {}
        for elt in doc.findall(".//link"):
            href = elt.attrib.get("href")
            if href:
                existinglinks[href] = elt
        for link in links:
            linktype = link.get("type") or get_link_type(link["href"])
            if link["href"] in existinglinks:
                elt = existinglinks[link["href"]]
                elt.attrib["rel"] = link["rel"]
            else:
                elt = ET.SubElement(doc.find(".//head"), "link", href=link["href"], rel=link["rel"])
            if linktype:
                elt.attrib["type"] = linktype
            if "title" in link:
                elt.attrib["title"] = link["title"]

    if charset:
        # only add a <meta charset> when none is declared yet
        meta_charsets = [x.attrib.get("charset") for x in doc.findall(".//meta") if x.attrib.get("charset") is not None]
        if not meta_charsets:
            ET.SubElement(doc.find(".//head"), "meta", charset=charset)

    if title is not None:
        titleelt = doc.find(".//title")
        # BUGFIX: must test "is None" -- an Element with no child elements
        # is falsy, so "if not titleelt" would append a duplicate <title>
        # even when one already exists.
        if titleelt is None:
            titleelt = ET.SubElement(doc.find(".//head"), "title")
        titleelt.text = title

    if indent:
        etree_indent(doc)
    return doc
|
|
||||||
|
if __name__ == "__main__":
    # CLI: read HTML from a file (or stdin), ensure head elements via
    # html5tidy(), and write the result to --output, in place (--mogrify),
    # or stdout.
    p = ArgumentParser("")
    p.add_argument("input", nargs="?", default=None)
    p.add_argument("--indent", default=False, action="store_true")
    p.add_argument("--mogrify", default=False, action="store_true", help="modify file in place")
    p.add_argument("--method", default="html", help="method, default: html, values: html, xml, text")
    p.add_argument("--output", default=None, help="")
    p.add_argument("--title", default=None, help="ensure/add title tag in head")
    p.add_argument("--charset", default="utf-8", help="ensure/add meta tag with charset")
    p.add_argument("--script", action="append", default=[], help="ensure/add script tag")
    # <link>s, see https://www.w3.org/TR/html5/links.html#links
    # nargs="+" flags take: HREF [TITLE [TYPE]]
    p.add_argument("--stylesheet", action="append", default=[], help="ensure/add style link")
    p.add_argument("--alternate", action="append", default=[], nargs="+", help="ensure/add alternate links (optionally followed by a title and type)")
    p.add_argument("--next", action="append", default=[], nargs="+", help="ensure/add alternate link")
    p.add_argument("--prev", action="append", default=[], nargs="+", help="ensure/add alternate link")
    p.add_argument("--search", action="append", default=[], nargs="+", help="ensure/add search link")
    p.add_argument("--rss", action="append", default=[], nargs="+", help="ensure/add alternate link of type application/rss+xml")
    p.add_argument("--atom", action="append", default=[], nargs="+", help="ensure/add alternate link of type application/atom+xml")

    args = p.parse_args()
    links = []
    def add_links (links, items, rel, _type=None):
        # Normalize CLI link arguments into dicts for html5tidy().
        # Each item is either a bare href string or a list of
        # [href], [href, title] or [href, title, type].
        for href in items:
            d = {}
            d["rel"] = rel
            if _type:
                d["type"] = _type

            if type(href) == list:
                if len(href) == 1:
                    d["href"] = href[0]
                elif len(href) == 2:
                    d["href"] = href[0]
                    d["title"] = href[1]
                elif len(href) == 3:
                    d["href"] = href[0]
                    d["title"] = href[1]
                    d["type"] = href[2]
                else:
                    # malformed (too many values): skip silently
                    continue
            else:
                d["href"] = href

            links.append(d)
    # rel name matches the argparse flag name for these five
    for rel in ("stylesheet", "alternate", "next", "prev", "search"):
        add_links(links, getattr(args, rel), rel)
    for item in args.rss:
        add_links(links, item, rel="alternate", _type="application/rss+xml")
    for item in args.atom:
        add_links(links, item, rel="alternate", _type="application/atom+xml")

    # INPUT
    if args.input:
        fin = open(args.input)
    else:
        fin = sys.stdin

    # html5lib parse; namespaceHTMLElements=False keeps plain tag names
    doc = parse(fin, namespaceHTMLElements=False)
    if fin != sys.stdin:
        fin.close()
    html5tidy(doc, scripts=args.script, links=links, title=args.title, indent=args.indent)

    # OUTPUT
    tmppath = None
    if args.output:
        fout = open(args.output, "w")
    elif args.mogrify:
        # --mogrify: write to a temp file next to the input, swap below
        tmppath = args.input+".tmp"
        fout = open(tmppath, "w")
    else:
        fout = sys.stdout

    print (ET.tostring(doc, method=args.method), file=fout)

    if fout != sys.stdout:
        fout.close()

    if tmppath:
        # keep the original as a "~" backup, move the temp into place
        os.rename(args.input, args.input+"~")
        os.rename(tmppath, args.input)
|
@ -7,6 +7,9 @@ from urllib import urlencode, quote
|
|||||||
from urllib2 import HTTPError
|
from urllib2 import HTTPError
|
||||||
from common import *
|
from common import *
|
||||||
from time import sleep
|
from time import sleep
|
||||||
|
from html5tidy import html5tidy
|
||||||
|
import html5lib
|
||||||
|
from xml.etree import ElementTree as ET
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
@ -54,7 +57,7 @@ def main (args):
|
|||||||
for i, padid in enumerate(padids):
|
for i, padid in enumerate(padids):
|
||||||
# TODO...
|
# TODO...
|
||||||
"""
|
"""
|
||||||
Self-containted documents / and/or document receipts
|
Self-contained documents / and/or document receipts
|
||||||
storing enough information to reconstruct (or understand an error occurred)
|
storing enough information to reconstruct (or understand an error occurred)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@ -136,19 +139,6 @@ storing enough information to reconstruct (or understand an error occurred)
|
|||||||
except OSError:
|
except OSError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Process text, html, dhtml, all options
|
|
||||||
if args.all or args.html:
|
|
||||||
html = getjson(info['apiurl']+'getHTML?'+urlencode(data))
|
|
||||||
ver = {"type": "html"}
|
|
||||||
versions.append(ver)
|
|
||||||
ver["code"] = html["_code"]
|
|
||||||
if html["_code"] == 200:
|
|
||||||
html = html['data']['html']
|
|
||||||
ver["path"] = p+".raw.html"
|
|
||||||
ver["url"] = quote(ver["path"])
|
|
||||||
with open(ver["path"], "w") as f:
|
|
||||||
f.write(html.encode("utf-8"))
|
|
||||||
|
|
||||||
if args.all or args.text:
|
if args.all or args.text:
|
||||||
text = getjson(info['apiurl']+'getText?'+urlencode(data))
|
text = getjson(info['apiurl']+'getText?'+urlencode(data))
|
||||||
ver = {"type": "text"}
|
ver = {"type": "text"}
|
||||||
@ -163,6 +153,17 @@ storing enough information to reconstruct (or understand an error occurred)
|
|||||||
# once the content is settled, compute a hash
|
# once the content is settled, compute a hash
|
||||||
# and link it in the metadata!
|
# and link it in the metadata!
|
||||||
|
|
||||||
|
links = []
|
||||||
|
links.append({"href":"../styles.css", "rel":"stylesheet"})
|
||||||
|
# todo, make this process reflect which files actually were made
|
||||||
|
versionbaseurl = quote(padid.encode("utf-8"))
|
||||||
|
links.append({"href":versions[0]["url"], "rel":"alternate", "type":"text/html", "title":"Etherpad"})
|
||||||
|
links.append({"href":versionbaseurl+".raw.txt", "rel":"alternate", "type":"text/plain", "title":"Plain text"})
|
||||||
|
links.append({"href":versionbaseurl+".raw.html", "rel":"alternate", "type":"text/html", "title":"HTML"})
|
||||||
|
links.append({"href":versionbaseurl+".diff.html", "rel":"alternate", "type":"text/html", "title":"HTML with author colors"})
|
||||||
|
links.append({"href":versionbaseurl+".meta.json", "rel":"alternate", "type":"application/json", "title":"Meta data"})
|
||||||
|
links.append({"href":"../", "rel":"search", "type":"text/html", "title":"Index"})
|
||||||
|
|
||||||
if args.all or args.dhtml:
|
if args.all or args.dhtml:
|
||||||
data['startRev'] = "0"
|
data['startRev'] = "0"
|
||||||
html = getjson(info['apiurl']+'createDiffHTML?'+urlencode(data))
|
html = getjson(info['apiurl']+'createDiffHTML?'+urlencode(data))
|
||||||
@ -173,8 +174,28 @@ storing enough information to reconstruct (or understand an error occurred)
|
|||||||
html = html['data']['html']
|
html = html['data']['html']
|
||||||
ver["path"] = p+".diff.html"
|
ver["path"] = p+".diff.html"
|
||||||
ver["url"] = quote(ver["path"])
|
ver["url"] = quote(ver["path"])
|
||||||
|
doc = html5lib.parse(html, namespaceHTMLElements=False)
|
||||||
|
html5tidy(doc, indent=True, title=padid, scripts="../versions.js", links=links)
|
||||||
with open(ver["path"], "w") as f:
|
with open(ver["path"], "w") as f:
|
||||||
f.write(html.encode("utf-8"))
|
# f.write(html.encode("utf-8"))
|
||||||
|
print (ET.tostring(doc, method="html"), file=f)
|
||||||
|
|
||||||
|
# Process text, html, dhtml, all options
|
||||||
|
if args.all or args.html:
|
||||||
|
html = getjson(info['apiurl']+'getHTML?'+urlencode(data))
|
||||||
|
ver = {"type": "html"}
|
||||||
|
versions.append(ver)
|
||||||
|
ver["code"] = html["_code"]
|
||||||
|
if html["_code"] == 200:
|
||||||
|
html = html['data']['html']
|
||||||
|
ver["path"] = p+".raw.html"
|
||||||
|
ver["url"] = quote(ver["path"])
|
||||||
|
|
||||||
|
doc = html5lib.parse(html, namespaceHTMLElements=False)
|
||||||
|
html5tidy(doc, indent=True, title=padid, scripts="../versions.js", links=links)
|
||||||
|
with open(ver["path"], "w") as f:
|
||||||
|
# f.write(html.encode("utf-8"))
|
||||||
|
print (ET.tostring(doc, method="html"), file=f)
|
||||||
|
|
||||||
# output meta
|
# output meta
|
||||||
if args.all or args.meta:
|
if args.all or args.meta:
|
||||||
|
Loading…
Reference in New Issue
Block a user