From 40a4a90535d46239be4fcb5166302a2f023b5bf4 Mon Sep 17 00:00:00 2001
From: Michael Murtaugh
Date: Thu, 14 Jan 2016 18:29:34 +0100
Subject: [PATCH] pull with html5tidy and version links

---
 etherdump/commands/html5tidy.py | 166 ++++++++++++++++++++++++++++++++
 etherdump/commands/pull.py      |  51 +++++++---
 2 files changed, 202 insertions(+), 15 deletions(-)
 create mode 100644 etherdump/commands/html5tidy.py

diff --git a/etherdump/commands/html5tidy.py b/etherdump/commands/html5tidy.py
new file mode 100644
index 0000000..b1fbd32
--- /dev/null
+++ b/etherdump/commands/html5tidy.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python
+from __future__ import print_function
+from html5lib import parse
+import os, sys
+from argparse import ArgumentParser
+from xml.etree import ElementTree as ET
+
+
+def etree_indent(elem, level=0):
+    # classic ElementTree pretty-printing recipe: rewrite text/tail
+    # whitespace so the serialized output comes out indented
+    i = "\n" + level*"  "
+    if len(elem):
+        if not elem.text or not elem.text.strip():
+            elem.text = i + "  "
+        if not elem.tail or not elem.tail.strip():
+            elem.tail = i
+        for elem in elem:
+            etree_indent(elem, level+1)
+        if not elem.tail or not elem.tail.strip():
+            elem.tail = i
+    else:
+        if level and (not elem.tail or not elem.tail.strip()):
+            elem.tail = i
+
+def get_link_type (url):
+    # guess a link's MIME type from its file extension
+    lurl = url.lower()
+    if lurl.endswith(".html") or lurl.endswith(".htm"):
+        return "text/html"
+    elif lurl.endswith(".txt"):
+        return "text/plain"
+    elif lurl.endswith(".rss"):
+        return "application/rss+xml"
+    elif lurl.endswith(".atom"):
+        return "application/atom+xml"
+    elif lurl.endswith(".json"):
+        return "application/json"
+    elif lurl.endswith(".js") or lurl.endswith(".jsonp"):
+        return "text/javascript"
+
+def pluralize (x):
+    # wrap a single value in a tuple so callers may pass one item or a list
+    if type(x) == list or type(x) == tuple:
+        return x
+    else:
+        return (x,)
+
+def html5tidy (doc, charset="utf-8", title=None, scripts=None, links=None, indent=False):
+    # ensure the given scripts, links, charset and title are present in the
+    # document's head, adding elements only when they are missing
+    if scripts:
+        script_srcs = [x.attrib.get("src") for x in doc.findall(".//script")]
+        for src in pluralize(scripts):
+            if src not in script_srcs:
+                script = ET.SubElement(doc.find(".//head"), "script", src=src)
+                script_srcs.append(src)
+
+    if links:
+        existinglinks = {}
+        for elt in doc.findall(".//link"):
+            href = elt.attrib.get("href")
+            if href:
+                existinglinks[href] = elt
+        for link in links:
+            linktype = link.get("type") or get_link_type(link["href"])
+            if link["href"] in existinglinks:
+                elt = existinglinks[link["href"]]
+                elt.attrib["rel"] = link["rel"]
+            else:
+                elt = ET.SubElement(doc.find(".//head"), "link", href=link["href"], rel=link["rel"])
+            if linktype:
+                elt.attrib["type"] = linktype
+            if "title" in link:
+                elt.attrib["title"] = link["title"]
+
+    if charset:
+        meta_charsets = [x.attrib.get("charset") for x in doc.findall(".//meta") if x.attrib.get("charset") is not None]
+        if not meta_charsets:
+            meta = ET.SubElement(doc.find(".//head"), "meta", charset=charset)
+
+    if title is not None:
+        titleelt = doc.find(".//title")
+        # test against None: an ET element with no children is falsy, so
+        # "if not titleelt" would wrongly duplicate an existing <title>
+        if titleelt is None:
+            titleelt = ET.SubElement(doc.find(".//head"), "title")
+        titleelt.text = title
+
+    if indent:
+        etree_indent(doc)
+    return doc
+
+
+if __name__ == "__main__":
+    p = ArgumentParser("")
+    p.add_argument("input", nargs="?", default=None)
+    p.add_argument("--indent", default=False, action="store_true")
+    p.add_argument("--mogrify", default=False, action="store_true", help="modify file in place")
+    p.add_argument("--method", default="html", help="serialization method, default: html, values: html, xml, text")
+    p.add_argument("--output", default=None, help="output file, default: stdout")
+    p.add_argument("--title", default=None, help="ensure/add title tag in head")
+    p.add_argument("--charset", default="utf-8", help="ensure/add meta tag with charset")
+    p.add_argument("--script", action="append", default=[], help="ensure/add script tag")
+    # link types, see https://www.w3.org/TR/html5/links.html#links
+    p.add_argument("--stylesheet", action="append", default=[], help="ensure/add stylesheet link")
+    p.add_argument("--alternate", action="append", default=[], nargs="+", help="ensure/add alternate link (optionally followed by a title and type)")
+    p.add_argument("--next", action="append", default=[], nargs="+", help="ensure/add next link")
+    p.add_argument("--prev", action="append", default=[], nargs="+", help="ensure/add prev link")
+    p.add_argument("--search", action="append", default=[], nargs="+", help="ensure/add search link")
+    p.add_argument("--rss", action="append", default=[], nargs="+", help="ensure/add alternate link of type application/rss+xml")
+    p.add_argument("--atom", action="append", default=[], nargs="+", help="ensure/add alternate link of type application/atom+xml")
+
+    args = p.parse_args()
+    links = []
+
+    def add_links (links, items, rel, _type=None):
+        # normalize CLI values to link dicts; nargs="+" flags may yield
+        # [href], [href, title] or [href, title, type]
+        for href in items:
+            d = {}
+            d["rel"] = rel
+            if _type:
+                d["type"] = _type
+            if type(href) == list:
+                if len(href) == 1:
+                    d["href"] = href[0]
+                elif len(href) == 2:
+                    d["href"] = href[0]
+                    d["title"] = href[1]
+                elif len(href) == 3:
+                    d["href"] = href[0]
+                    d["title"] = href[1]
+                    d["type"] = href[2]
+                else:
+                    continue
+            else:
+                d["href"] = href
+            links.append(d)
+
+    for rel in ("stylesheet", "alternate", "next", "prev", "search"):
+        add_links(links, getattr(args, rel), rel)
+    for item in args.rss:
+        add_links(links, item, rel="alternate", _type="application/rss+xml")
+    for item in args.atom:
+        add_links(links, item, rel="alternate", _type="application/atom+xml")
+
+    # INPUT
+    if args.input:
+        fin = open(args.input)
+    else:
+        fin = sys.stdin
+
+    doc = parse(fin, namespaceHTMLElements=False)
+    if fin != sys.stdin:
+        fin.close()
+    html5tidy(doc, scripts=args.script, links=links, title=args.title, indent=args.indent)
+
+    # OUTPUT
+    tmppath = None
+    if args.output:
+        fout = open(args.output, "w")
+    elif args.mogrify:
+        tmppath = args.input + ".tmp"
+        fout = open(tmppath, "w")
+    else:
+        fout = sys.stdout
+
+    print (ET.tostring(doc, method=args.method), file=fout)
+
+    if fout != sys.stdout:
+        fout.close()
+
+    if tmppath:
+        # mogrify: keep a backup of the input, then move the tempfile into place
+        os.rename(args.input, args.input + "~")
+        os.rename(tmppath, args.input)
diff --git a/etherdump/commands/pull.py b/etherdump/commands/pull.py
index 239a5ce..24580f3 100644
--- a/etherdump/commands/pull.py
+++ b/etherdump/commands/pull.py
@@ -7,6 +7,9 @@ from urllib import urlencode, quote
 from urllib2 import HTTPError
 from common import *
 from time import sleep
+from html5tidy import html5tidy
+import html5lib
+from xml.etree import ElementTree as ET
 
 """
@@ -54,7 +57,7 @@ def main (args):
     for i, padid in enumerate(padids):
         # TODO...
 
""" -Self-containted documents / and/or document receipts +Self-contained documents / and/or document receipts storing enough information to reconstruct (or understand an error occurred) """ @@ -136,19 +139,6 @@ storing enough information to reconstruct (or understand an error occurred) except OSError: pass - # Process text, html, dhtml, all options - if args.all or args.html: - html = getjson(info['apiurl']+'getHTML?'+urlencode(data)) - ver = {"type": "html"} - versions.append(ver) - ver["code"] = html["_code"] - if html["_code"] == 200: - html = html['data']['html'] - ver["path"] = p+".raw.html" - ver["url"] = quote(ver["path"]) - with open(ver["path"], "w") as f: - f.write(html.encode("utf-8")) - if args.all or args.text: text = getjson(info['apiurl']+'getText?'+urlencode(data)) ver = {"type": "text"} @@ -163,6 +153,17 @@ storing enough information to reconstruct (or understand an error occurred) # once the content is settled, compute a hash # and link it in the metadata! + links = [] + links.append({"href":"../styles.css", "rel":"stylesheet"}) + # todo, make this process reflect which files actually were made + versionbaseurl = quote(padid.encode("utf-8")) + links.append({"href":versions[0]["url"], "rel":"alternate", "type":"text/html", "title":"Etherpad"}) + links.append({"href":versionbaseurl+".raw.txt", "rel":"alternate", "type":"text/plain", "title":"Plain text"}) + links.append({"href":versionbaseurl+".raw.html", "rel":"alternate", "type":"text/html", "title":"HTML"}) + links.append({"href":versionbaseurl+".diff.html", "rel":"alternate", "type":"text/html", "title":"HTML with author colors"}) + links.append({"href":versionbaseurl+".meta.json", "rel":"alternate", "type":"application/json", "title":"Meta data"}) + links.append({"href":"../", "rel":"search", "type":"text/html", "title":"Index"}) + if args.all or args.dhtml: data['startRev'] = "0" html = getjson(info['apiurl']+'createDiffHTML?'+urlencode(data)) @@ -173,8 +174,28 @@ storing enough information to reconstruct (or understand an error occurred) html = html['data']['html'] ver["path"] = p+".diff.html" ver["url"] = quote(ver["path"]) + doc = html5lib.parse(html, namespaceHTMLElements=False) + html5tidy(doc, indent=True, title=padid, scripts="../versions.js", links=links) + with open(ver["path"], "w") as f: + # f.write(html.encode("utf-8")) + print (ET.tostring(doc, method="html"), file=f) + + # Process text, html, dhtml, all options + if args.all or args.html: + html = getjson(info['apiurl']+'getHTML?'+urlencode(data)) + ver = {"type": "html"} + versions.append(ver) + ver["code"] = html["_code"] + if html["_code"] == 200: + html = html['data']['html'] + ver["path"] = p+".raw.html" + ver["url"] = quote(ver["path"]) + + doc = html5lib.parse(html, namespaceHTMLElements=False) + html5tidy(doc, indent=True, title=padid, scripts="../versions.js", links=links) with open(ver["path"], "w") as f: - f.write(html.encode("utf-8")) + # f.write(html.encode("utf-8")) + print (ET.tostring(doc, method="html"), file=f) # output meta if args.all or args.meta: