pull with html5tidy and version links

9 years ago · 40a4a90535
2 changed files with 202 additions and 15 deletions
--- a/etherdump/commands/html5tidy.py
+++ b/etherdump/commands/html5tidy.py
@ -0,0 +1,166 @@
+#!/usr/bin/env python
+from __future__ import print_function
+from html5lib import parse
+import os, sys
+from argparse import ArgumentParser
+from xml.etree import ElementTree as ET 
+
+
+def etree_indent(elem, level=0):
+    """Pretty-print helper: rewrite whitespace-only text/tail in place.
+
+    Walks elem recursively and sets blank (missing or whitespace-only)
+    ``text`` and ``tail`` values to a newline followed by two spaces per
+    nesting level. Text and tails containing non-whitespace are left alone.
+    """
+    pad = "\n" + "  " * level
+
+    def is_blank(value):
+        return not value or not value.strip()
+
+    children = list(elem)
+    if not children:
+        # Leaf: only the tail needs adjusting, and never for the root.
+        if level and is_blank(elem.tail):
+            elem.tail = pad
+        return
+
+    if is_blank(elem.text):
+        elem.text = pad + "  "
+    if is_blank(elem.tail):
+        elem.tail = pad
+    for child in children:
+        etree_indent(child, level + 1)
+    # The last child's tail precedes this element's closing tag, so it is
+    # dedented back to this element's own level.
+    last_child = children[-1]
+    if is_blank(last_child.tail):
+        last_child.tail = pad
+
+def get_link_type (url):
+    """Guess a MIME type from the file extension of url (case-insensitive).
+
+    Returns None when the extension is not one of the known ones.
+    """
+    known_suffixes = (
+        ((".html", ".htm"), "text/html"),
+        ((".txt",), "text/plain"),
+        ((".rss",), "application/rss+xml"),
+        ((".atom",), "application/atom+xml"),
+        ((".json",), "application/json"),
+        ((".js", ".jsonp"), "text/javascript"),
+    )
+    lowered = url.lower()
+    for suffixes, mimetype in known_suffixes:
+        if lowered.endswith(suffixes):
+            return mimetype
+    return None
+
+def pluralize (x):
+    """Return x as-is when it is exactly a list or tuple, else wrap it in a 1-tuple."""
+    return x if type(x) in (list, tuple) else (x,)
+
+def html5tidy (doc, charset="utf-8", title=None, scripts=None, links=None, indent=False):
+    """Ensure head-level elements exist in an already-parsed HTML tree.
+
+    doc is modified in place and also returned.
+
+    charset -- if truthy and no <meta charset=...> exists anywhere in doc,
+               a <meta> with this charset is appended to <head>.
+    title   -- if not None, the text of the existing <title> is replaced,
+               or a <title> is appended to <head> when there is none.
+    scripts -- a src string, or a list/tuple of them; a <script src=...> is
+               appended to <head> for each src not already present.
+    links   -- iterable of dicts with "href" and "rel" keys and optional
+               "type" and "title" keys. An existing <link> with the same
+               href is updated, otherwise a new one is appended to <head>.
+               When "type" is missing it is guessed from the href.
+    indent  -- if true, re-indent the whole tree with etree_indent.
+    """
+    # New elements are only ever appended to <head>, so look it up once.
+    head = doc.find(".//head")
+
+    if scripts:
+        script_srcs = [x.attrib.get("src") for x in doc.findall(".//script")]
+        for src in pluralize(scripts):
+            if src not in script_srcs:
+                ET.SubElement(head, "script", src=src)
+                script_srcs.append(src)
+
+    if links:
+        existinglinks = {}
+        for elt in doc.findall(".//link"):
+            href = elt.attrib.get("href")
+            if href:
+                existinglinks[href] = elt
+        for link in links:
+            linktype = link.get("type") or get_link_type(link["href"])
+            if link["href"] in existinglinks:
+                elt = existinglinks[link["href"]]
+                elt.attrib["rel"] = link["rel"]
+            else:
+                elt = ET.SubElement(head, "link", href=link["href"], rel=link["rel"])
+            if linktype:
+                elt.attrib["type"] = linktype
+            if "title" in link:
+                elt.attrib["title"] = link["title"]
+
+    if charset:
+        has_charset = any(
+            x.attrib.get("charset") is not None for x in doc.findall(".//meta"))
+        if not has_charset:
+            ET.SubElement(head, "meta", charset=charset)
+
+    if title is not None:
+        titleelt = doc.find(".//title")
+        # Compare with None explicitly: an Element without child elements is
+        # falsy, so a plain truth test would treat an existing <title> as
+        # missing and append a duplicate.
+        if titleelt is None:
+            titleelt = ET.SubElement(head, "title")
+        titleelt.text = title
+
+    if indent:
+        etree_indent(doc)
+    return doc
+
+
+if __name__ == "__main__":
+    # Command-line entry point: parse an HTML document from a file or stdin,
+    # run html5tidy over it and write the serialized result to --output,
+    # back over the input (--mogrify, keeping a "~" backup), or stdout.
+    p = ArgumentParser("")
+    p.add_argument("input", nargs="?", default=None)
+    p.add_argument("--indent", default=False, action="store_true")
+    p.add_argument("--mogrify", default=False, action="store_true", help="modify file in place")
+    p.add_argument("--method", default="html", help="method, default: html, values: html, xml, text")
+    p.add_argument("--output", default=None, help="")
+    p.add_argument("--title", default=None, help="ensure/add title tag in head")
+    p.add_argument("--charset", default="utf-8", help="ensure/add meta tag with charset")
+    p.add_argument("--script", action="append", default=[], help="ensure/add script tag")
+    # <link>s, see https://www.w3.org/TR/html5/links.html#links
+    p.add_argument("--stylesheet", action="append", default=[], help="ensure/add style link")
+    p.add_argument("--alternate", action="append", default=[], nargs="+", help="ensure/add alternate links (optionally followed by a title and type)")
+    p.add_argument("--next", action="append", default=[], nargs="+", help="ensure/add next link")
+    p.add_argument("--prev", action="append", default=[], nargs="+", help="ensure/add prev link")
+    p.add_argument("--search", action="append", default=[], nargs="+", help="ensure/add search link")
+    p.add_argument("--rss", action="append", default=[], nargs="+", help="ensure/add alternate link of type application/rss+xml")
+    p.add_argument("--atom", action="append", default=[], nargs="+", help="ensure/add alternate link of type application/atom+xml")
+
+    args = p.parse_args()
+    if args.mogrify and not args.output and not args.input:
+        # In-place modification needs a real path to derive the temp/backup names.
+        p.error("--mogrify requires an input file")
+
+    links = []
+    def add_links (links, items, rel, _type=None):
+        # Each item is either a bare href string (options without nargs) or a
+        # list [href], [href, title] or [href, title, type] (options with
+        # nargs="+"). Lists of any other length are skipped.
+        for href in items:
+            d = {}
+            d["rel"] = rel
+            if _type:
+                d["type"] = _type
+
+            if isinstance(href, list):
+                if not 1 <= len(href) <= 3:
+                    continue
+                d["href"] = href[0]
+                if len(href) >= 2:
+                    d["title"] = href[1]
+                if len(href) == 3:
+                    # An explicit type on the command line overrides _type.
+                    d["type"] = href[2]
+            else:
+                d["href"] = href
+
+            links.append(d)
+    for rel in ("stylesheet", "alternate", "next", "prev", "search"):
+        add_links(links, getattr(args, rel), rel)
+    # args.rss / args.atom are lists of per-occurrence argument lists, the same
+    # shape as the options above, so they are passed whole. Passing each inner
+    # list separately would turn its title/type strings into extra hrefs.
+    add_links(links, args.rss, rel="alternate", _type="application/rss+xml")
+    add_links(links, args.atom, rel="alternate", _type="application/atom+xml")
+
+    # INPUT
+    if args.input:
+        with open(args.input) as fin:
+            doc = parse(fin, namespaceHTMLElements=False)
+    else:
+        doc = parse(sys.stdin, namespaceHTMLElements=False)
+    html5tidy(doc, charset=args.charset, scripts=args.script, links=links, title=args.title, indent=args.indent)
+
+    # OUTPUT
+    tmppath = None
+    if args.output:
+        fout = open(args.output, "w")
+    elif args.mogrify:
+        # Write to a temp file first; it replaces the input only after a
+        # successful write.
+        tmppath = args.input+".tmp"
+        fout = open(tmppath, "w")
+    else:
+        fout = sys.stdout
+
+    try:
+        print (ET.tostring(doc, method=args.method), file=fout)
+    finally:
+        if fout is not sys.stdout:
+            fout.close()
+
+    if tmppath:
+        # Keep the original as a "~" backup, then move the new file into place.
+        os.rename(args.input, args.input+"~")
+        os.rename(tmppath, args.input)
--- a/etherdump/commands/pull.py
+++ b/etherdump/commands/pull.py
@ -7,6 +7,9 @@ from urllib import urlencode, quote
 from urllib2 import HTTPError
 from common import *
 from time import sleep
+from html5tidy import html5tidy
+import html5lib
+from xml.etree import ElementTree as ET 


 """
@ -54,7 +57,7 @@ def main (args):
    for i, padid in enumerate(padids):
        # TODO...
        """        
-Self-containted documents / and/or document receipts
+Self-contained documents / and/or document receipts
 storing enough information to reconstruct (or understand an error occurred)
    """

@ -136,19 +139,6 @@ storing enough information to reconstruct (or understand an error occurred)
            except OSError:
                pass

-        # Process text, html, dhtml, all options
-        if args.all or args.html:
-            html = getjson(info['apiurl']+'getHTML?'+urlencode(data))
-            ver = {"type": "html"}
-            versions.append(ver)
-            ver["code"] = html["_code"]
-            if html["_code"] == 200:
-                html = html['data']['html']
-                ver["path"] = p+".raw.html"
-                ver["url"] = quote(ver["path"])
-                with open(ver["path"], "w") as f:
-                    f.write(html.encode("utf-8"))
-
        if args.all or args.text:
            text = getjson(info['apiurl']+'getText?'+urlencode(data))
            ver = {"type": "text"}
@ -163,6 +153,17 @@ storing enough information to reconstruct (or understand an error occurred)
                # once the content is settled, compute a hash
                # and link it in the metadata!

+        links = []
+        links.append({"href":"../styles.css", "rel":"stylesheet"})
+        # todo, make this process reflect which files actually were made
+        versionbaseurl = quote(padid.encode("utf-8"))
+        links.append({"href":versions[0]["url"], "rel":"alternate", "type":"text/html", "title":"Etherpad"})
+        links.append({"href":versionbaseurl+".raw.txt", "rel":"alternate", "type":"text/plain", "title":"Plain text"})
+        links.append({"href":versionbaseurl+".raw.html", "rel":"alternate", "type":"text/html", "title":"HTML"})
+        links.append({"href":versionbaseurl+".diff.html", "rel":"alternate", "type":"text/html", "title":"HTML with author colors"})
+        links.append({"href":versionbaseurl+".meta.json", "rel":"alternate", "type":"application/json", "title":"Meta data"})
+        links.append({"href":"../", "rel":"search", "type":"text/html", "title":"Index"})
+
        if args.all or args.dhtml:
            data['startRev'] = "0"
            html = getjson(info['apiurl']+'createDiffHTML?'+urlencode(data))
@ -173,8 +174,28 @@ storing enough information to reconstruct (or understand an error occurred)
                html = html['data']['html']
                ver["path"] = p+".diff.html"
                ver["url"] = quote(ver["path"])
+                doc = html5lib.parse(html, namespaceHTMLElements=False)
+                html5tidy(doc, indent=True, title=padid, scripts="../versions.js", links=links)
+                with open(ver["path"], "w") as f:
+                    # f.write(html.encode("utf-8"))
+                    print (ET.tostring(doc, method="html"), file=f)
+
+        # Process text, html, dhtml, all options
+        if args.all or args.html:
+            html = getjson(info['apiurl']+'getHTML?'+urlencode(data))
+            ver = {"type": "html"}
+            versions.append(ver)
+            ver["code"] = html["_code"]
+            if html["_code"] == 200:
+                html = html['data']['html']
+                ver["path"] = p+".raw.html"
+                ver["url"] = quote(ver["path"])
+
+                doc = html5lib.parse(html, namespaceHTMLElements=False)
+                html5tidy(doc, indent=True, title=padid, scripts="../versions.js", links=links)
                with open(ver["path"], "w") as f:
-                    f.write(html.encode("utf-8"))
+                    # f.write(html.encode("utf-8"))
+                    print (ET.tostring(doc, method="html"), file=f)

        # output meta
        if args.all or args.meta: