dump html with trim and linkify

2015-07-23 18:09:20 +02:00 · 2015-07-23 18:09:20 +02:00 · 76cb1b28a1
commit 76cb1b28a1
parent f175dfc591
3 changed files with 152 additions and 0 deletions
--- a/dump_html.py
+++ b/dump_html.py
@ -0,0 +1,69 @@
+#!/usr/bin/env python
+from __future__ import print_function
+from argparse import ArgumentParser
+import json, sys, os
+from urllib import urlencode
+from urllib2 import urlopen, HTTPError, URLError
+from xml.etree import cElementTree as ET 
+import html5lib
+from trim import trim_removed_spans, contents
+from linkify import linkify, urlify
+
+
+p = ArgumentParser("")
+p.add_argument("padid", help="the padid")
+p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json")
+p.add_argument("--path", default="output", help="path to save files, default: output")
+p.add_argument("--verbose", default=False, action="store_true")
+p.add_argument("--limit", type=int, default=None)
+args = p.parse_args()
+
+with open(args.padinfo) as f:
+    info = json.load(f)
+apiurl =  "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info)
+
+todo = [args.padid]
+done = set()
+count = 0
+
+while len(todo) > 0:
+    padid = todo[0]
+    todo = todo[1:]
+    done.add(padid)
+
+    data = {}
+    data['apikey'] = info['apikey']
+    data['padID'] = padid.encode("utf-8")
+
+    out = "{0}/{1}".format(args.path, urlify(padid))
+    print ("{0}".format(out), file=sys.stderr)
+
+    total_revisions = apiurl+'getRevisionsCount?'+urlencode(data)
+    total_revisions = json.load(urlopen(total_revisions))['data']['revisions']
+    if args.verbose:
+        print (u"{0} has {1} total revisions...".format(padid, total_revisions).encode("utf-8"), file=sys.stderr)
+
+    data['startRev'] = "0"
+    requesturl = apiurl+'createDiffHTML?'+urlencode(data)
+    html = json.load(urlopen(requesturl))['data']['html']
+    t = html5lib.parse(html, namespaceHTMLElements=False)
+    trim_removed_spans(t)
+    html = ET.tostring(t, method="html")
+
+    html, links = linkify(html)
+    for l in links:
+        if l not in todo and l not in done:
+            if args.verbose:
+                print ("  link: {0}".format(l), file=sys.stderr)
+            todo.append(l)
+
+    try:
+        os.makedirs(args.path)
+    except OSError:
+        pass
+    with open(out, "w") as f:
+        f.write(html.encode("utf-8"))
+
+    count += 1
+    if args.limit and count >= args.limit:
+        break
--- a/linkify.py
+++ b/linkify.py
@ -0,0 +1,29 @@
+from __future__ import print_function
+import re, sys
+
+
+def urlify (t):
+	return t.replace(" ", "_") + ".html"
+
+def linkify (src, urlify=urlify):
+
+	collect = []
+
+	def s (m):
+		contents = m.group(1)
+		collect.append(contents)
+		link = urlify(contents)
+		return "[[<a class=\"wikilink\" href=\"{0}\">{1}</a>]]".format(link, contents)
+
+	src = re.sub(r"\[\[([\w_\- ]+?)\]\]", s, src)
+	return (src, collect)
+
+
+if __name__ == "__main__":
+	src = sys.stdin.read()
+	src, links = linkify(src)
+
+	for l in links:
+		print (l)
+
+	print (src)
--- a/trim.py
+++ b/trim.py
@ -0,0 +1,54 @@
+from __future__ import print_function
+import html5lib, sys, re
+from xml.etree import cElementTree as ET
+
+
+def contents (element, method="html"):
+    return (element.text or '') + ''.join([ET.tostring(c, method=method) for c in element])
+
+def iterparent(tree):
+    for parent in tree.iter():
+        for child in parent:
+            yield parent, child
+
+def get_parent(tree, elt):
+    for parent in tree.iter():
+        for child in parent:
+            if child == elt:
+                return parent
+
+def remove_recursive (tree, elt):
+    """ Remove element and (any resulting) empty containing elements """
+    p = get_parent(tree, elt)
+    if p:
+        p.remove(elt)
+        if len(p) == 0 and (p.text == None or p.text.strip() == ""):
+            # print ("empty parent", p, file=sys.stderr)
+            remove_recursive(tree, p)
+
+
+def trim_removed_spans (t):
+    # remove <span class="removed"> and empty parents
+    for n in t.findall(".//span[@class='removed']"):
+       remove_recursive(t, n)
+    # then strip any leading br's from body
+    while True:
+        tag = t.find("./body")[0]
+        if tag.tag == "br":
+            remove_recursive(t, tag)
+        else:
+            break
+
+def trim_removed_spans_src (src):
+    t = html5lib.parse(src, namespaceHTMLElements=False)
+    trim_removed_spans(t)
+    return contents(t.find("./body"))    
+
+
+if __name__ == "__main__":
+    src = sys.stdin.read()
+    # t = html5lib.parse(src, namespaceHTMLElements=False)
+    # trim_rems_tree(t)
+    # print (ET.tostring(t))
+    print (trim_removed_spans_src(src).encode("utf-8"))
+