dump html with trim and linkify

This commit is contained in:
Michael Murtaugh 2015-07-23 18:09:20 +02:00
parent f175dfc591
commit 76cb1b28a1
3 changed files with 152 additions and 0 deletions

dump_html.py Executable file
View File

@ -0,0 +1,69 @@
#!/usr/bin/env python
from __future__ import print_function
from argparse import ArgumentParser
import json, sys, os
from urllib import urlencode
from urllib2 import urlopen, HTTPError, URLError
from xml.etree import cElementTree as ET
import html5lib
from trim import trim_removed_spans, contents
from linkify import linkify, urlify
# Command-line crawler: starting from one pad, fetch each pad's HTML from the
# Etherpad HTTP API, trim it, turn [[wiki links]] into anchors, save it under
# --path, and follow the links found to further pads.
p = ArgumentParser("")
p.add_argument("padid", help="the padid")
p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json")
p.add_argument("--path", default="output", help="path to save files, default: output")
p.add_argument("--verbose", default=False, action="store_true")
p.add_argument("--limit", type=int, default=None)
args = p.parse_args()

# padinfo supplies the API location (protocol, hostname, port, apiurl,
# apiversion) and the apikey sent with every request.
with open(args.padinfo) as f:
    info = json.load(f)

apiurl = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info)

todo = [args.padid]   # pads still to fetch, in discovery order
done = set()          # pads already fetched, so link cycles terminate
count = 0             # number of pads written so far (compared to --limit)

while len(todo) > 0:
    padid = todo.pop(0)
    # Mark the pad as handled before processing it so a self-link or a link
    # back from another pad does not queue it again.
    done.add(padid)

    data = {}
    data['apikey'] = info['apikey']
    data['padID'] = padid.encode("utf-8")

    out = "{0}/{1}".format(args.path, urlify(padid))
    print ("{0}".format(out), file=sys.stderr)

    total_revisions = apiurl+'getRevisionsCount?'+urlencode(data)
    total_revisions = json.load(urlopen(total_revisions))['data']['revisions']
    if args.verbose:
        print (u"{0} has {1} total revisions...".format(padid, total_revisions).encode("utf-8"), file=sys.stderr)

    # Ask for the diff from revision 0 and parse the returned HTML.
    data['startRev'] = "0"
    requesturl = apiurl+'createDiffHTML?'+urlencode(data)
    html = json.load(urlopen(requesturl))['data']['html']
    t = html5lib.parse(html, namespaceHTMLElements=False)
    # Drop the spans marked as removed before serializing.
    trim_removed_spans(t)
    html = ET.tostring(t, method="html")

    html, links = linkify(html)
    for l in links:
        if l not in todo and l not in done:
            if args.verbose:
                print ("  link: {0}".format(l), file=sys.stderr)
            todo.append(l)

    try:
        os.makedirs(args.path)
    except OSError:
        # The directory already existing is expected on every pass after the
        # first; any other failure (permissions, a file in the way) is real.
        if not os.path.isdir(args.path):
            raise
    with open(out, "wb") as f:
        # Write bytes explicitly so the result does not depend on whether the
        # serialized page is a byte string or a unicode string.
        f.write(html if isinstance(html, bytes) else html.encode("utf-8"))

    count += 1
    if args.limit and count >= args.limit:
        break

linkify.py Normal file
View File

@ -0,0 +1,29 @@
from __future__ import print_function
import re, sys
def urlify (t):
    """Return the file name used for pad name *t*.

    Every space becomes an underscore and ".html" is appended.
    """
    underscored = "_".join(t.split(" "))
    return underscored + ".html"
def linkify (src, urlify=urlify):
    """Rewrite [[wiki links]] in *src* as HTML anchors.

    src    -- text to scan; a link name may contain word characters,
              underscores, hyphens and spaces
    urlify -- callable mapping a link name to the href to use

    Returns a tuple (rewritten text, link names). The names are listed in
    order of appearance, one entry per occurrence (duplicates are kept).
    """
    collect = []

    def s (m):
        contents = m.group(1)
        # Record the name so the caller learns which pages are linked to;
        # without this the returned list would always be empty.
        collect.append(contents)
        link = urlify(contents)
        return "[[<a class=\"wikilink\" href=\"{0}\">{1}</a>]]".format(link, contents)

    src = re.sub(r"\[\[([\w_\- ]+?)\]\]", s, src)
    return (src, collect)
if __name__ == "__main__":
    # Filter mode: linkify the text read from stdin, print each link name
    # found (one per line), then print the rewritten text.
    rewritten, names = linkify(sys.stdin.read())
    for name in names:
        print (name)
    print (rewritten)

trim.py Normal file
View File

@ -0,0 +1,54 @@
from __future__ import print_function
import html5lib, sys, re
from xml.etree import cElementTree as ET
def contents (element, method="html"):
    """Return the inner markup of *element*.

    The result is the element's leading text followed by each child
    serialized with ET.tostring using *method*; the element's own tag is
    not included.
    """
    leading = element.text or ''
    serialized = [ET.tostring(child, method=method) for child in element]
    return leading + ''.join(serialized)
def iterparent(tree):
    """Yield a (parent, child) pair for every parent/child link under *tree*.

    Parents are visited in the order given by tree.iter(); the children of
    each parent follow in their own order.
    """
    for node in tree.iter():
        for kid in node:
            yield node, kid
def get_parent(tree, elt):
    """Return the element under *tree* that has *elt* as a direct child.

    Returns None when no element visited by tree.iter() contains *elt*
    (for example when *elt* is the root itself).
    """
    for candidate in tree.iter():
        if any(kid == elt for kid in candidate):
            return candidate
    return None
def remove_recursive (tree, elt):
    """ Remove element and (any resulting) empty containing elements.

    tree -- root element that is searched for elt's parent
    elt  -- element to detach; nothing happens when no parent is found
            (elt is the root, or is no longer part of tree)

    Text following elt (its tail) is not part of elt and is kept: it is
    appended to the previous sibling's tail, or to the parent's text when
    elt is the first child. A parent left with no children and only
    whitespace text is removed in turn.
    """
    parent = get_parent(tree, elt)
    # Compare with None explicitly: truth-testing an Element reflects its
    # number of children, not whether a parent was found.
    if parent is None:
        return
    if elt.tail:
        index = list(parent).index(elt)
        if index > 0:
            previous = parent[index - 1]
            previous.tail = (previous.tail or '') + elt.tail
        else:
            parent.text = (parent.text or '') + elt.tail
    parent.remove(elt)
    if len(parent) == 0 and not (parent.text or '').strip():
        remove_recursive(tree, parent)
def trim_removed_spans (t):
    """Strip removed text and leading line breaks from the parsed document *t*.

    t -- root element of the document; it is modified in place and nothing
         is returned

    Every <span class="removed"> is removed together with any parents that
    become empty, then <br> elements at the very start of <body> are
    removed.
    """
    # remove <span class="removed"> and empty parents
    for n in t.findall(".//span[@class='removed']"):
        remove_recursive(t, n)
    # then strip any leading br's from body
    while True:
        body = t.find("./body")
        # Stop when there is nothing left to inspect; the body can be
        # missing or can have lost all of its children.
        if body is None or len(body) == 0:
            break
        # A br preceded by text is not a leading br.
        if (body.text or '').strip():
            break
        first = body[0]
        if first.tag != "br":
            break
        remove_recursive(t, first)
def trim_removed_spans_src (src):
    """Parse the HTML source *src*, trim it, and return the body's inner markup.

    The source is parsed with html5lib (without namespaced tag names),
    trimmed with trim_removed_spans, and the contents of <body> are
    returned as produced by contents().
    """
    t = html5lib.parse(src, namespaceHTMLElements=False)
    # Apply the trimming; without this call the source would only be
    # re-serialized unchanged.
    trim_removed_spans(t)
    return contents(t.find("./body"))
if __name__ == "__main__":
    # Filter mode: trim the HTML read from stdin and print the resulting
    # body contents encoded as UTF-8.
    trimmed = trim_removed_spans_src(sys.stdin.read())
    print (trimmed.encode("utf-8"))