
dump html with trim and linkify

add-quote-import
Michael Murtaugh, 9 years ago
commit 76cb1b28a1
  1. dump_html.py (+69)
  2. linkify.py (+29)
  3. trim.py (+54)

dump_html.py (+69)

@@ -0,0 +1,69 @@
#!/usr/bin/env python
from __future__ import print_function
from argparse import ArgumentParser
import json, sys, os
from urllib import urlencode
from urllib2 import urlopen, HTTPError, URLError
from xml.etree import cElementTree as ET
import html5lib

from trim import trim_removed_spans, contents
from linkify import linkify, urlify

p = ArgumentParser("")
p.add_argument("padid", help="the padid")
p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json")
p.add_argument("--path", default="output", help="path to save files, default: output")
p.add_argument("--verbose", default=False, action="store_true")
p.add_argument("--limit", type=int, default=None)
args = p.parse_args()

with open(args.padinfo) as f:
    info = json.load(f)
apiurl = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info)

# breadth-first crawl: start from the given pad, follow [[wikilinks]] to other pads
todo = [args.padid]
done = set()
count = 0

while len(todo) > 0:
    padid = todo[0]
    todo = todo[1:]
    done.add(padid)

    data = {}
    data['apikey'] = info['apikey']
    data['padID'] = padid.encode("utf-8")
    out = "{0}/{1}".format(args.path, urlify(padid))
    print ("{0}".format(out), file=sys.stderr)

    total_revisions = apiurl+'getRevisionsCount?'+urlencode(data)
    total_revisions = json.load(urlopen(total_revisions))['data']['revisions']
    if args.verbose:
        print (u"{0} has {1} total revisions...".format(padid, total_revisions).encode("utf-8"), file=sys.stderr)

    # diff against revision 0 so additions and removals are marked up
    data['startRev'] = "0"
    requesturl = apiurl+'createDiffHTML?'+urlencode(data)
    html = json.load(urlopen(requesturl))['data']['html']

    # drop the "removed" spans, then turn [[wikilinks]] into <a> elements
    t = html5lib.parse(html, namespaceHTMLElements=False)
    trim_removed_spans(t)
    html = ET.tostring(t, method="html")
    html, links = linkify(html)
    for l in links:
        if l not in todo and l not in done:
            if args.verbose:
                print (" link: {0}".format(l), file=sys.stderr)
            todo.append(l)

    try:
        os.makedirs(args.path)
    except OSError:
        pass
    with open(out, "w") as f:
        f.write(html.encode("utf-8"))

    count += 1
    if args.limit and count >= args.limit:
        break
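
dump_html.py reads its Etherpad connection settings from the file given by --padinfo (padinfo.json by default); the keys it needs follow from the apiurl format string and the apikey lookup above. A minimal sketch of writing such a file, with placeholder values that are not taken from this commit:

    # make_padinfo.py -- hypothetical helper, not part of this commit
    import json

    info = {
        "protocol": "http",       # placeholder
        "hostname": "localhost",  # placeholder
        "port": 9001,             # placeholder
        "apiurl": "/api/",        # placeholder
        "apiversion": "1.2.9",    # placeholder, match the Etherpad instance's API version
        "apikey": "contents of the instance's APIKEY.txt",  # placeholder
    }
    with open("padinfo.json", "w") as f:
        json.dump(info, f, indent=2)

With that in place, a run might look like: python dump_html.py somepad --verbose --limit 10 (the pad name somepad is only an example); rendered pages land in the output directory by default.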

linkify.py (+29)

@@ -0,0 +1,29 @@
from __future__ import print_function
import re, sys


def urlify (t):
    return t.replace(" ", "_") + ".html"

def linkify (src, urlify=urlify):
    collect = []

    def s (m):
        contents = m.group(1)
        collect.append(contents)
        link = urlify(contents)
        return "[[<a class=\"wikilink\" href=\"{0}\">{1}</a>]]".format(link, contents)

    src = re.sub(r"\[\[([\w_\- ]+?)\]\]", s, src)
    return (src, collect)


if __name__ == "__main__":
    src = sys.stdin.read()
    src, links = linkify(src)
    for l in links:
        print (l)
    print (src)
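
A quick sketch of what linkify does to a made-up input string: each [[...]] marker is rewritten in place, and the pad names are returned so dump_html.py can queue them for crawling.

    from linkify import linkify

    html, links = linkify("see [[Another Pad]] for details")
    # html  == 'see [[<a class="wikilink" href="Another_Pad.html">Another Pad</a>]] for details'
    # links == ['Another Pad']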

trim.py (+54)

@@ -0,0 +1,54 @@
from __future__ import print_function
import html5lib, sys, re
from xml.etree import cElementTree as ET


def contents (element, method="html"):
    return (element.text or '') + ''.join([ET.tostring(c, method=method) for c in element])

def iterparent(tree):
    for parent in tree.iter():
        for child in parent:
            yield parent, child

def get_parent(tree, elt):
    for parent in tree.iter():
        for child in parent:
            if child == elt:
                return parent

def remove_recursive (tree, elt):
    """ Remove element and (any resulting) empty containing elements """
    p = get_parent(tree, elt)
    if p is not None:
        p.remove(elt)
        if len(p) == 0 and (p.text is None or p.text.strip() == ""):
            # print ("empty parent", p, file=sys.stderr)
            remove_recursive(tree, p)

def trim_removed_spans (t):
    # remove <span class="removed"> and empty parents
    for n in t.findall(".//span[@class='removed']"):
        remove_recursive(t, n)
    # then strip any leading br's from body
    while True:
        tag = t.find("./body")[0]
        if tag.tag == "br":
            remove_recursive(t, tag)
        else:
            break

def trim_removed_spans_src (src):
    t = html5lib.parse(src, namespaceHTMLElements=False)
    trim_removed_spans(t)
    return contents(t.find("./body"))


if __name__ == "__main__":
    src = sys.stdin.read()
    # t = html5lib.parse(src, namespaceHTMLElements=False)
    # trim_rems_tree(t)
    # print (ET.tostring(t))
    print (trim_removed_spans_src(src).encode("utf-8"))
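
A sketch of the trimming on a made-up createDiffHTML-style fragment: the span marked as removed disappears, its now-empty paragraph is pruned along with it, and the leading br is stripped from the body.

    from trim import trim_removed_spans_src

    src = '<br><p><span class="removed">old text</span></p><p>kept</p>'
    print (trim_removed_spans_src(src))
    # -> <p>kept</p>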