dump html with trim and linkify
This commit is contained in:
parent
f175dfc591
commit
76cb1b28a1
69
dump_html.py
Executable file
69
dump_html.py
Executable file
@ -0,0 +1,69 @@
|
||||
#!/usr/bin/env python
|
||||
from __future__ import print_function
|
||||
from argparse import ArgumentParser
|
||||
import json, sys, os
|
||||
from urllib import urlencode
|
||||
from urllib2 import urlopen, HTTPError, URLError
|
||||
from xml.etree import cElementTree as ET
|
||||
import html5lib
|
||||
from trim import trim_removed_spans, contents
|
||||
from linkify import linkify, urlify
|
||||
|
||||
|
||||
# Command-line interface: one positional pad id plus optional settings.
p = ArgumentParser(prog="")
p.add_argument("padid", help="the padid")
p.add_argument(
    "--padinfo",
    default="padinfo.json",
    help="padinfo, default: padinfo.json"
)
p.add_argument(
    "--path",
    default="output",
    help="path to save files, default: output"
)
p.add_argument("--verbose", default=False, action="store_true")
p.add_argument("--limit", type=int, default=None)
args = p.parse_args()

# The padinfo JSON supplies the connection settings interpolated below
# (protocol, hostname, port, apiurl, apiversion).
with open(args.padinfo) as padinfo_file:
    info = json.load(padinfo_file)

# Base URL onto which API method names are appended.
apiurl = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info)
|
||||
|
||||
def _api_data (method, params):
    """ Call API *method* with query *params* and return the "data" member of the JSON reply.

    Returns None, after reporting the problem on stderr, when the request
    fails, the reply is not valid JSON, or the reply carries no data (this
    presumably is what the server answers for a pad that does not exist --
    confirm against the API documentation). Callers can then skip the pad
    instead of aborting the whole crawl.
    """
    requesturl = apiurl + method + '?' + urlencode(params)
    try:
        reply = json.load(urlopen(requesturl))
    except (HTTPError, URLError, ValueError) as e:
        print ("  {0} failed for {1}: {2}".format(method, params.get('padID'), e), file=sys.stderr)
        return None
    payload = reply.get('data')
    if payload is None:
        print ("  {0} failed for {1}: {2}".format(method, params.get('padID'), reply.get('message')), file=sys.stderr)
        return None
    return payload


# Breadth-first crawl: start from the requested pad and follow [[wikilinks]].
todo = [args.padid]
done = set()
count = 0

# Create the output directory once, before any work is done. An OSError is
# only ignored when the directory is actually there afterwards, so that
# e.g. permission problems are reported instead of being silently swallowed.
try:
    os.makedirs(args.path)
except OSError:
    if not os.path.isdir(args.path):
        raise

while todo:
    padid = todo.pop(0)
    done.add(padid)

    data = {}
    data['apikey'] = info['apikey']
    data['padID'] = padid.encode("utf-8")

    out = "{0}/{1}".format(args.path, urlify(padid))
    print ("{0}".format(out), file=sys.stderr)

    revisions = _api_data('getRevisionsCount', data)
    if revisions is None:
        # pad could not be queried: skip it, keep crawling the rest
        continue
    total_revisions = revisions['revisions']
    if args.verbose:
        print (u"{0} has {1} total revisions...".format(padid, total_revisions).encode("utf-8"), file=sys.stderr)

    # Ask for the diff starting at revision 0, then drop the removed spans.
    data['startRev'] = "0"
    diff = _api_data('createDiffHTML', data)
    if diff is None:
        continue
    html = diff['html']
    t = html5lib.parse(html, namespaceHTMLElements=False)
    trim_removed_spans(t)
    html = ET.tostring(t, method="html")

    # Turn [[wikilinks]] into anchors and queue every pad not seen yet.
    html, links = linkify(html)
    for l in links:
        if l not in todo and l not in done:
            if args.verbose:
                print ("  link: {0}".format(l), file=sys.stderr)
            todo.append(l)

    with open(out, "w") as f:
        f.write(html.encode("utf-8"))

    # --limit caps the number of pads written.
    count += 1
    if args.limit and count >= args.limit:
        break
|
29
linkify.py
Normal file
29
linkify.py
Normal file
@ -0,0 +1,29 @@
|
||||
from __future__ import print_function
|
||||
import re, sys
|
||||
|
||||
|
||||
def urlify (t):
    """ Turn a pad name into an output file name: spaces become underscores, ".html" is appended """
    words = t.split(" ")
    return "_".join(words) + ".html"
|
||||
|
||||
def linkify (src, urlify=urlify):
    """ Wrap every [[name]] in src in a "wikilink" anchor.

    name may consist of word characters, underscores, dashes and spaces.
    urlify maps a name to the href of its anchor.
    Returns a tuple (rewritten src, list of names in order of appearance);
    a name that occurs several times is listed several times.
    """
    found = []

    def wrap_in_anchor (match):
        # record the link target, then emit the anchor inside the original brackets
        name = match.group(1)
        found.append(name)
        return "[[<a class=\"wikilink\" href=\"{0}\">{1}</a>]]".format(urlify(name), name)

    wikilink = re.compile(r"\[\[([\w_\- ]+?)\]\]")
    linked = wikilink.sub(wrap_in_anchor, src)
    return (linked, found)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Filter mode: read text from stdin, print each link name on its own
    # line, then print the linkified text.
    linked, names = linkify(sys.stdin.read())

    for name in names:
        print (name)

    print (linked)
|
54
trim.py
Normal file
54
trim.py
Normal file
@ -0,0 +1,54 @@
|
||||
from __future__ import print_function
|
||||
import html5lib, sys, re
|
||||
from xml.etree import cElementTree as ET
|
||||
|
||||
|
||||
def contents (element, method="html"):
    """ Serialize what is inside element: its leading text followed by each child element

    Children are serialized with ET.tostring using the given method; the
    element's own tag is not part of the result.
    """
    pieces = [element.text or '']
    for child in element:
        pieces.append(ET.tostring(child, method=method))
    return ''.join(pieces)
|
||||
|
||||
def iterparent(tree):
    """ Generate a (parent, child) pair for every element below tree, parents in document order """
    for node in tree.iter():
        for sub in node:
            yield (node, sub)
|
||||
|
||||
def get_parent(tree, elt):
    """ Return the element of tree that has elt as a direct child, or None if there is none """
    for candidate in tree.iter():
        if any(child == elt for child in candidate):
            return candidate
    return None
|
||||
|
||||
def remove_recursive (tree, elt):
    """ Remove element and (any resulting) empty containing elements

    The text that follows elt (its tail) is not part of elt's content, so it
    is kept: it is appended to the previous sibling's tail, or to the
    parent's text when elt is the first child.
    A containing element counts as empty when it has no children left and
    its text is missing or whitespace only.
    Nothing happens when elt has no parent in tree.
    """
    p = get_parent(tree, elt)
    # explicit None test: the truth value of an element reflects its child
    # count, not whether a parent was found
    if p is None:
        return
    if elt.tail:
        # Element.remove() would drop the tail together with the element
        position = list(p).index(elt)
        if position > 0:
            previous = p[position - 1]
            previous.tail = (previous.tail or "") + elt.tail
        else:
            p.text = (p.text or "") + elt.tail
    p.remove(elt)
    if len(p) == 0 and (p.text is None or p.text.strip() == ""):
        # print ("empty parent", p, file=sys.stderr)
        remove_recursive(tree, p)
|
||||
|
||||
|
||||
def trim_removed_spans (t):
    """ Strip <span class="removed"> elements and leading <br>s from the parsed document t

    t is modified in place; nothing is returned.
    Removed spans are taken out with remove_recursive, so parents left empty
    go as well. Afterwards every <br> at the very start of <body> is
    dropped; text following such a <br> is kept as body text.
    A document without <body>, or whose <body> has no child elements, is
    left alone.
    """
    # remove <span class="removed"> and empty parents
    for n in t.findall(".//span[@class='removed']"):
        remove_recursive(t, n)
    # then strip any leading br's from body
    body = t.find("./body")
    while body is not None and len(body) > 0:
        if body.text and body.text.strip():
            # real text precedes the first element: a br here is not leading
            break
        first = body[0]
        if first.tag != "br":
            break
        if first.tail:
            # keep the text that follows the br, Element.remove() would drop it
            body.text = (body.text or "") + first.tail
        body.remove(first)
|
||||
|
||||
def trim_removed_spans_src (src):
    """ String version of trim_removed_spans: parse src, trim it, return the contents of <body> """
    tree = html5lib.parse(src, namespaceHTMLElements=False)
    trim_removed_spans(tree)
    body = tree.find("./body")
    return contents(body)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Filter mode: HTML on stdin, trimmed body contents (UTF-8 encoded) on stdout.
    trimmed = trim_removed_spans_src(sys.stdin.read())
    print (trimmed.encode("utf-8"))
|
||||
|
Loading…
Reference in New Issue
Block a user