Michael Murtaugh
9 years ago
3 changed files with 152 additions and 0 deletions
@ -0,0 +1,69 @@ |
|||
#!/usr/bin/env python |
|||
from __future__ import print_function |
|||
from argparse import ArgumentParser |
|||
import json, sys, os |
|||
from urllib import urlencode |
|||
from urllib2 import urlopen, HTTPError, URLError |
|||
from xml.etree import cElementTree as ET |
|||
import html5lib |
|||
from trim import trim_removed_spans, contents |
|||
from linkify import linkify, urlify |
|||
|
|||
|
|||
# Command line: the pad to start from, where the API settings live, where to
# write the HTML files, and an optional cap on the number of pads written.
p = ArgumentParser("")
p.add_argument("padid", help="the padid")
p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json")
p.add_argument("--path", default="output", help="path to save files, default: output")
p.add_argument("--verbose", default=False, action="store_true")
p.add_argument("--limit", type=int, default=None)
args = p.parse_args()

# The padinfo JSON supplies the pieces of the API base URL (protocol,
# hostname, port, apiurl, apiversion) and the apikey sent with each request.
with open(args.padinfo) as f:
    info = json.load(f)
apiurl = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info)


def _api_data (method, params):
    """ Call API *method* with query *params* and return the reply's 'data'.

    Returns None, after reporting the reply's 'message' on stderr, when the
    reply has no data. The Etherpad HTTP API is documented to answer errors
    such as an unknown padID this way, so a dangling [[link]] must not abort
    the whole crawl. The HTTP response is always closed.
    """
    conn = urlopen(apiurl + method + '?' + urlencode(params))
    try:
        reply = json.load(conn)
    finally:
        conn.close()
    payload = reply.get('data')
    if payload is None:
        print (u"  {0} failed: {1}".format(method, reply.get('message')).encode("utf-8"), file=sys.stderr)
    return payload


# Create the output directory once, up front, rather than retrying on every
# pad while swallowing every OSError.
if not os.path.isdir(args.path):
    os.makedirs(args.path)

# Breadth-first crawl: start at the requested pad and follow [[wiki links]].
todo = [args.padid]
done = set()
count = 0

while len(todo) > 0:
    padid = todo.pop(0)
    done.add(padid)

    data = {}
    data['apikey'] = info['apikey']
    data['padID'] = padid.encode("utf-8")

    out = "{0}/{1}".format(args.path, urlify(padid))
    print ("{0}".format(out), file=sys.stderr)

    revisions = _api_data('getRevisionsCount', data)
    if revisions is None:
        # Pad could not be read (most likely it does not exist): skip it.
        continue
    total_revisions = revisions['revisions']
    if args.verbose:
        print (u"{0} has {1} total revisions...".format(padid, total_revisions).encode("utf-8"), file=sys.stderr)

    # Ask for the diff from revision 0, then drop the spans marked as removed.
    data['startRev'] = "0"
    diff = _api_data('createDiffHTML', data)
    if diff is None:
        continue
    html = diff['html']
    t = html5lib.parse(html, namespaceHTMLElements=False)
    trim_removed_spans(t)
    html = ET.tostring(t, method="html")

    # Turn [[Name]] into anchors and queue every pad name not yet seen.
    html, links = linkify(html)
    for l in links:
        if l not in todo and l not in done:
            if args.verbose:
                print (" link: {0}".format(l), file=sys.stderr)
            todo.append(l)

    with open(out, "w") as f:
        f.write(html.encode("utf-8"))

    # Only pads actually written count towards --limit.
    count += 1
    if args.limit and count >= args.limit:
        break
@ -0,0 +1,29 @@ |
|||
from __future__ import print_function |
|||
import re, sys |
|||
|
|||
|
|||
def urlify (t):
    """ Map a pad name to its output file name / link target.

    Every space becomes an underscore and ".html" is appended.
    """
    underscored = "_".join(t.split(" "))
    return underscored + ".html"
|||
|
|||
def linkify (src, urlify=urlify):
    """ Turn [[Name]] wiki links in src into HTML anchors.

    A name may consist of word characters, underscores, hyphens and spaces.
    Each match keeps its surrounding double brackets and gets an
    <a class="wikilink"> whose href is urlify(name).

    Returns a tuple (new_src, names) where names lists the linked names in
    order of appearance (repeats included).
    """
    found = []

    def wrap_link (match):
        name = match.group(1)
        found.append(name)
        href = urlify(name)
        return "[[<a class=\"wikilink\" href=\"{0}\">{1}</a>]]".format(href, name)

    pattern = re.compile(r"\[\[([\w_\- ]+?)\]\]")
    linked = pattern.sub(wrap_link, src)
    return (linked, found)
|||
|
|||
|
|||
if __name__ == "__main__":
    # Filter mode: read text from stdin, print each link name found (one per
    # line), then print the linkified text.
    text, found = linkify(sys.stdin.read())

    for name in found:
        print (name)

    print (text)
@ -0,0 +1,54 @@ |
|||
from __future__ import print_function |
|||
import html5lib, sys, re |
|||
from xml.etree import cElementTree as ET |
|||
|
|||
|
|||
def contents (element, method="html"):
    """ Return the inner markup of element as a string.

    The result is the element's own leading text followed by the
    serialization of each child (ET.tostring includes the child's tail text).
    method is passed through to ET.tostring ("html", "xml", ...).
    """
    parts = [element.text or '']
    for child in element:
        markup = ET.tostring(child, method=method)
        # On Python 3 tostring() returns bytes by default, which cannot be
        # joined with the (text) element.text; on Python 2 it is already str.
        if not isinstance(markup, str):
            markup = markup.decode("utf-8")
        parts.append(markup)
    return ''.join(parts)
|||
|
|||
def iterparent(tree):
    """ Yield a (parent, child) pair for every element below tree.

    Parents are visited in the order given by tree.iter(); the children of
    each parent follow in their own order.
    """
    for node in tree.iter():
        for kid in node:
            pair = (node, kid)
            yield pair
|||
|
|||
def get_parent(tree, elt):
    """ Return the element under tree that has elt as a direct child.

    Searches every element reachable from tree; returns None when elt is not
    a child of any of them (for instance when elt is tree itself).
    """
    for candidate in tree.iter():
        if any(kid == elt for kid in candidate):
            return candidate
    return None
|||
|
|||
def remove_recursive (tree, elt):
    """ Remove element and (any resulting) empty containing elements.

    elt is detached from its parent inside tree. The text that follows elt
    (its tail) belongs to the surrounding content, not to elt, so it is
    moved onto the previous sibling's tail or, when elt is the first child,
    onto the parent's text instead of being dropped. If the parent is then
    left with no children and only blank text, it is removed the same way.
    Nothing happens when elt has no parent in tree.
    """
    parent = get_parent(tree, elt)
    # Compare with None explicitly: an Element's truth value depends on its
    # number of children, not on whether a parent was found.
    if parent is None:
        return

    if elt.tail:
        siblings = list(parent)
        position = siblings.index(elt)
        if position > 0:
            previous = siblings[position - 1]
            previous.tail = (previous.tail or "") + elt.tail
        else:
            parent.text = (parent.text or "") + elt.tail
        # The tail has been moved, so it must not travel with the removed element.
        elt.tail = None

    parent.remove(elt)
    if len(parent) == 0 and (parent.text is None or parent.text.strip() == ""):
        # print ("empty parent", parent, file=sys.stderr)
        remove_recursive(tree, parent)
|||
|
|||
|
|||
def trim_removed_spans (t):
    """ Strip removed content from the parsed document t, in place.

    First every <span class="removed"> is removed together with any parent
    that becomes empty as a result. Then <br> elements at the very start of
    <body> are removed one by one. The second step stops quietly when the
    body is missing, has no children left, starts with non-blank text, or
    starts with anything other than a <br>.
    """
    # remove <span class="removed"> and empty parents
    for n in t.findall(".//span[@class='removed']"):
        remove_recursive(t, n)

    # then strip any leading br's from body
    while True:
        body = t.find("./body")
        # The body can be absent or emptied by the removals above (an empty
        # body is itself removed), so indexing it blindly would raise.
        if body is None or len(body) == 0:
            break
        # A <br> that comes after real text is not a leading one.
        if body.text is not None and body.text.strip() != "":
            break
        first = body[0]
        if first.tag != "br":
            break
        remove_recursive(t, first)
|||
|
|||
def trim_removed_spans_src (src):
    """ String-in, string-out variant of trim_removed_spans.

    Parses src with html5lib (without namespaced element names), trims the
    resulting tree and returns the inner markup of its <body>.
    """
    tree = html5lib.parse(src, namespaceHTMLElements=False)
    trim_removed_spans(tree)
    body = tree.find("./body")
    return contents(body)
|||
|
|||
|
|||
if __name__ == "__main__":
    # Filter mode: HTML on stdin, trimmed body markup (UTF-8 encoded) on stdout.
    trimmed = trim_removed_spans_src(sys.stdin.read())
    print (trimmed.encode("utf-8"))
|||
|
Loading…
Reference in new issue