Pumping pads as files into publishing frameworks!
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

101 lines
3.1 KiB

#!/usr/bin/env python
from __future__ import print_function
from argparse import ArgumentParser
import json, sys, os
from datetime import datetime
import html5lib
from urllib import urlencode
from urllib2 import urlopen, HTTPError, URLError
from xml.etree import cElementTree as ET
from trim import trim_removed_spans, contents
from linkify import linkify, urlify
import jinja2
p = ArgumentParser("")
p.add_argument("padid", help="the padid")
p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json")
p.add_argument("--path", default="output", help="path to save files, default: output")
p.add_argument("--verbose", default=False, action="store_true")
p.add_argument("--limit", type=int, default=None)
p.add_argument("--templates", default="templates")
p.add_argument("--template", default="pad_html.html")
args = p.parse_args()
def get_template_env (tpath=None):
paths = []
if tpath and os.path.isdir(tpath):
paths.append(tpath)
# paths.append(TEMPLATES_PATH)
loader = jinja2.FileSystemLoader(paths)
env = jinja2.Environment(loader=loader)
return env
with open(args.padinfo) as f:
info = json.load(f)
apiurl = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info)
todo = [args.padid]
done = set()
count = 0
env = get_template_env(args.templates)
template = env.get_template(args.template)
while len(todo) > 0:
padid = todo[0]
todo = todo[1:]
done.add(padid)
data = {}
data['apikey'] = info['apikey']
data['padID'] = padid.encode("utf-8")
out = "{0}/{1}".format(args.path, urlify(padid))
print ("{0}".format(out), file=sys.stderr)
total_revisions = apiurl+'getRevisionsCount?'+urlencode(data)
total_revisions = json.load(urlopen(total_revisions))['data']['revisions']
if args.verbose:
print (u"{0} has {1} total revisions...".format(padid, total_revisions).encode("utf-8"), file=sys.stderr)
data['startRev'] = "0"
requesturl = apiurl+'createDiffHTML?'+urlencode(data)
html = json.load(urlopen(requesturl))['data']['html']
t = html5lib.parse(html, namespaceHTMLElements=False)
trim_removed_spans(t)
html = ET.tostring(t, method="html")
html, links = linkify(html)
for l in links:
if l not in todo and l not in done:
if args.verbose:
print (" link: {0}".format(l), file=sys.stderr)
todo.append(l)
try:
os.makedirs(args.path)
except OSError:
pass
with open(out, "w") as f:
t = html5lib.parse(html, namespaceHTMLElements=False)
style = t.find(".//style")
if style != None:
style = ET.tostring(style, method="html")
else:
style = ""
body = t.find(".//body")
html = contents(body)
# f.write(html.encode("utf-8"))
f.write(template.render(
html = html,
style = style,
revision = total_revisions,
padid = padid,
timestamp = datetime.now()
).encode("utf-8"))
count += 1
if args.limit and count >= args.limit:
break