#!/usr/bin/env python from __future__ import print_function from argparse import ArgumentParser import json, sys, os from datetime import datetime import html5lib from urllib import urlencode from urllib2 import urlopen, HTTPError, URLError from xml.etree import cElementTree as ET from trim import trim_removed_spans, contents from linkify import linkify, urlify import jinja2 p = ArgumentParser("") p.add_argument("padid", help="the padid") p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json") p.add_argument("--path", default="output", help="path to save files, default: output") p.add_argument("--verbose", default=False, action="store_true") p.add_argument("--limit", type=int, default=None) p.add_argument("--templates", default="templates") p.add_argument("--template", default="pad_html.html") args = p.parse_args() def get_template_env (tpath=None): paths = [] if tpath and os.path.isdir(tpath): paths.append(tpath) # paths.append(TEMPLATES_PATH) loader = jinja2.FileSystemLoader(paths) env = jinja2.Environment(loader=loader) return env with open(args.padinfo) as f: info = json.load(f) apiurl = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info) todo = [args.padid] done = set() count = 0 env = get_template_env(args.templates) template = env.get_template(args.template) while len(todo) > 0: padid = todo[0] todo = todo[1:] done.add(padid) data = {} data['apikey'] = info['apikey'] data['padID'] = padid.encode("utf-8") out = "{0}/{1}".format(args.path, urlify(padid)) print ("{0}".format(out), file=sys.stderr) total_revisions = apiurl+'getRevisionsCount?'+urlencode(data) total_revisions = json.load(urlopen(total_revisions))['data']['revisions'] if args.verbose: print (u"{0} has {1} total revisions...".format(padid, total_revisions).encode("utf-8"), file=sys.stderr) data['startRev'] = "0" requesturl = apiurl+'createDiffHTML?'+urlencode(data) html = json.load(urlopen(requesturl))['data']['html'] t = html5lib.parse(html, namespaceHTMLElements=False) trim_removed_spans(t) html = ET.tostring(t, method="html") html, links = linkify(html) for l in links: if l not in todo and l not in done: if args.verbose: print (" link: {0}".format(l), file=sys.stderr) todo.append(l) try: os.makedirs(args.path) except OSError: pass with open(out, "w") as f: t = html5lib.parse(html, namespaceHTMLElements=False) style = t.find(".//style") if style != None: style = ET.tostring(style, method="html") else: style = "" body = t.find(".//body") html = contents(body) # f.write(html.encode("utf-8")) f.write(template.render( html = html, style = style, revision = total_revisions, padid = padid, timestamp = datetime.now() ).encode("utf-8")) count += 1 if args.limit and count >= args.limit: break