#!/usr/bin/env python
"""Save the diff HTML of a pad, and of every pad it links to, as files.

Starting from the pad id given on the command line, the script asks the
pad server's HTTP API (``getRevisionsCount`` and ``createDiffHTML``) for
each pad, parses the returned HTML with html5lib, passes the tree through
``trim_removed_spans``, serialises it again and runs it through
``linkify``.  Every link ``linkify`` reports is queued as a further pad id
to fetch, so the crawl proceeds breadth-first until the queue is empty or
``--limit`` pads have been written.

Connection details (protocol, hostname, port, apiurl, apiversion, apikey)
are read from the JSON file named by ``--padinfo``.

This is Python 2 code (``urllib2``, ``urllib.urlencode``).
"""
from __future__ import print_function

from argparse import ArgumentParser
from collections import deque
from contextlib import closing
import json
import os
import sys
from urllib import urlencode
from urllib2 import urlopen, HTTPError, URLError
from xml.etree import cElementTree as ET

import html5lib

from linkify import linkify, urlify
from trim import trim_removed_spans, contents


def _ensure_dir(path):
    """Create *path* (with parents), tolerating only "already exists"."""
    try:
        os.makedirs(path)
    except OSError:
        # An existing directory is fine; anything else (permissions, a
        # regular file in the way, ...) is a real error and must surface.
        if not os.path.isdir(path):
            raise


def _api_call(apiurl, method, params):
    """Call one API *method* and return the ``data`` object of its reply.

    Raises ``ValueError`` when the reply is not JSON or carries no ``data``
    object, and lets ``HTTPError``/``URLError`` from ``urlopen`` propagate.
    """
    url = apiurl + method + '?' + urlencode(params)
    # urllib2 responses are not context managers in Python 2, hence closing().
    with closing(urlopen(url)) as response:
        reply = json.load(response)
    data = reply.get('data') if isinstance(reply, dict) else None
    if not isinstance(data, dict):
        raise ValueError(
            "{0}: no usable 'data' in API reply: {1!r}".format(method, reply))
    return data


def _fetch_diff_html(apiurl, apikey, padid, verbose=False):
    """Return the ``createDiffHTML`` output for *padid* from revision 0."""
    params = {}
    params['apikey'] = apikey
    # NOTE(review): under Python 2, .encode() on a non-ASCII *byte* string
    # (e.g. a pad id taken straight from argv) raises UnicodeDecodeError;
    # confirm which string types reach this point.
    params['padID'] = padid.encode("utf-8")

    total_revisions = _api_call(apiurl, 'getRevisionsCount', params)['revisions']
    if verbose:
        print(u"{0} has {1} total revisions...".format(padid, total_revisions).encode("utf-8"), file=sys.stderr)

    params['startRev'] = "0"
    return _api_call(apiurl, 'createDiffHTML', params)['html']


def main():
    """Parse the command line and run the crawl.

    Returns 0 when every pad that was attempted could be fetched, 1 when at
    least one had to be skipped.
    """
    p = ArgumentParser(
        description="Save the diff HTML of a pad and of the pads it links to.")
    p.add_argument("padid", help="the padid")
    p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json")
    p.add_argument("--path", default="output", help="path to save files, default: output")
    p.add_argument("--verbose", default=False, action="store_true")
    p.add_argument("--limit", type=int, default=None)
    args = p.parse_args()

    with open(args.padinfo) as f:
        info = json.load(f)

    apiurl = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info)

    # Create the output directory once, up front, instead of on every pad.
    _ensure_dir(args.path)

    todo = deque([args.padid])
    # Every pad id that was ever queued (pending or already processed), so a
    # pad is never fetched twice.
    seen = set(todo)
    count = 0
    failed = 0

    while todo:
        padid = todo.popleft()

        out = "{0}/{1}".format(args.path, urlify(padid))
        print("{0}".format(out), file=sys.stderr)

        try:
            html = _fetch_diff_html(apiurl, info['apikey'], padid, args.verbose)
        except (HTTPError, URLError, ValueError, KeyError) as err:
            # A linked pad that cannot be fetched must not abort the pads
            # still waiting in the queue; report it and move on.
            failed += 1
            print("  failed: {0!r}: {1}".format(padid, err), file=sys.stderr)
            continue

        t = html5lib.parse(html, namespaceHTMLElements=False)
        trim_removed_spans(t)
        html = ET.tostring(t, method="html")

        html, links = linkify(html)
        for l in links:
            if l not in seen:
                if args.verbose:
                    print("  link: {0}".format(l), file=sys.stderr)
                seen.add(l)
                todo.append(l)

        with open(out, "w") as f:
            f.write(html.encode("utf-8"))

        count += 1
        # --limit counts pads actually written, not pads attempted.
        if args.limit and count >= args.limit:
            break

    return 1 if failed else 0


if __name__ == "__main__":
    sys.exit(main())