You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
69 lines
2.1 KiB
69 lines
2.1 KiB
#!/usr/bin/env python
|
|
from __future__ import print_function
|
|
from argparse import ArgumentParser
|
|
import json, sys, os
|
|
from urllib import urlencode
|
|
from urllib2 import urlopen, HTTPError, URLError
|
|
from xml.etree import cElementTree as ET
|
|
import html5lib
|
|
from trim import trim_removed_spans, contents
|
|
from linkify import linkify, urlify
|
|
|
|
|
|
# Command-line interface.
# NOTE: the first positional parameter of ArgumentParser is ``prog``, so the
# program name displayed in usage/help output is the empty string.
p = ArgumentParser(prog="")

# Positional: the pad to start from.
p.add_argument(
    "padid",
    help="the padid",
)

# JSON file describing the API endpoint (read right after parsing).
p.add_argument(
    "--padinfo",
    help="padinfo, default: padinfo.json",
    default="padinfo.json",
)

# Directory the rendered pads are written into.
p.add_argument(
    "--path",
    help="path to save files, default: output",
    default="output",
)

# Extra progress output on stderr.
p.add_argument(
    "--verbose",
    action="store_true",
    default=False,
)

# Optional cap on the number of pads processed; None (the default) or 0
# means no cap.
p.add_argument(
    "--limit",
    default=None,
    type=int,
)

args = p.parse_args()
|
|
|
|
# Read the API connection settings. The JSON object must provide the keys
# used below ("protocol", "hostname", "port", "apiurl", "apiversion"); the
# rest of the script additionally reads "apikey" from it.
with open(args.padinfo) as padinfo_file:
    info = json.load(padinfo_file)

# Base URL every API method name gets appended to, e.g.
# <protocol>://<hostname>:<port><apiurl><apiversion>/
apiurl = (
    "{0[protocol]}://{0[hostname]}:{0[port]}"
    "{0[apiurl]}{0[apiversion]}/"
).format(info)
|
|
|
|
def _api_call(method, params):
    """Call one HTTP API method and return the ``data`` member of its reply.

    ``method`` is the API method name appended to the module-level ``apiurl``
    and ``params`` is a dict that gets url-encoded into the query string.

    Raises RuntimeError when the reply carries no ``data`` object.
    NOTE(review): this presumes the server reports failures (e.g. an unknown
    pad) as a JSON reply with a null ``data`` and a ``message`` field --
    confirm against the API documentation of the server in use.
    """
    response = urlopen(apiurl + method + '?' + urlencode(params))
    try:
        reply = json.load(response)
    finally:
        # urllib2 response objects are not context managers, so close the
        # connection explicitly instead of leaking it until garbage collection.
        response.close()
    if reply.get('data') is None:
        raise RuntimeError("%s failed: %r" % (method, reply.get('message')))
    return reply['data']


# Make sure the output directory exists once, up front. Only the "already
# exists" case is tolerated; any other failure (permissions, a file in the
# way, ...) is re-raised instead of being silently ignored.
try:
    os.makedirs(args.path)
except OSError:
    if not os.path.isdir(args.path):
        raise

# Breadth-first crawl state: pads still to fetch, pads already fetched, and
# the number of pads written so far (compared against --limit).
todo = [args.padid]
done = set()
count = 0

while todo:
    padid = todo.pop(0)
    done.add(padid)

    data = {
        'apikey': info['apikey'],
        'padID': padid.encode("utf-8"),
    }

    out = "{0}/{1}".format(args.path, urlify(padid))
    print("{0}".format(out), file=sys.stderr)

    # The revision count is purely informational, so only spend the extra
    # HTTP round-trip when it is actually going to be displayed.
    if args.verbose:
        total_revisions = _api_call('getRevisionsCount', data)['revisions']
        print(u"{0} has {1} total revisions...".format(padid, total_revisions).encode("utf-8"), file=sys.stderr)

    # Ask for the diff starting at revision 0, parse it, let
    # trim_removed_spans() process the tree, then serialize it back to HTML.
    data['startRev'] = "0"
    html = _api_call('createDiffHTML', data)['html']
    t = html5lib.parse(html, namespaceHTMLElements=False)
    trim_removed_spans(t)
    html = ET.tostring(t, method="html")

    # linkify() returns the (possibly rewritten) markup plus the links it
    # found; every link not yet queued or processed is scheduled.
    html, links = linkify(html)
    for link in links:
        if link not in todo and link not in done:
            if args.verbose:
                print("  link: {0}".format(link), file=sys.stderr)
            todo.append(link)

    # NOTE(review): the .encode() below presumes linkify() hands back text
    # that can be encoded as UTF-8 -- confirm against linkify's return type.
    with open(out, "w") as f:
        f.write(html.encode("utf-8"))

    count += 1
    if args.limit and count >= args.limit:
        break
|
|
|