etherpump/dump_html.py

102 lines
3.1 KiB
Python
Raw Normal View History

2015-07-23 18:09:20 +02:00
#!/usr/bin/env python
from __future__ import print_function
from argparse import ArgumentParser
import json, sys, os
2015-07-23 18:34:36 +02:00
from datetime import datetime
import html5lib
2015-07-23 18:09:20 +02:00
from urllib import urlencode
from urllib2 import urlopen, HTTPError, URLError
from xml.etree import cElementTree as ET
from trim import trim_removed_spans, contents
from linkify import linkify, urlify
2015-07-23 18:34:36 +02:00
import jinja2
2015-07-23 18:09:20 +02:00
# Command-line interface: one positional pad id plus options controlling the
# padinfo file, the output directory, verbosity, a pad limit and the template.
p = ArgumentParser(prog="")
p.add_argument(
    "padid",
    help="the padid",
)
p.add_argument(
    "--padinfo",
    default="padinfo.json",
    help="padinfo, default: padinfo.json",
)
p.add_argument(
    "--path",
    default="output",
    help="path to save files, default: output",
)
# store_true already defaults to False when the flag is absent.
p.add_argument("--verbose", action="store_true")
# Without an explicit default an omitted --limit is None.
p.add_argument("--limit", type=int)
# Template directory and the template file name looked up inside it.
p.add_argument("--templates", default="templates")
p.add_argument("--template", default="pad_html.html")
args = p.parse_args()
2015-07-23 18:34:36 +02:00
def get_template_env(tpath=None):
    """Return a jinja2 Environment backed by a FileSystemLoader.

    *tpath* becomes the loader's only search path when it is truthy and names
    an existing directory; in every other case the loader is created with an
    empty search path.
    """
    search_paths = [tpath] if tpath and os.path.isdir(tpath) else []
    return jinja2.Environment(loader=jinja2.FileSystemLoader(search_paths))
2015-07-23 18:09:20 +02:00
# Read the padinfo JSON and build the base URL every API request is appended to.
with open(args.padinfo) as padinfo_file:
    info = json.load(padinfo_file)
apiurl = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info)

# Crawl state: the queue starts with the requested pad, nothing has been
# handled yet, and the processed-pad counter starts at zero.
todo, done, count = [args.padid], set(), 0

# Resolve the output template once, up front.
env = get_template_env(args.templates)
template = env.get_template(args.template)
2015-07-23 18:09:20 +02:00
# Work through the queue front to back: for each pad, fetch its revision count
# and diff HTML from the API, queue any links not seen before, render the pad
# through the template and write the result to <path>/<urlify(padid)>.
while todo:
    # Take the next pad from the front of the queue and record it as handled
    # so later links back to it are not queued again.
    padid = todo.pop(0)
    done.add(padid)

    # Query parameters shared by both API calls below.
    # NOTE(review): under Python 2 a pad id coming from the command line is a
    # byte string, and .encode() on a non-ASCII byte string raises
    # UnicodeDecodeError -- confirm whether non-ASCII pad ids must be supported.
    data = {
        'apikey': info['apikey'],
        'padID': padid.encode("utf-8"),
    }

    out = "{0}/{1}".format(args.path, urlify(padid))
    print("{0}".format(out), file=sys.stderr)

    revisions_url = apiurl + 'getRevisionsCount?' + urlencode(data)
    total_revisions = json.load(urlopen(revisions_url))['data']['revisions']
    if args.verbose:
        print(u"{0} has {1} total revisions...".format(padid, total_revisions).encode("utf-8"), file=sys.stderr)

    # Ask createDiffHTML for the diff starting at revision 0.
    data['startRev'] = "0"
    requesturl = apiurl + 'createDiffHTML?' + urlencode(data)
    html = json.load(urlopen(requesturl))['data']['html']

    # Parse the markup, let trim_removed_spans edit the tree in place, then
    # serialise back to a string because linkify works on markup text.
    t = html5lib.parse(html, namespaceHTMLElements=False)
    trim_removed_spans(t)
    html = ET.tostring(t, method="html")

    # linkify returns the rewritten markup together with the links it found;
    # queue every link that is neither pending nor already handled.
    html, links = linkify(html)
    for link in links:
        if link not in todo and link not in done:
            if args.verbose:
                print(" link: {0}".format(link), file=sys.stderr)
            todo.append(link)

    # Build the complete page BEFORE touching the output file, so a parse or
    # template failure cannot leave a truncated or empty file behind.
    t = html5lib.parse(html, namespaceHTMLElements=False)
    style = t.find(".//style")
    # Compare against None explicitly: the truth value of an Element is not a
    # reliable "was it found" test.
    if style is not None:
        style = ET.tostring(style, method="html")
    else:
        style = ""
    body = t.find(".//body")
    html = contents(body)
    rendered = template.render(
        html=html,
        style=style,
        revision=total_revisions,
        padid=padid,
        timestamp=datetime.now()
    ).encode("utf-8")

    # Create the output directory on demand. "Already exists" is expected on
    # every pad after the first; any other failure is re-raised instead of
    # being silently swallowed.
    try:
        os.makedirs(args.path)
    except OSError:
        if not os.path.isdir(args.path):
            raise
    with open(out, "w") as f:
        f.write(rendered)

    # Stop once --limit pads have been written (no limit when it is unset or 0).
    count += 1
    if args.limit and count >= args.limit:
        break