working on index dumping

This commit is contained in:
Michael Murtaugh 2015-02-26 17:15:41 +01:00
parent 771d76f67c
commit c406f812ac
2 changed files with 236 additions and 71 deletions

View File

@ -1,4 +1,13 @@
etherdump
=========
Tool to make archival dumps of etherpad pages.
Tool to publish [etherpad](http://etherpad.org/) pages to (archival) HTML.
Requirements
-------------
Python (2.7) with:
* html5lib
* jinja2

294
etherdump
View File

@ -1,49 +1,22 @@
#!/usr/bin/python
#!/usr/bin/env python
from __future__ import print_function
import sys, argparse, json, re, os, time
from urllib2 import urlopen, HTTPError, URLError
import html5lib, urllib2, urllib
from xml.etree import ElementTree as ET
from urllib import urlencode
from urlparse import urljoin
from datetime import datetime
# Baseline pad-server settings (empty hostname, API version 1.2.9, API mounted
# at /api/).  NOTE(review): presumably merged with the padinfo file and the
# command-line options before use -- the code that reads this is not shown here.
PADINFO_DEFAULTS = dict(
    hostname="",
    apiversion="1.2.9",
    apiurl="/api/",
)

# Module-level debug switch; the API helpers echo request URLs to stderr
# when it is true.
verbose = False
def listAllPads (apiURL, apikey):
    """Fetch the IDs of every pad the server reports.

    Requests `listAllPads` below `apiURL` (which must end with a slash),
    authenticating with `apikey`, and returns the `padIDs` entry of the
    response's `data` object.  The request URL is echoed to stderr when the
    module-level `verbose` flag is set.
    """
    query = urlencode({'apikey': apikey})
    request_url = apiURL + 'listAllPads?' + query
    if verbose:
        print (request_url, file=sys.stderr)
    payload = json.load(urlopen(request_url))
    return payload['data']['padIDs']
def listAllGroups (apiURL, apikey):
    """Fetch the IDs of every group the server reports.

    Requests `listAllGroups` below `apiURL` (which must end with a slash),
    authenticating with `apikey`, and returns the `groupIDs` entry of the
    response's `data` object.  The request URL is echoed to stderr when the
    module-level `verbose` flag is set.
    """
    query = urlencode({'apikey': apikey})
    request_url = apiURL + 'listAllGroups?' + query
    if verbose:
        print (request_url, file=sys.stderr)
    payload = json.load(urlopen(request_url))
    return payload['data']['groupIDs']
def getPadText (padID, apiURL, apikey):
    """Return the `text` entry of the server's `getText` reply for `padID`.

    `apiURL` is the versioned API base (ending with a slash) and `apikey`
    the key sent along with the request.
    """
    query = urlencode({'apikey': apikey, 'padID': padID})
    request_url = apiURL + 'getText?' + query
    payload = json.load(urlopen(request_url))
    return payload['data']['text']
def getPadHTML (padID, apiURL, apikey):
    """Return the `html` entry of the server's `getHTML` reply for `padID`.

    `apiURL` is the versioned API base (ending with a slash) and `apikey`
    the key sent along with the request.
    """
    query = urlencode({'apikey': apikey, 'padID': padID})
    request_url = apiURL + 'getHTML?' + query
    payload = json.load(urlopen(request_url))
    return payload['data']['html']
def getPadLastEdited (padID, apiURL, apikey):
    """Return the `lastEdited` value the server reports for `padID`.

    `apiURL` is the versioned API base (ending with a slash) and `apikey`
    the key sent along with the request.  The value is returned exactly as
    the server delivered it; no conversion is applied here.
    """
    query = urlencode({'apikey': apikey, 'padID': padID})
    # The timestamp comes from the getLastEdited endpoint; the getHTML reply
    # that was requested before carries no 'lastEdited' entry to read.
    r = json.load(urlopen(apiURL + 'getLastEdited?' + query))
    return r['data']['lastEdited']
def pad_split_group (n):
m = re.match(r"g\.(\w+)\$(.+)$", n)
if m:
@ -51,7 +24,62 @@ def pad_split_group (n):
else:
return ('', n)
def dumpPads (pub_path, group_path, apiurl, apikey, padids, sleeptime=0.01, skip_existing=False):
def content(tag):
    """Return the inner markup of an ElementTree element as text.

    The result is the element's leading text followed by the serialization
    of each child element (ElementTree includes a child's tail text when
    serializing it), i.e. everything between the element's own start and end
    tags.  An element with neither text nor children yields an empty string.
    """
    # ET.tostring() emits ASCII bytes by default (other characters become
    # numeric character references), so decoding as ASCII is lossless and
    # gives text that can be joined with tag.text on any Python version.
    children = u''.join(ET.tostring(e).decode("ascii") for e in tag)
    if tag.text is None:
        return children
    return tag.text + children
class PadServer (object):
    """Thin client for the HTTP API of a single etherpad server.

    The constructor assembles the versioned API base URL
    (protocol://hostname[:port]<apipath><apiversion>/) and stores the API key;
    every API method sends that key with its request and returns the relevant
    entry of the JSON reply's `data` object.
    """

    def __init__ (self, hostname, port=9001, apipath="/api/", apiversion="1.2.9", apikey=None, secure=False):
        """Store the connection settings and build `self.apiurl`.

        hostname   -- server host name (no protocol prefix)
        port       -- port added to the API URL; a false value omits it
        apipath    -- path the API is mounted at, with leading and trailing slash
        apiversion -- API version segment appended after `apipath`
        apikey     -- key sent with every API request
        secure     -- use https instead of http when true
        """
        self.hostname = hostname
        self.protocol = "https" if secure else "http"
        self.apiurl = self.protocol+"://"+hostname
        if port:
            self.apiurl += ":{0}".format(port)
        self.apiurl += "{0}{1}/".format(apipath, apiversion)
        self.apikey = apikey

    @staticmethod
    def _utf8 (value):
        """Return text `value` as UTF-8 bytes; anything else passes through unchanged."""
        if isinstance(value, bytes) or not hasattr(value, "encode"):
            return value
        return value.encode("utf-8")

    def _url (self, function, **params):
        """Build the request URL for API `function` with the key and `params`."""
        # Text is encoded to UTF-8 first: Python 2's urlencode() str()-converts
        # its values and so fails on unicode pad IDs with non-ASCII characters.
        query = [('apikey', self._utf8(self.apikey))]
        query.extend((key, self._utf8(params[key])) for key in sorted(params))
        return self.apiurl + function + '?' + urlencode(query)

    def _call (self, function, **params):
        """Perform API `function` and return the `data` entry of its JSON reply."""
        return json.load(urlopen(self._url(function, **params)))['data']

    def listAllPads (self):
        """Return the list of all pad IDs reported by the server."""
        return self._call('listAllPads')['padIDs']

    def listAllGroups (self):
        """Return the list of all group IDs reported by the server."""
        return self._call('listAllGroups')['groupIDs']

    def getPadText (self, padID):
        """Return the `text` entry of the getText reply for `padID`."""
        return self._call('getText', padID=padID)['text']

    def getPadHTML (self, padID):
        """Return the `html` entry of the getHTML reply for `padID`."""
        return self._call('getHTML', padID=padID)['html']

    def getPadLastEdited (self, padID):
        """Return the pad's last-edited time as a naive local `datetime`.

        The server value is treated as milliseconds and reduced to whole
        seconds before conversion.
        """
        raw = self._call('getLastEdited', padID=padID)['lastEdited']
        # Floor division keeps the whole-second result the same on Python 2 and 3.
        return datetime.fromtimestamp(int(raw) // 1000)

    def getPadURL (self, padID):
        """Return the browser URL for `padID`.

        Pads whose ID carries a group prefix map to /p/<padID>, all others to
        /public_pad/<padID>.  The URL is built from protocol and hostname
        only; the API port is not included.
        NOTE(review): these paths look specific to one deployment -- confirm.
        """
        group, name = pad_split_group(padID)
        if group:
            return self.protocol+"://"+self.hostname+"/p/"+padID
        else:
            return self.protocol+"://"+self.hostname+"/public_pad/"+padID
def dumpPads (padserver, padids, pub_path, group_path, sleeptime=0.01, skip_existing=False, template=None):
if template != None:
import jinja2
with open(template) as f:
template = jinja2.Template(f.read().decode("utf-8"))
for padid in padids:
group_id, pad_name = pad_split_group(padid)
if group_id:
@ -80,39 +108,47 @@ def dumpPads (pub_path, group_path, apiurl, apikey, padids, sleeptime=0.01, skip
# Write Metadata
meta = {
'padid': padid,
'groupID': group_id,
'padname': pad_name
'pad_id': padid,
'group_id': group_id,
'pad_name': pad_name
}
url = apiurl + "getLastEdited?" + urlencode({'apikey':apikey, 'padID':padid.encode("utf-8")})
try:
resp = json.load(urlopen(url))
meta['lastEdited'] = resp['data']['lastEdited']
except (TypeError, HTTPError, ValueError) as e:
print(u"Warning: unable to load text for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr)
meta['last_edited'] = padserver.getPadLastEdited(padid).isoformat()
# Write Text
with open(fp+".utf8.txt", "w") as f:
url = apiurl + "getText?" + urlencode({'apikey':apikey, 'padID':padid.encode("utf-8")})
textpath = fp + ".txt"
with open(textpath, "w") as f:
try:
resp = json.load(urlopen(url))
text = resp['data']['text'].encode("utf-8")
f.write(text)
text = padserver.getPadText(padid)
f.write(text.encode("utf-8"))
meta['text_path'] = textpath
meta['text_length'] = len(text)
meta['text_length_human'] = humanize_bytes(meta['text_length'])
except (TypeError, HTTPError, ValueError) as e:
print(u"Warning: unable to load text for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr)
with open(fp+ ".utf8.html", "w") as f:
url = apiurl + "getHTML?" + urlencode({'apikey':apikey, 'padID':padid.encode("utf-8")})
try:
resp = json.load(urlopen(url))
text = resp['data']['html'].encode("utf-8")
f.write(text)
meta['html_length'] = len(text)
htmlpath = fp+".html"
with open(htmlpath, "w") as f:
html = padserver.getPadHTML(padid)
meta['html_path'] = htmlpath
meta['html_length'] = len(html)
if template:
t = html5lib.parse(html, treebuilder="etree", namespaceHTMLElements=False)
body = t.find(".//body")
title = padid
editurl = padserver.getPadURL(padid)
meta['url'] = editurl
f.write(template.render(
body=content(body),
title=title,
editurl=editurl,
sourceurl=textpath,
metadata_json=json.dumps(meta))) # unicode error HERE!
else:
f.write(html.encode("utf-8"))
except (TypeError, HTTPError, ValueError) as e:
print(u"Warning: unable to load HTML for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr)
# except (TypeError, HTTPError, ValueError) as e:
# print(u"Warning: unable to load HTML for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr)
with open(fp+".json", "w") as f:
f.write(json.dumps(meta))
@ -120,6 +156,56 @@ def dumpPads (pub_path, group_path, apiurl, apikey, padids, sleeptime=0.01, skip
if sleeptime:
time.sleep(sleeptime)
def humanize_bytes(bytes, precision=0):
    """Return a humanized string representation of a number of bytes.

    The value is scaled by the largest binary unit (1 kb = 1024 bytes) that
    does not exceed it and formatted with `precision` decimal places.  The
    division is always a true division, so the result does not depend on
    `from __future__ import division` being active in this module.

    >>> humanize_bytes(1)
    '1 byte'
    >>> humanize_bytes(0)
    '0 bytes'
    >>> humanize_bytes(1024)
    '1 kb'
    >>> humanize_bytes(1024*123, 1)
    '123.0 kb'
    >>> humanize_bytes(1024*12342, 1)
    '12.1 Mb'
    >>> humanize_bytes(1024*12342, 2)
    '12.05 Mb'
    >>> humanize_bytes(1024*1234, 2)
    '1.21 Mb'
    >>> humanize_bytes(1024*1234*1111, 2)
    '1.31 Gb'
    >>> humanize_bytes(1024*1234*1111, 1)
    '1.3 Gb'
    """
    abbrevs = (
        (1 << 50, 'Petabyte'),
        (1 << 40, 'Tb'),
        (1 << 30, 'Gb'),
        (1 << 20, 'Mb'),
        (1 << 10, 'kb'),
        (1, 'bytes'),
    )
    if bytes == 1:
        return '1 byte'
    for factor, suffix in abbrevs:
        if bytes >= factor:
            break
    # Values below 1 match no entry and leave the loop with the last pair,
    # so they are reported unscaled in 'bytes'.
    return '%.*f %s' % (precision, float(bytes) / factor, suffix)
def padids_from_path (path):
    """Load every `*.json` file directly inside `path`.

    Files are read in sorted filename order.  Each parsed JSON object gets a
    `path` entry holding the file it was loaded from, and the objects are
    returned as a list (empty when nothing matches).
    """
    from glob import glob
    records = []
    for json_path in sorted(glob(os.path.join(path, "*.json"))):
        with open(json_path) as handle:
            record = json.load(handle)
        record['path'] = json_path
        records.append(record)
    return records
if __name__ == "__main__":
parser = argparse.ArgumentParser()
@ -135,8 +221,21 @@ if __name__ == "__main__":
parser.add_argument('--verbose', action="store_true", default=False, help='debug mode, verbose output')
parser.add_argument('--pubpath', default="pub", help='path to dump public pads')
parser.add_argument('--grouppath', default="priv", help='path to dump group pads')
parser.add_argument('--human', default=False, action="store_true", help='output for reading')
parser.add_argument('--lines', default=False, action="store_true", help='output one per line instead of JSON')
parser.add_argument('--skip-existing', default=False, action="store_true", help='skip existing files on dump')
parser.add_argument('--limit', default=None, type=int, help='(dump) stop after limit items')
# DUMP
parser.add_argument('--template', default="templates/pad.html", help='path for (dump) template, default: templates/pad.html')
# OPTIONS SPECIFIC TO CREATEINDEX
parser.add_argument('--exclude-groups', default=False, action="store_true", help='(createindex) ignore groups')
parser.add_argument('--groupinfo', default=None, help='(createindex) groupinfo json file')
parser.add_argument('--indextemplate', default="templates/index.html", help='(createindex) path for template, default: templates/index.html')
parser.add_argument('--indextitle', default="etherpad archive & index", help='(createindex) title')
parser.add_argument('--indexcss', default="styles.css", help='(createindex) index: css url')
parser.add_argument('--output', default=None, help='(createindex) path for output (default stdout)')
args = parser.parse_args()
@ -153,6 +252,7 @@ if __name__ == "__main__":
print("WARNING: Error reading {0}, check the JSON syntax ({1})".format(args.padinfo, e))
# allow explicit opts to override
if args.hostname:
padinfo['hostname'] = args.hostname
if args.port:
@ -164,15 +264,16 @@ if __name__ == "__main__":
if args.apiurl:
padinfo['apiurl'] = args.apiurl
# Construct the base API URL
apiurl = "http://" + padinfo.get("hostname")
if padinfo.get("port"):
apiurl += ":{0}".format(padinfo['port'])
apiurl += "{0}{1}/".format(padinfo['apiurl'], padinfo['apiversion'])
padserver = PadServer(
hostname=padinfo.get("hostname"),
port=padinfo.get("port"),
apipath=padinfo.get("apiurl"),
apiversion=padinfo.get("apiversion"),
apikey=padinfo.get("apikey")
)
if verbose:
print ("Connecting to {0}".format(apiurl), file=sys.stderr)
print ("Connecting to {0}".format(padserver.apiurl), file=sys.stderr)
###############################
# Command Dispatch
@ -180,16 +281,16 @@ if __name__ == "__main__":
cmd = args.command.lower()
if cmd == "listpads":
padids = listAllPads(apiurl, apikey)
if not args.human:
padids = padserver.listAllPads()
if not args.lines:
json.dump(padids, sys.stdout)
else:
for padid in padids:
print(padid)
elif cmd == "listgroups":
groupids = listAllGroups(apiurl, apikey)
if not args.human:
groupids = padserver.listAllGroups()
if not args.lines:
json.dump(groupids, sys.stdout)
else:
for gid in groupids:
@ -197,10 +298,65 @@ if __name__ == "__main__":
elif cmd == "dump":
start = time.time()
padids = listAllPads(apiurl, apikey)
dumpPads(args.pubpath, args.grouppath, apiurl, apikey, padids, args.skip_existing)
padids = padserver.listAllPads()
if args.limit:
padids = padids[:args.limit]
dumpPads(
padserver,
padids,
args.pubpath,
args.grouppath,
args.skip_existing,
template=args.template)
if verbose:
print ("Completed in {0:0.0f} seconds".format(time.time()-start), file=sys.stderr)
elif cmd == "createindex":
def get_pads(groupinfo=None):
pads = padids_from_path(args.pubpath)
print (("padids_from_path", args.pubpath, pads), file=sys.stderr)
if not args.exclude_groups and os.path.exists(args.grouppath):
groups = [os.path.join(args.grouppath, x) for x in os.listdir(args.grouppath)]
groups = [x for x in groups if os.path.isdir(x)]
groups.sort()
for gp in groups:
if groupinfo:
b = os.path.basename(gp)
if b not in groupinfo:
continue
try:
pads.extend(padids_from_path(gp))
except OSError:
pass
return pads
groupinfo = None
if args.groupinfo:
with open(args.groupinfo) as gif:
groupinfo = json.load(gif)
pads = get_pads(groupinfo)
padids = [(x.get("pad_name").lower(), x) for x in pads]
padids.sort()
pads = [x[1] for x in padids]
out = sys.stdout
if args.output:
out = open(args.output, "w")
import jinja2
with open(args.indextemplate) as f:
template = jinja2.Template(f.read().decode("utf-8"))
out.write(template.render(
title=args.indextitle,
css=args.indexcss,
pads = pads
))
if args.output:
output.close()
else:
print ("Command '{0}' not understood, try: listallpads, listallgroups, dumpallpads".format(args.command), file=sys.stderr)
print ("Command '{0}' not understood, try: listpads, listgroups, dump".format(args.command), file=sys.stderr)