working on index dumping

This commit is contained in:
Michael Murtaugh 2015-02-26 17:15:41 +01:00
parent 771d76f67c
commit c406f812ac
2 changed files with 236 additions and 71 deletions

View File

@ -1,4 +1,13 @@
etherdump etherdump
========= =========
Tool to make archival dumps of etherpad pages. Tool to publish [etherpad](http://etherpad.org/) pages to (archival) HTML.
Requirements
-------------
Python (2.7) with:
* html5lib
* jinja2

296
etherdump
View File

@ -1,49 +1,22 @@
#!/usr/bin/python #!/usr/bin/env python
from __future__ import print_function from __future__ import print_function
import sys, argparse, json, re, os, time import sys, argparse, json, re, os, time
from urllib2 import urlopen, HTTPError, URLError from urllib2 import urlopen, HTTPError, URLError
import html5lib, urllib2, urllib import html5lib, urllib2, urllib
from xml.etree import ElementTree as ET
from urllib import urlencode from urllib import urlencode
from urlparse import urljoin from urlparse import urljoin
from datetime import datetime
PADINFO_DEFAULTS = { PADINFO_DEFAULTS = {
"hostname": "", "hostname": "",
"apiversion": "1.2.9", "apiversion": "1.2.9",
"apiurl": "/api/" "apiurl": "/api/"
} }
verbose = False verbose = False
def listAllPads (apiURL, apikey):
    """Return the list of all pad IDs known to the Etherpad server.

    apiURL is the versioned API base URL (ending in '/'), apikey the
    server's API key.  Logs the request URL to stderr when verbose.
    """
    request_url = apiURL + 'listAllPads?' + urlencode({'apikey': apikey})
    if verbose:
        print (request_url, file=sys.stderr)
    return json.load(urlopen(request_url))['data']['padIDs']
def listAllGroups (apiURL, apikey):
    """Return the list of all group IDs known to the Etherpad server.

    apiURL is the versioned API base URL (ending in '/'), apikey the
    server's API key.  Logs the request URL to stderr when verbose.
    """
    request_url = apiURL + 'listAllGroups?' + urlencode({'apikey': apikey})
    if verbose:
        print (request_url, file=sys.stderr)
    return json.load(urlopen(request_url))['data']['groupIDs']
def getPadText (padID, apiURL, apikey):
    """Fetch the plain-text content of pad padID via the getText endpoint."""
    query = urlencode({'apikey': apikey, 'padID': padID})
    response = json.load(urlopen(apiURL + 'getText?' + query))
    return response['data']['text']
def getPadHTML (padID, apiURL, apikey):
    """Fetch the HTML export of pad padID via the getHTML endpoint."""
    query = urlencode({'apikey': apikey, 'padID': padID})
    response = json.load(urlopen(apiURL + 'getHTML?' + query))
    return response['data']['html']
def getPadLastEdited (padID, apiURL, apikey):
    """Return the raw lastEdited timestamp (ms since epoch) of pad padID.

    Queries the Etherpad API's getLastEdited endpoint.
    """
    # BUG FIX: this previously called the getHTML endpoint, whose response
    # carries no 'lastEdited' field, so the lookup always raised KeyError.
    # The PadServer.getPadLastEdited method in this same file uses the
    # correct getLastEdited endpoint.
    url = apiURL + 'getLastEdited?' + urlencode({'apikey': apikey, 'padID': padID})
    r = json.load(urlopen(url))
    return r['data']['lastEdited']
def pad_split_group (n): def pad_split_group (n):
m = re.match(r"g\.(\w+)\$(.+)$", n) m = re.match(r"g\.(\w+)\$(.+)$", n)
if m: if m:
@ -51,7 +24,62 @@ def pad_split_group (n):
else: else:
return ('', n) return ('', n)
def content(tag):
    """Serialize an ElementTree element's inner content as unicode.

    Returns tag.text (when present) followed by the markup of every
    child element of tag.
    """
    children = u''.join(ET.tostring(child) for child in tag)
    if tag.text is None:
        return children
    return tag.text + children
class PadServer (object):
    """Client for an Etherpad server's HTTP API (v1.2.x).

    Builds the versioned API base URL from the connection parameters and
    exposes one method per API endpoint used by this tool.
    """

    def __init__ (self, hostname, port=9001, apipath="/api/", apiversion="1.2.9", apikey=None, secure=False):
        self.hostname = hostname
        self.protocol = "https" if secure else "http"
        base = self.protocol + "://" + hostname
        if port:
            base += ":{0}".format(port)
        self.apiurl = base + "{0}{1}/".format(apipath, apiversion)
        self.apikey = apikey

    def _call (self, endpoint, **params):
        # Internal helper: perform one API request and return the decoded
        # 'data' payload of the JSON response.
        params['apikey'] = self.apikey
        url = self.apiurl + endpoint + '?' + urlencode(params)
        return json.load(urlopen(url))['data']

    def listAllPads (self):
        """Return the list of all pad IDs on the server."""
        return self._call('listAllPads')['padIDs']

    def listAllGroups (self):
        """Return the list of all group IDs on the server."""
        return self._call('listAllGroups')['groupIDs']

    def getPadText (self, padID):
        """Return the plain-text content of pad padID."""
        return self._call('getText', padID=padID)['text']

    def getPadHTML (self, padID):
        """Return the HTML export of pad padID."""
        return self._call('getHTML', padID=padID)['html']

    def getPadLastEdited (self, padID):
        """Return the pad's last-edit time as a datetime (server reports ms)."""
        millis = self._call('getLastEdited', padID=padID)['lastEdited']
        return datetime.fromtimestamp(int(millis)/1000)

    def getPadURL (self, padID):
        """Return the public browsing URL for padID (group pads use /p/)."""
        group, name = pad_split_group(padID)
        path = "/p/" if group else "/public_pad/"
        return self.protocol + "://" + self.hostname + path + padID
def dumpPads (padserver, padids, pub_path, group_path, sleeptime=0.01, skip_existing=False, template=None):
if template != None:
import jinja2
with open(template) as f:
template = jinja2.Template(f.read().decode("utf-8"))
for padid in padids: for padid in padids:
group_id, pad_name = pad_split_group(padid) group_id, pad_name = pad_split_group(padid)
if group_id: if group_id:
@ -80,39 +108,47 @@ def dumpPads (pub_path, group_path, apiurl, apikey, padids, sleeptime=0.01, skip
# Write Metadata # Write Metadata
meta = { meta = {
'padid': padid, 'pad_id': padid,
'groupID': group_id, 'group_id': group_id,
'padname': pad_name 'pad_name': pad_name
} }
url = apiurl + "getLastEdited?" + urlencode({'apikey':apikey, 'padID':padid.encode("utf-8")}) meta['last_edited'] = padserver.getPadLastEdited(padid).isoformat()
try:
resp = json.load(urlopen(url))
meta['lastEdited'] = resp['data']['lastEdited']
except (TypeError, HTTPError, ValueError) as e:
print(u"Warning: unable to load text for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr)
# Write Text # Write Text
with open(fp+".utf8.txt", "w") as f: textpath = fp + ".txt"
url = apiurl + "getText?" + urlencode({'apikey':apikey, 'padID':padid.encode("utf-8")}) with open(textpath, "w") as f:
try: try:
resp = json.load(urlopen(url)) text = padserver.getPadText(padid)
text = resp['data']['text'].encode("utf-8") f.write(text.encode("utf-8"))
f.write(text) meta['text_path'] = textpath
meta['text_length'] = len(text) meta['text_length'] = len(text)
meta['text_length_human'] = humanize_bytes(meta['text_length'])
except (TypeError, HTTPError, ValueError) as e: except (TypeError, HTTPError, ValueError) as e:
print(u"Warning: unable to load text for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr) print(u"Warning: unable to load text for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr)
with open(fp+ ".utf8.html", "w") as f: htmlpath = fp+".html"
url = apiurl + "getHTML?" + urlencode({'apikey':apikey, 'padID':padid.encode("utf-8")}) with open(htmlpath, "w") as f:
try: html = padserver.getPadHTML(padid)
resp = json.load(urlopen(url)) meta['html_path'] = htmlpath
text = resp['data']['html'].encode("utf-8") meta['html_length'] = len(html)
f.write(text) if template:
meta['html_length'] = len(text) t = html5lib.parse(html, treebuilder="etree", namespaceHTMLElements=False)
body = t.find(".//body")
title = padid
editurl = padserver.getPadURL(padid)
meta['url'] = editurl
f.write(template.render(
body=content(body),
title=title,
editurl=editurl,
sourceurl=textpath,
metadata_json=json.dumps(meta))) # unicode error HERE!
else:
f.write(html.encode("utf-8"))
except (TypeError, HTTPError, ValueError) as e: # except (TypeError, HTTPError, ValueError) as e:
print(u"Warning: unable to load HTML for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr) # print(u"Warning: unable to load HTML for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr)
with open(fp+".json", "w") as f: with open(fp+".json", "w") as f:
f.write(json.dumps(meta)) f.write(json.dumps(meta))
@ -120,6 +156,56 @@ def dumpPads (pub_path, group_path, apiurl, apikey, padids, sleeptime=0.01, skip
if sleeptime: if sleeptime:
time.sleep(sleeptime) time.sleep(sleeptime)
def humanize_bytes(bytes, precision=0):
    """Return a humanized string representation of a number of bytes.

    >>> humanize_bytes(1)
    '1 byte'
    >>> humanize_bytes(1024)
    '1 kB'
    >>> humanize_bytes(1024*123)
    '123 kB'
    >>> humanize_bytes(1024*12342)
    '12 MB'
    >>> humanize_bytes(1024*12342, 2)
    '12.05 MB'
    >>> humanize_bytes(1024*1234, 2)
    '1.21 MB'
    >>> humanize_bytes(1024*1234*1111, 2)
    '1.31 GB'
    >>> humanize_bytes(1024*1234*1111, 1)
    '1.3 GB'
    """
    # FIXES: the suffix table previously mixed 'Petabyte'/'Tb'/'Gb'/'Mb'/'kb'
    # while the doctests claimed 'kB'/'MB'/'GB'; the doctests also showed
    # '1.0 kB' although the default precision is 0; the '1<<50L' long
    # literal is Python-2-only syntax; and 'bytes / factor' was integer
    # division under Python 2 (this file never imports true division),
    # which broke non-zero precision.  Division is now forced to float.
    abbrevs = (
        (1 << 50, 'PB'),
        (1 << 40, 'TB'),
        (1 << 30, 'GB'),
        (1 << 20, 'MB'),
        (1 << 10, 'kB'),
        (1, 'bytes'),
    )
    if bytes == 1:
        return '1 byte'
    for factor, suffix in abbrevs:
        if bytes >= factor:
            break
    return '%.*f %s' % (precision, bytes / float(factor), suffix)
def padids_from_path (path):
    """Load every .json pad-metadata file under path, sorted by filename.

    Each returned dict is the parsed JSON contents of one file, with an
    extra 'path' key recording which file it was read from.
    """
    from glob import glob
    pads = []
    for metapath in sorted(glob(os.path.join(path, "*.json"))):
        with open(metapath) as metafile:
            record = json.load(metafile)
        record['path'] = metapath
        pads.append(record)
    return pads
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
@ -135,8 +221,21 @@ if __name__ == "__main__":
parser.add_argument('--verbose', action="store_true", default=False, help='debug mode, verbose output') parser.add_argument('--verbose', action="store_true", default=False, help='debug mode, verbose output')
parser.add_argument('--pubpath', default="pub", help='path to dump public pads') parser.add_argument('--pubpath', default="pub", help='path to dump public pads')
parser.add_argument('--grouppath', default="priv", help='path to dump group pads') parser.add_argument('--grouppath', default="priv", help='path to dump group pads')
parser.add_argument('--human', default=False, action="store_true", help='output for reading') parser.add_argument('--lines', default=False, action="store_true", help='output one per line instead of JSON')
parser.add_argument('--skip-existing', default=False, action="store_true", help='skip existing files on dump') parser.add_argument('--skip-existing', default=False, action="store_true", help='skip existing files on dump')
parser.add_argument('--limit', default=None, type=int, help='(dump) stop after limit items')
# DUMP
parser.add_argument('--template', default="templates/pad.html", help='path for (dump) template, default: templates/pad.html')
# OPTIONS SPECIFIC TO CREATEINDEX
parser.add_argument('--exclude-groups', default=False, action="store_true", help='(createindex) ignore groups')
parser.add_argument('--groupinfo', default=None, help='(createindex) groupinfo json file')
parser.add_argument('--indextemplate', default="templates/index.html", help='(createindex) path for template, default: templates/index.html')
parser.add_argument('--indextitle', default="etherpad archive & index", help='(createindex) title')
parser.add_argument('--indexcss', default="styles.css", help='(createindex) index: css url')
parser.add_argument('--output', default=None, help='(createindex) path for output (default stdout)')
args = parser.parse_args() args = parser.parse_args()
@ -153,6 +252,7 @@ if __name__ == "__main__":
print("WARNING: Error reading {0}, check the JSON syntax ({1})".format(args.padinfo, e)) print("WARNING: Error reading {0}, check the JSON syntax ({1})".format(args.padinfo, e))
# allow explicit opts to override # allow explicit opts to override
if args.hostname: if args.hostname:
padinfo['hostname'] = args.hostname padinfo['hostname'] = args.hostname
if args.port: if args.port:
@ -164,15 +264,16 @@ if __name__ == "__main__":
if args.apiurl: if args.apiurl:
padinfo['apiurl'] = args.apiurl padinfo['apiurl'] = args.apiurl
# Construct the base API URL padserver = PadServer(
apiurl = "http://" + padinfo.get("hostname") hostname=padinfo.get("hostname"),
if padinfo.get("port"): port=padinfo.get("port"),
apiurl += ":{0}".format(padinfo['port']) apipath=padinfo.get("apiurl"),
apiurl += "{0}{1}/".format(padinfo['apiurl'], padinfo['apiversion']) apiversion=padinfo.get("apiversion"),
apikey = padinfo.get("apikey") apikey=padinfo.get("apikey")
)
if verbose: if verbose:
print ("Connecting to {0}".format(apiurl), file=sys.stderr) print ("Connecting to {0}".format(padserver.apiurl), file=sys.stderr)
############################### ###############################
# Command Dispatch # Command Dispatch
@ -180,16 +281,16 @@ if __name__ == "__main__":
cmd = args.command.lower() cmd = args.command.lower()
if cmd == "listpads": if cmd == "listpads":
padids = listAllPads(apiurl, apikey) padids = padserver.listAllPads()
if not args.human: if not args.lines:
json.dump(padids, sys.stdout) json.dump(padids, sys.stdout)
else: else:
for padid in padids: for padid in padids:
print(padid) print(padid)
elif cmd == "listgroups": elif cmd == "listgroups":
groupids = listAllGroups(apiurl, apikey) groupids = padserver.listAllGroups()
if not args.human: if not args.lines:
json.dump(groupids, sys.stdout) json.dump(groupids, sys.stdout)
else: else:
for gid in groupids: for gid in groupids:
@ -197,10 +298,65 @@ if __name__ == "__main__":
elif cmd == "dump": elif cmd == "dump":
start = time.time() start = time.time()
padids = listAllPads(apiurl, apikey) padids = padserver.listAllPads()
dumpPads(args.pubpath, args.grouppath, apiurl, apikey, padids, args.skip_existing) if args.limit:
padids = padids[:args.limit]
dumpPads(
padserver,
padids,
args.pubpath,
args.grouppath,
args.skip_existing,
template=args.template)
if verbose: if verbose:
print ("Completed in {0:0.0f} seconds".format(time.time()-start), file=sys.stderr) print ("Completed in {0:0.0f} seconds".format(time.time()-start), file=sys.stderr)
elif cmd == "createindex":
def get_pads(groupinfo=None):
pads = padids_from_path(args.pubpath)
print (("padids_from_path", args.pubpath, pads), file=sys.stderr)
if not args.exclude_groups and os.path.exists(args.grouppath):
groups = [os.path.join(args.grouppath, x) for x in os.listdir(args.grouppath)]
groups = [x for x in groups if os.path.isdir(x)]
groups.sort()
for gp in groups:
if groupinfo:
b = os.path.basename(gp)
if b not in groupinfo:
continue
try:
pads.extend(padids_from_path(gp))
except OSError:
pass
return pads
groupinfo = None
if args.groupinfo:
with open(args.groupinfo) as gif:
groupinfo = json.load(gif)
pads = get_pads(groupinfo)
padids = [(x.get("pad_name").lower(), x) for x in pads]
padids.sort()
pads = [x[1] for x in padids]
out = sys.stdout
if args.output:
out = open(args.output, "w")
import jinja2
with open(args.indextemplate) as f:
template = jinja2.Template(f.read().decode("utf-8"))
out.write(template.render(
title=args.indextitle,
css=args.indexcss,
pads = pads
))
if args.output:
output.close()
else: else:
print ("Command '{0}' not understood, try: listallpads, listallgroups, dumpallpads".format(args.command), file=sys.stderr) print ("Command '{0}' not understood, try: listpads, listgroups, dump".format(args.command), file=sys.stderr)