From c406f812ac35fc0cb68307efc35ad17f8b653675 Mon Sep 17 00:00:00 2001
From: Michael Murtaugh
Date: Thu, 26 Feb 2015 17:15:41 +0100
Subject: [PATCH] working on index dumping

Wrap the etherpad HTTP API calls in a PadServer class, render dumped pads
through an optional jinja2 template, and add a "createindex" command that
builds an HTML index from the per-pad JSON metadata written by "dump".

---
 README.md |  11 +-
 etherdump | 296 +++++++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 236 insertions(+), 71 deletions(-)

diff --git a/README.md b/README.md
index c630c6c..d6c2c84 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,13 @@
 etherdump
 =========
 
-Tool to make archival dumps of etherpad pages.
+Tool to publish [etherpad](http://etherpad.org/) pages to (archival) HTML.
+
+
+Requirements
+-------------
+
+Python (2.7) with:
+
+* html5lib
+* jinja2
diff --git a/etherdump b/etherdump
index 576f4f0..1b3db4f 100755
--- a/etherdump
+++ b/etherdump
@@ -1,49 +1,22 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 
 from __future__ import print_function
 import sys, argparse, json, re, os, time
 from urllib2 import urlopen, HTTPError, URLError
 import html5lib, urllib2, urllib
+from xml.etree import ElementTree as ET
 from urllib import urlencode
 from urlparse import urljoin
+from datetime import datetime
 
 PADINFO_DEFAULTS = {
     "hostname": "",
     "apiversion": "1.2.9",
     "apiurl": "/api/"
 }
+
 verbose = False
 
-def listAllPads (apiURL, apikey):
-    data = {'apikey': apikey}
-    url = apiURL+'listAllPads?'+urlencode(data)
-    if verbose:
-        print (url, file=sys.stderr)
-    resp = json.load(urlopen(url))
-    return resp['data']['padIDs']
-
-def listAllGroups (apiURL, apikey):
-    data = {'apikey': apikey}
-    url = apiURL+'listAllGroups?'+urlencode(data)
-    if verbose:
-        print (url, file=sys.stderr)
-    resp = json.load(urlopen(url))
-    return resp['data']['groupIDs']
-
-def getPadText (padID, apiURL, apikey):
-    data = {'apikey': apikey, 'padID': padID}
-    resp = json.load(urlopen(apiURL+'getText?'+urlencode(data)))
-    return resp['data']['text']
-
-def getPadHTML (padID, apiURL, apikey):
-    data = {'apikey': apikey, 'padID': padID}
-    resp = json.load(urlopen(apiURL+'getHTML?'+urlencode(data)))
-    return resp['data']['html']
-
-def getPadLastEdited (padID, apiURL, apikey):
-    r = json.load(urlopen(apiURL+'getHTML?'+urlencode({'apikey': apikey, 'padID': padID})))
-    return r['data']['lastEdited']
-
 def pad_split_group (n):
     m = re.match(r"g\.(\w+)\$(.+)$", n)
     if m:
@@ -51,7 +24,62 @@ def pad_split_group (n):
     else:
         return ('', n)
 
-def dumpPads (pub_path, group_path, apiurl, apikey, padids, sleeptime=0.01, skip_existing=False):
+def content(tag):
+    if tag.text is None:  # element starts with a child: serialize children only
+        return u''.join(ET.tostring(e) for e in tag)
+    else:
+        return tag.text + u''.join(ET.tostring(e) for e in tag)
+
+class PadServer (object):
+    def __init__ (self, hostname, port=9001, apipath="/api/", apiversion="1.2.9", apikey=None, secure=False):
+        self.hostname = hostname
+        if secure:
+            self.protocol = "https"
+        else:
+            self.protocol = "http"
+
+        self.apiurl = self.protocol+"://"+hostname
+        if port:
+            self.apiurl += ":{0}".format(port)
+        self.apiurl += "{0}{1}/".format(apipath, apiversion)
+        self.apikey = apikey
+
+    def listAllPads (self):
+        data = {'apikey': self.apikey}
+        url = self.apiurl+'listAllPads?'+urlencode(data)
+        return json.load(urlopen(url))['data']['padIDs']
+
+    def listAllGroups (self):
+        data = {'apikey': self.apikey}
+        url = self.apiurl+'listAllGroups?'+urlencode(data)
+        return json.load(urlopen(url))['data']['groupIDs']
+
+    def getPadText (self, padID):
+        data = {'apikey': self.apikey, 'padID': padID.encode("utf-8")}  # urlencode cannot take non-ASCII unicode
+        return json.load(urlopen(self.apiurl+'getText?'+urlencode(data)))['data']['text']
+
+    def getPadHTML (self, padID):
+        data = {'apikey': self.apikey, 'padID': padID.encode("utf-8")}  # urlencode cannot take non-ASCII unicode
+        return json.load(urlopen(self.apiurl+'getHTML?'+urlencode(data)))['data']['html']
+
+    def getPadLastEdited (self, padID):
+        raw = json.load(urlopen(self.apiurl+'getLastEdited?'+urlencode({'apikey': self.apikey, 'padID': padID.encode("utf-8")})))['data']['lastEdited']
+        return datetime.fromtimestamp(int(raw)/1000)  # API value is divided by 1000 before conversion
+
+    def getPadURL (self, padID):
+        group, name = pad_split_group(padID)
+        if group:
+            return self.protocol+"://"+self.hostname+"/p/"+padID
+        else:
+            return self.protocol+"://"+self.hostname+"/public_pad/"+padID
+
+
+def dumpPads (padserver, padids, pub_path, group_path, sleeptime=0.01, skip_existing=False, template=None):
+    if template is not None:
+        import jinja2
+        with open(template) as f:
+            template = jinja2.Template(f.read().decode("utf-8"))
+
     for padid in padids:
         group_id, pad_name = pad_split_group(padid)
         if group_id:
@@ -80,39 +108,47 @@ def dumpPads (pub_path, group_path, apiurl, apikey, padids, sleeptime=0.01, skip
 
         # Write Metadata
         meta = {
-            'padid': padid,
-            'groupID': group_id,
-            'padname': pad_name
+            'pad_id': padid,
+            'group_id': group_id,
+            'pad_name': pad_name
         }
-        url = apiurl + "getLastEdited?" + urlencode({'apikey':apikey, 'padID':padid.encode("utf-8")})
-        try:
-            resp = json.load(urlopen(url))
-            meta['lastEdited'] = resp['data']['lastEdited']
-        except (TypeError, HTTPError, ValueError) as e:
-            print(u"Warning: unable to load text for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr)
+        meta['last_edited'] = padserver.getPadLastEdited(padid).isoformat()
 
         # Write Text
-        with open(fp+".utf8.txt", "w") as f:
-            url = apiurl + "getText?" + urlencode({'apikey':apikey, 'padID':padid.encode("utf-8")})
+        textpath = fp + ".txt"
+        with open(textpath, "w") as f:
             try:
-                resp = json.load(urlopen(url))
-                text = resp['data']['text'].encode("utf-8")
-                f.write(text)
+                text = padserver.getPadText(padid).encode("utf-8")  # bytes, so text_length below counts bytes
+                f.write(text)
+                meta['text_path'] = textpath
                 meta['text_length'] = len(text)
+                meta['text_length_human'] = humanize_bytes(meta['text_length'])
 
             except (TypeError, HTTPError, ValueError) as e:
                 print(u"Warning: unable to load text for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr)
 
-        with open(fp+ ".utf8.html", "w") as f:
-            url = apiurl + "getHTML?" + urlencode({'apikey':apikey, 'padID':padid.encode("utf-8")})
-            try:
-                resp = json.load(urlopen(url))
-                text = resp['data']['html'].encode("utf-8")
-                f.write(text)
-                meta['html_length'] = len(text)
+        htmlpath = fp+".html"
+        with open(htmlpath, "w") as f:
+            html = padserver.getPadHTML(padid)
+            meta['html_path'] = htmlpath
+            meta['html_length'] = len(html.encode("utf-8"))  # byte length, consistent with text_length
+            if template:
+                t = html5lib.parse(html, treebuilder="etree", namespaceHTMLElements=False)
+                body = t.find(".//body")
+                title = padid
+                editurl = padserver.getPadURL(padid)
+                meta['url'] = editurl
+                f.write(template.render(
+                    body=content(body),
+                    title=title,
+                    editurl=editurl,
+                    sourceurl=textpath,
+                    metadata_json=json.dumps(meta)).encode("utf-8"))  # render() returns unicode; encode before writing
+            else:
+                f.write(html.encode("utf-8"))
 
-            except (TypeError, HTTPError, ValueError) as e:
-                print(u"Warning: unable to load HTML for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr)
+            # except (TypeError, HTTPError, ValueError) as e:
+            #     print(u"Warning: unable to load HTML for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr)
 
         with open(fp+".json", "w") as f:
             f.write(json.dumps(meta))
@@ -120,6 +156,56 @@ def dumpPads (pub_path, group_path, apiurl, apikey, padids, sleeptime=0.01, skip
         if sleeptime:
             time.sleep(sleeptime)
 
+def humanize_bytes(bytes, precision=0):
+    """Return a humanized string representation of a number of bytes.
+
+    Divides as float, so `from __future__ import division` is not required.
+
+    >>> humanize_bytes(1)
+    '1 byte'
+    >>> humanize_bytes(1024)
+    '1 kB'
+    >>> humanize_bytes(1024*123)
+    '123 kB'
+    >>> humanize_bytes(1024*12342)
+    '12 MB'
+    >>> humanize_bytes(1024*12342,2)
+    '12.05 MB'
+    >>> humanize_bytes(1024*1234,2)
+    '1.21 MB'
+    >>> humanize_bytes(1024*1234*1111,2)
+    '1.31 GB'
+    >>> humanize_bytes(1024*1234*1111,1)
+    '1.3 GB'
+    """
+    abbrevs = (
+        (1<<50L, 'PB'),
+        (1<<40L, 'TB'),
+        (1<<30L, 'GB'),
+        (1<<20L, 'MB'),
+        (1<<10L, 'kB'),
+        (1, 'bytes')
+    )
+    if bytes == 1:
+        return '1 byte'
+    for factor, suffix in abbrevs:
+        if bytes >= factor:
+            break
+    return '%.*f %s' % (precision, float(bytes) / factor, suffix)
+
+def padids_from_path (path):
+    from glob import glob
+    inputs = glob(os.path.join(path, "*.json"))
+    inputs.sort()
+    pads = []
+    for fp in inputs:
+        with open(fp) as f:
+            info = json.load(f)
+            info['path'] = fp
+            pads.append(info)
+    return pads
+
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -135,8 +221,21 @@ if __name__ == "__main__":
     parser.add_argument('--verbose', action="store_true", default=False, help='debug mode, verbose output')
     parser.add_argument('--pubpath', default="pub", help='path to dump public pads')
     parser.add_argument('--grouppath', default="priv", help='path to dump group pads')
-    parser.add_argument('--human', default=False, action="store_true", help='output for reading')
+    parser.add_argument('--lines', default=False, action="store_true", help='output one per line instead of JSON')
     parser.add_argument('--skip-existing', default=False, action="store_true", help='skip existing files on dump')
+    parser.add_argument('--limit', default=None, type=int, help='(dump) stop after limit items')
+
+    # DUMP
+    parser.add_argument('--template', default="templates/pad.html", help='path for (dump) template, default: templates/pad.html')
+
+    # OPTIONS SPECIFIC TO CREATEINDEX
+    parser.add_argument('--exclude-groups', default=False, action="store_true", help='(createindex) ignore groups')
+    parser.add_argument('--groupinfo', default=None, help='(createindex) groupinfo json file')
+    parser.add_argument('--indextemplate', default="templates/index.html", help='(createindex) path for template, default: templates/index.html')
+    parser.add_argument('--indextitle', default="etherpad archive & index", help='(createindex) title')
+    parser.add_argument('--indexcss', default="styles.css", help='(createindex) index: css url')
+    parser.add_argument('--output', default=None, help='(createindex) path for output (default stdout)')
+
 
     args = parser.parse_args()
 
@@ -153,6 +252,7 @@ if __name__ == "__main__":
             print("WARNING: Error reading {0}, check the JSON syntax ({1})".format(args.padinfo, e))
 
     # allow explicit opts to override
+
     if args.hostname:
         padinfo['hostname'] = args.hostname
     if args.port:
@@ -164,15 +264,16 @@ if __name__ == "__main__":
     if args.apiurl:
         padinfo['apiurl'] = args.apiurl
 
-    # Construct the base API URL
-    apiurl = "http://" + padinfo.get("hostname")
-    if padinfo.get("port"):
-        apiurl += ":{0}".format(padinfo['port'])
-    apiurl += "{0}{1}/".format(padinfo['apiurl'], padinfo['apiversion'])
-    apikey = padinfo.get("apikey")
+    padserver = PadServer(
+        hostname=padinfo.get("hostname"),
+        port=padinfo.get("port"),
+        apipath=padinfo.get("apiurl"),
+        apiversion=padinfo.get("apiversion"),
+        apikey=padinfo.get("apikey")
+    )
 
     if verbose:
-        print ("Connecting to {0}".format(apiurl), file=sys.stderr)
+        print ("Connecting to {0}".format(padserver.apiurl), file=sys.stderr)
 
     ###############################
     # Command Dispatch
@@ -180,16 +281,16 @@ if __name__ == "__main__":
 
     cmd = args.command.lower()
     if cmd == "listpads":
-        padids = listAllPads(apiurl, apikey)
-        if not args.human:
+        padids = padserver.listAllPads()
+        if not args.lines:
             json.dump(padids, sys.stdout)
         else:
             for padid in padids:
                 print(padid)
 
     elif cmd == "listgroups":
-        groupids = listAllGroups(apiurl, apikey)
-        if not args.human:
+        groupids = padserver.listAllGroups()
+        if not args.lines:
             json.dump(groupids, sys.stdout)
         else:
             for gid in groupids:
@@ -197,10 +298,65 @@ if __name__ == "__main__":
 
     elif cmd == "dump":
         start = time.time()
-        padids = listAllPads(apiurl, apikey)
-        dumpPads(args.pubpath, args.grouppath, apiurl, apikey, padids, args.skip_existing)
+        padids = padserver.listAllPads()
+        if args.limit:
+            padids = padids[:args.limit]
+        dumpPads(
+            padserver,
+            padids,
+            args.pubpath,
+            args.grouppath,
+            skip_existing=args.skip_existing,  # keyword: the 5th positional slot is sleeptime
+            template=args.template)
         if verbose:
             print ("Completed in {0:0.0f} seconds".format(time.time()-start), file=sys.stderr)
 
+    elif cmd == "createindex":
+
+        def get_pads(groupinfo=None):
+            pads = padids_from_path(args.pubpath)
+            print (("padids_from_path", args.pubpath, pads), file=sys.stderr)
+            if not args.exclude_groups and os.path.exists(args.grouppath):
+                groups = [os.path.join(args.grouppath, x) for x in os.listdir(args.grouppath)]
+                groups = [x for x in groups if os.path.isdir(x)]
+                groups.sort()
+                for gp in groups:
+                    if groupinfo:
+                        b = os.path.basename(gp)
+                        if b not in groupinfo:
+                            continue
+                    try:
+                        pads.extend(padids_from_path(gp))
+                    except OSError:
+                        pass
+            return pads
+
+        groupinfo = None
+        if args.groupinfo:
+            with open(args.groupinfo) as gif:
+                groupinfo = json.load(gif)
+
+        pads = get_pads(groupinfo)
+        padids = [(x.get("pad_name").lower(), x) for x in pads]
+        padids.sort()
+        pads = [x[1] for x in padids]
+
+        out = sys.stdout
+        if args.output:
+            out = open(args.output, "w")
+
+        import jinja2
+        with open(args.indextemplate) as f:
+            template = jinja2.Template(f.read().decode("utf-8"))
+        out.write(template.render(
+            title=args.indextitle,
+            css=args.indexcss,
+            pads = pads
+        ).encode("utf-8"))  # render() returns unicode; encode before writing
+
+        if args.output:
+            out.close()
+
+
     else:
-        print ("Command '{0}' not understood, try: listallpads, listallgroups, dumpallpads".format(args.command), file=sys.stderr)
+        print ("Command '{0}' not understood, try: listpads, listgroups, dump, createindex".format(args.command), file=sys.stderr)