#!/usr/bin/python
# Reconstructed from commit 771d76f67c42ab927c189fb3962e9119ee6e2869
# ("initial commit", Michael Murtaugh, Thu Feb 26 13:54:26 2015 +0100).
# The same commit also adds:
#   .gitignore : venv/  sites/  *.pyc  *~
#   README.md  : "etherdump" / "Tool to make archival dumps of etherpad pages."
"""etherdump: tool to make archival dumps of etherpad pages.

Commands (first positional argument, case-insensitive):

    listpads    print the IDs of all pads (JSON, or one per line with --human)
    listgroups  print the IDs of all groups (same output options)
    dump        save text, HTML and a JSON metadata file for every pad

Connection settings are read from a JSON file (``--padinfo``, default
``padinfo.json``) and can be overridden with explicit command-line options.
The script runs unchanged on Python 2 and Python 3.
"""

import argparse
import json
import os
import re
import sys
import time
from contextlib import closing

try:  # Python 3
    from urllib.error import HTTPError
    from urllib.error import URLError
    from urllib.parse import urlencode
    from urllib.parse import urljoin  # noqa: F401 -- imported by the original script
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlencode
    from urllib2 import HTTPError
    from urllib2 import URLError
    from urllib2 import urlopen
    from urlparse import urljoin  # noqa: F401 -- imported by the original script

try:
    # The original script imports html5lib without using it; keep the name
    # available but do not make the tool unusable when it is not installed.
    import html5lib  # noqa: F401
except ImportError:
    html5lib = None

PADINFO_DEFAULTS = {
    "hostname": "",
    "apiversion": "1.2.9",
    "apiurl": "/api/"
}
verbose = False

# Group pads are named "g.<groupID>$<padName>".
_GROUP_PAD_RE = re.compile(r"g\.(\w+)\$(.+)$")

# ``unicode`` on Python 2, ``str`` on Python 3.
_TEXT_TYPE = type(u"")

# Failures that make a single API request unusable but must not abort a
# whole dump: network errors, malformed JSON, or an unexpected reply shape
# (e.g. ``data`` being null or lacking the expected key).
_FETCH_ERRORS = (TypeError, KeyError, ValueError, HTTPError, URLError)

# (API method, field inside "data", file suffix, metadata key, label used
# in warnings) for every pad representation written by dumpPads.
_DUMP_FORMATS = (
    ("getText", "text", ".utf8.txt", "text_length", "text"),
    ("getHTML", "html", ".utf8.html", "html_length", "HTML"),
)


def _utf8(value):
    """Return *value* as UTF-8 bytes if it is text, otherwise unchanged."""
    if isinstance(value, _TEXT_TYPE):
        return value.encode("utf-8")
    return value


def _native(text):
    """Return *text* as the interpreter's native ``str`` type.

    On Python 2 this encodes ``unicode`` to UTF-8 bytes so that writing to a
    pipe with an ASCII default encoding cannot raise; on Python 3 text is
    already ``str`` and is returned as is.
    """
    if isinstance(text, str):
        return text
    return text.encode("utf-8")


def _emit(stream, text):
    """Write *text* followed by a newline to *stream*."""
    stream.write(_native(text))
    stream.write("\n")


def apiMethodURL(apiURL, method, apikey, **params):
    """Build the request URL for one API call.

    ``apiURL`` is the versioned base URL (ending in "/"), ``method`` the API
    function name; ``apikey`` and the keyword ``params`` become the query
    string. Text values are sent as UTF-8, and extra parameters are emitted
    in sorted order so the resulting URL is deterministic.
    """
    query = [("apikey", _utf8(apikey))]
    query.extend((name, _utf8(value)) for name, value in sorted(params.items()))
    return apiURL + method + "?" + urlencode(query)


def _fetch_json(url, announce=False):
    """GET *url* and return the decoded JSON reply.

    When *announce* is true and verbose mode is on, the URL is echoed to
    stderr first. The HTTP response is always closed.
    """
    if announce and verbose:
        _emit(sys.stderr, url)
    with closing(urlopen(url)) as response:
        return json.loads(response.read().decode("utf-8"))


def listAllPads(apiURL, apikey):
    """Return the list of all pad IDs reported by the server."""
    url = apiMethodURL(apiURL, "listAllPads", apikey)
    return _fetch_json(url, announce=True)['data']['padIDs']


def listAllGroups(apiURL, apikey):
    """Return the list of all group IDs reported by the server."""
    url = apiMethodURL(apiURL, "listAllGroups", apikey)
    return _fetch_json(url, announce=True)['data']['groupIDs']


def getPadText(padID, apiURL, apikey):
    """Return the plain text of pad *padID*."""
    url = apiMethodURL(apiURL, "getText", apikey, padID=padID)
    return _fetch_json(url)['data']['text']


def getPadHTML(padID, apiURL, apikey):
    """Return the HTML rendering of pad *padID*."""
    url = apiMethodURL(apiURL, "getHTML", apikey, padID=padID)
    return _fetch_json(url)['data']['html']


def getPadLastEdited(padID, apiURL, apikey):
    """Return the ``lastEdited`` value of pad *padID*.

    Uses the ``getLastEdited`` API function; the previous implementation
    queried ``getHTML``, whose reply is read elsewhere for its ``html``
    field, not ``lastEdited``.
    """
    url = apiMethodURL(apiURL, "getLastEdited", apikey, padID=padID)
    return _fetch_json(url)['data']['lastEdited']


def pad_split_group(n):
    """Split a pad ID into ``(group_id, pad_name)``.

    IDs of the form ``g.<group>$<name>`` yield ``(group, name)``; any other
    ID yields ``('', n)``.
    """
    m = _GROUP_PAD_RE.match(n)
    if m:
        return m.groups()
    return ('', n)


def _ensure_dir(path):
    """Create directory *path* (and parents) unless it already exists."""
    if not path:
        return  # empty path means "current directory"
    try:
        os.makedirs(path)
    except OSError:
        # Only "already exists" is harmless; anything else is a real error.
        if not os.path.isdir(path):
            raise


def _safe_filename(name):
    """Percent-encode path separators in *name*.

    Pad names come from the server; a separator in one must not be able to
    move the output file outside the dump directory. All other characters
    are left untouched so existing dumps keep their file names.
    """
    for sep in ("/", os.sep, os.altsep):
        if sep:
            name = name.replace(sep, "%{0:02X}".format(ord(sep)))
    return name


def dumpPads(pub_path, group_path, apiurl, apikey, padids, sleeptime=0.01, skip_existing=False):
    """Dump every pad in *padids* to disk.

    For each pad three files are written next to each other:
    ``<name>.utf8.txt`` (text), ``<name>.utf8.html`` (HTML) and
    ``<name>.json`` (metadata: padid, groupID, padname and, when available,
    lastEdited, text_length, html_length; lengths are in UTF-8 bytes).
    Group pads go to ``<group_path>/<group_id>/``, all others to
    ``<pub_path>/``.

    A failing request only produces a warning on stderr; the corresponding
    file is then not written and the dump continues with the next item.

    sleeptime     -- seconds to pause after each pad (falsy: no pause)
    skip_existing -- skip pads whose ``.json`` file already exists
    """
    for padid in padids:
        group_id, pad_name = pad_split_group(padid)
        if group_id:
            target_dir = os.path.join(group_path, group_id)
        else:
            target_dir = pub_path
        _ensure_dir(target_dir)
        fp = os.path.join(target_dir, _safe_filename(pad_name))

        if verbose:
            _emit(sys.stderr, u"Saving to {0}".format(fp))

        if skip_existing and os.path.exists(fp + ".json"):
            continue

        meta = {
            'padid': padid,
            'groupID': group_id,
            'padname': pad_name
        }

        try:
            meta['lastEdited'] = getPadLastEdited(padid, apiurl, apikey)
        except _FETCH_ERRORS as e:
            _emit(sys.stderr, u"Warning: unable to load last edited time for pad {0}, {1}".format(padid, e))

        for method, field, suffix, meta_key, label in _DUMP_FORMATS:
            url = apiMethodURL(apiurl, method, apikey, padID=padid)
            # Fetch before opening the file so that a failed request does
            # not leave an empty file behind.
            try:
                payload = _fetch_json(url)['data'][field].encode("utf-8")
            except _FETCH_ERRORS as e:
                _emit(sys.stderr, u"Warning: unable to load {0} for pad {1}, {2}".format(label, padid, e))
                continue
            with open(fp + suffix, "wb") as f:
                f.write(payload)
            meta[meta_key] = len(payload)

        with open(fp + ".json", "w") as f:
            f.write(json.dumps(meta))

        if sleeptime:
            time.sleep(sleeptime)


def _build_parser():
    """Return the command-line parser."""
    parser = argparse.ArgumentParser()

    parser.add_argument('command', default="", help='command to perform: listpads, listgroups, dump')

    parser.add_argument('--padinfo', default="padinfo.json", help='path to JSON file containing all pad login data, default padinfo.json, alternatively specify hostname, port, etc as separate options')
    parser.add_argument('--hostname', default="", help='the hostname of the etherpad server')
    parser.add_argument('--port', type=int, help='port of etherpad server')
    parser.add_argument('--apikey', help='API key')
    parser.add_argument('--apiversion', help='the version of the etherpad api')
    parser.add_argument('--apiurl', help='URL path to the API')

    parser.add_argument('--verbose', action="store_true", default=False, help='debug mode, verbose output')
    parser.add_argument('--pubpath', default="pub", help='path to dump public pads')
    parser.add_argument('--grouppath', default="priv", help='path to dump group pads')
    parser.add_argument('--human', default=False, action="store_true", help='output for reading')
    parser.add_argument('--skip-existing', default=False, action="store_true", help='skip existing files on dump')
    return parser


def _load_padinfo(args):
    """Merge defaults, the padinfo file and explicit options (in that order)."""
    # Copy, so the module-level defaults are never modified.
    padinfo = dict(PADINFO_DEFAULTS)
    if args.padinfo:
        try:
            with open(args.padinfo) as f:
                padinfo.update(json.load(f))
        except IOError as e:
            _emit(sys.stderr, "WARNING: Tried to open {0}, but couldn't ({1})".format(args.padinfo, e))
        except ValueError as e:
            # stderr, so that JSON written to stdout by listpads/listgroups
            # stays machine-readable.
            _emit(sys.stderr, "WARNING: Error reading {0}, check the JSON syntax ({1})".format(args.padinfo, e))

    # allow explicit opts to override
    for name in ("hostname", "port", "apikey", "apiversion", "apiurl"):
        value = getattr(args, name)
        if value:
            padinfo[name] = value
    return padinfo


def _build_api_url(padinfo):
    """Construct the versioned base API URL, e.g. http://host:port/api/1.2.9/."""
    apiurl = "http://" + (padinfo.get("hostname") or "")
    if padinfo.get("port"):
        apiurl += ":{0}".format(padinfo['port'])
    apiurl += "{0}{1}/".format(padinfo['apiurl'], padinfo['apiversion'])
    return apiurl


def _print_ids(ids, human):
    """Print *ids* as JSON, or one per line when *human* is true."""
    if not human:
        json.dump(ids, sys.stdout)
    else:
        for item in ids:
            _emit(sys.stdout, item)


def main(argv=None):
    """Run the command-line tool; return the process exit status.

    *argv* defaults to ``sys.argv[1:]``. Returns 0 on success and 2 when the
    command is not recognised.
    """
    global verbose

    args = _build_parser().parse_args(argv)
    verbose = args.verbose

    padinfo = _load_padinfo(args)
    apiurl = _build_api_url(padinfo)
    apikey = padinfo.get("apikey")

    if verbose:
        _emit(sys.stderr, "Connecting to {0}".format(apiurl))

    ###############################
    # Command Dispatch
    ###############################

    cmd = args.command.lower()
    if cmd == "listpads":
        _print_ids(listAllPads(apiurl, apikey), args.human)

    elif cmd == "listgroups":
        _print_ids(listAllGroups(apiurl, apikey), args.human)

    elif cmd == "dump":
        start = time.time()
        padids = listAllPads(apiurl, apikey)
        # skip_existing must be passed by keyword: positionally it would
        # land in the sleeptime parameter.
        dumpPads(args.pubpath, args.grouppath, apiurl, apikey, padids, skip_existing=args.skip_existing)
        if verbose:
            _emit(sys.stderr, "Completed in {0:0.0f} seconds".format(time.time() - start))

    else:
        _emit(sys.stderr, "Command '{0}' not understood, try: listpads, listgroups, dump".format(args.command))
        return 2

    return 0


if __name__ == "__main__":
    sys.exit(main())