#!/usr/bin/python
# Command-line helper for an Etherpad server's HTTP API.
#
# Commands (first positional argument):
#   listpads    print every pad ID (JSON, or one per line with --human)
#   listgroups  print every group ID (JSON, or one per line with --human)
#   dump        save text, HTML and metadata of every pad to disk
#
# Connection settings are read from a JSON file (--padinfo) and can be
# overridden by the explicit command-line options.
from __future__ import print_function

import argparse
import json
import os
import re
import sys
import time

try:  # Python 2 module layout
    import urllib2
    import urllib
    from urllib2 import urlopen, HTTPError, URLError
    from urllib import urlencode
    from urlparse import urljoin
except ImportError:  # Python 3 module layout
    from urllib.request import urlopen
    from urllib.error import HTTPError, URLError
    from urllib.parse import urlencode, urljoin

try:
    # Nothing in this file uses html5lib; the import is kept for
    # compatibility but made optional so the script runs without it.
    import html5lib
except ImportError:
    html5lib = None

PADINFO_DEFAULTS = {
    "hostname": "",
    "apiversion": "1.2.9",
    "apiurl": "/api/"
}

# Module-wide debug switch; main() sets it from --verbose.
verbose = False

# Group pads are named "g.<groupID>$<padName>".
_GROUP_PAD_RE = re.compile(r"g\.(\w+)\$(.+)$")

# Text type on both Python 2 (unicode) and Python 3 (str).
_TEXT_TYPE = type(u"")

# Failures that make dumpPads skip one export instead of aborting the dump:
# network/HTTP errors, invalid JSON (ValueError), and responses whose
# 'data' member is null (TypeError) or lacks the expected key (KeyError).
_FETCH_ERRORS = (TypeError, KeyError, HTTPError, URLError, ValueError)


def _log(message):
    """Write *message* to stderr, UTF-8 encoding Python 2 unicode first."""
    if not isinstance(message, str):
        message = message.encode("utf-8")
    print(message, file=sys.stderr)


def _api_url(apiURL, function, apikey, padID=None):
    """Return the request URL for API *function*.

    A text *padID* is UTF-8 encoded before quoting so that non-ASCII pad
    names work with Python 2's urlencode as well.
    """
    params = [('apikey', apikey)]
    if padID is not None:
        if isinstance(padID, _TEXT_TYPE):
            padID = padID.encode("utf-8")
        params.append(('padID', padID))
    return apiURL + function + '?' + urlencode(params)


def _fetch_json(url):
    """GET *url*, decode the body as UTF-8 JSON and close the connection."""
    response = urlopen(url)
    try:
        return json.loads(response.read().decode("utf-8"))
    finally:
        response.close()


def _ensure_dir(path):
    """Create directory *path* (including parents) if it does not exist."""
    # An empty path means "current directory": nothing to create.
    if path and not os.path.isdir(path):
        os.makedirs(path)


def listAllPads(apiURL, apikey):
    """Return the list of pad IDs reported by the server's listAllPads call."""
    url = _api_url(apiURL, 'listAllPads', apikey)
    if verbose:
        _log(url)
    return _fetch_json(url)['data']['padIDs']


def listAllGroups(apiURL, apikey):
    """Return the list of group IDs reported by the listAllGroups call."""
    url = _api_url(apiURL, 'listAllGroups', apikey)
    if verbose:
        _log(url)
    return _fetch_json(url)['data']['groupIDs']


def getPadText(padID, apiURL, apikey):
    """Return the 'text' member of the getText response for *padID*."""
    return _fetch_json(_api_url(apiURL, 'getText', apikey, padID))['data']['text']


def getPadHTML(padID, apiURL, apikey):
    """Return the 'html' member of the getHTML response for *padID*."""
    return _fetch_json(_api_url(apiURL, 'getHTML', apikey, padID))['data']['html']


def getPadLastEdited(padID, apiURL, apikey):
    """Return the 'lastEdited' member of the getLastEdited response."""
    # The previous implementation queried getHTML, whose response has no
    # 'lastEdited' member, so this function could never succeed.
    resp = _fetch_json(_api_url(apiURL, 'getLastEdited', apikey, padID))
    return resp['data']['lastEdited']


def pad_split_group(n):
    """Split pad ID *n* into ``(group_id, pad_name)``.

    IDs of the form ``g.<group>$<name>`` yield ``(group, name)``; any other
    ID yields ``('', n)``.
    """
    m = _GROUP_PAD_RE.match(n)
    if m:
        return m.groups()
    return ('', n)


def _save_export(path, url, field, label, padid):
    """Write ``response['data'][field]`` from *url* to *path* as UTF-8.

    Returns the number of bytes written, or None when the fetch failed (a
    warning naming *label* is logged). The file is created before the
    request is made, so a failed fetch leaves an empty file behind, as the
    original implementation did.
    """
    # Binary mode: the payload is already UTF-8 bytes on Python 2 and 3.
    with open(path, "wb") as f:
        try:
            payload = _fetch_json(url)['data'][field].encode("utf-8")
        except _FETCH_ERRORS as e:
            _log(u"Warning: unable to load {0} for pad {1}, {2}".format(label, padid, e))
            return None
        f.write(payload)
        return len(payload)


def dumpPads(pub_path, group_path, apiurl, apikey, padids, sleeptime=0.01, skip_existing=False):
    """Save every pad in *padids* to disk.

    Group pads go to ``<group_path>/<group_id>/<pad_name>``, all others to
    ``<pub_path>/<pad_name>``. For each pad three files are written:
    ``.utf8.txt`` (text), ``.utf8.html`` (HTML) and ``.json`` (metadata:
    padid, groupID, padname and, when available, lastEdited, text_length
    and html_length, the lengths being UTF-8 byte counts).

    sleeptime     -- seconds to pause after each pad (falsy: no pause)
    skip_existing -- skip pads whose ``.json`` file already exists

    A failed API request only produces a warning on stderr; the dump
    continues with the next export.
    """
    for padid in padids:
        group_id, pad_name = pad_split_group(padid)
        if group_id:
            target_dir = os.path.join(group_path, group_id)
        else:
            target_dir = pub_path
        _ensure_dir(target_dir)
        fp = os.path.join(target_dir, pad_name)

        if verbose:
            _log(u"Saving to {0}".format(fp))

        if skip_existing and os.path.exists(fp + ".json"):
            continue

        # Metadata
        meta = {
            'padid': padid,
            'groupID': group_id,
            'padname': pad_name
        }
        try:
            resp = _fetch_json(_api_url(apiurl, 'getLastEdited', apikey, padid))
            meta['lastEdited'] = resp['data']['lastEdited']
        except _FETCH_ERRORS as e:
            _log(u"Warning: unable to load last edited time for pad {0}, {1}".format(padid, e))

        # Text
        length = _save_export(fp + ".utf8.txt",
                              _api_url(apiurl, 'getText', apikey, padid),
                              'text', u"text", padid)
        if length is not None:
            meta['text_length'] = length

        # HTML
        length = _save_export(fp + ".utf8.html",
                              _api_url(apiurl, 'getHTML', apikey, padid),
                              'html', u"HTML", padid)
        if length is not None:
            meta['html_length'] = length

        with open(fp + ".json", "w") as f:
            f.write(json.dumps(meta))

        if sleeptime:
            time.sleep(sleeptime)


def main(argv=None):
    """Parse *argv* (default: sys.argv[1:]) and run the requested command."""
    global verbose

    parser = argparse.ArgumentParser()
    parser.add_argument('command', default="", help='command to perform: listpads, listgroups, dump')
    parser.add_argument('--padinfo', default="padinfo.json", help='path to JSON file containing all pad login data, default padinfo.json, alternatively specify hostname, port, etc as separate options')
    parser.add_argument('--hostname', default="", help='the hostname of the etherpad server')
    parser.add_argument('--port', type=int, help='port of etherpad server')
    parser.add_argument('--apikey', help='API key')
    parser.add_argument('--apiversion', help='the version of the etherpad api')
    parser.add_argument('--apiurl', help='URL path to the API')
    parser.add_argument('--verbose', action="store_true", default=False, help='debug mode, verbose output')
    parser.add_argument('--pubpath', default="pub", help='path to dump public pads')
    parser.add_argument('--grouppath', default="priv", help='path to dump group pads')
    parser.add_argument('--human', default=False, action="store_true", help='output for reading')
    parser.add_argument('--skip-existing', default=False, action="store_true", help='skip existing files on dump')
    args = parser.parse_args(argv)

    verbose = args.verbose

    # Work on a copy so the module-level defaults are never modified.
    padinfo = dict(PADINFO_DEFAULTS)
    if args.padinfo:
        try:
            with open(args.padinfo) as f:
                for key, value in json.load(f).items():
                    padinfo[key] = value
        except IOError as e:
            print("WARNING: Tried to open {0}, but couldn't ({1})".format(args.padinfo, e), file=sys.stderr)
        except ValueError as e:
            # stderr, so the warning cannot corrupt JSON written to stdout.
            print("WARNING: Error reading {0}, check the JSON syntax ({1})".format(args.padinfo, e), file=sys.stderr)

    # Allow explicit options to override the file.
    if args.hostname:
        padinfo['hostname'] = args.hostname
    if args.port:
        padinfo['port'] = args.port
    if args.apikey:
        padinfo['apikey'] = args.apikey
    if args.apiversion:
        padinfo['apiversion'] = args.apiversion
    if args.apiurl:
        padinfo['apiurl'] = args.apiurl

    # Construct the base API URL.
    apiurl = "http://" + (padinfo.get("hostname") or "")
    if padinfo.get("port"):
        apiurl += ":{0}".format(padinfo['port'])
    apiurl += "{0}{1}/".format(padinfo['apiurl'], padinfo['apiversion'])
    apikey = padinfo.get("apikey")

    if verbose:
        print("Connecting to {0}".format(apiurl), file=sys.stderr)

    ###############################
    # Command Dispatch
    ###############################

    cmd = args.command.lower()
    if cmd == "listpads":
        padids = listAllPads(apiurl, apikey)
        if not args.human:
            json.dump(padids, sys.stdout)
        else:
            for padid in padids:
                print(padid)

    elif cmd == "listgroups":
        groupids = listAllGroups(apiurl, apikey)
        if not args.human:
            json.dump(groupids, sys.stdout)
        else:
            for gid in groupids:
                print(gid)

    elif cmd == "dump":
        start = time.time()
        padids = listAllPads(apiurl, apikey)
        # skip_existing must be passed by keyword: the sixth positional
        # parameter of dumpPads is sleeptime.
        dumpPads(args.pubpath, args.grouppath, apiurl, apikey, padids,
                 skip_existing=args.skip_existing)
        if verbose:
            print("Completed in {0:0.0f} seconds".format(time.time() - start), file=sys.stderr)

    else:
        print("Command '{0}' not understood, try: listpads, listgroups, dump".format(args.command), file=sys.stderr)


if __name__ == "__main__":
    main()