#!/usr/bin/env python from __future__ import print_function import sys, argparse, json, re, os, time from urllib2 import urlopen, HTTPError, URLError import html5lib, urllib2, urllib from xml.etree import ElementTree as ET from urllib import urlencode from urlparse import urljoin from datetime import datetime PADINFO_DEFAULTS = { "hostname": "", "apiversion": "1.2.9", "apiurl": "/api/" } MODULE_PATH = (os.path.dirname(__file__)) TEMPLATES_PATH = os.path.join(MODULE_PATH, "templates") verbose = False def pad_split_group (n): m = re.match(r"g\.(\w+)\$(.+)$", n) if m: return m.groups() else: return ('', n) def content(tag): if tag.text == None: return u''.join(ET.tostring(e) for e in tag) else: return tag.text + u''.join(ET.tostring(e) for e in tag) class PadServer (object): def __init__ (self, hostname, port=9001, apipath="/api/", apiversion="1.2.9", apikey=None, secure=False): self.hostname = hostname if secure: self.protocol = "https" else: self.protocol = "http" self.apiurl = self.protocol+"://"+hostname if port: self.apiurl += ":{0}".format(port) self.apiurl += "{0}{1}/".format(apipath, apiversion) self.apikey = apikey def listAllPads (self): data = {'apikey': self.apikey} url = self.apiurl+'listAllPads?'+urlencode(data) return json.load(urlopen(url))['data']['padIDs'] def listAllGroups (self): data = {'apikey': self.apikey} url = self.apiurl+'listAllGroups?'+urlencode(data) return json.load(urlopen(url))['data']['groupIDs'] def getPadText (self, padID): data = {'apikey': self.apikey, 'padID': padID.encode("utf-8")} return json.load(urlopen(self.apiurl+'getText?'+urlencode(data)))['data']['text'] def getPadHTML (self, padID): data = {'apikey': self.apikey, 'padID': padID.encode("utf-8")} return json.load(urlopen(self.apiurl+'getHTML?'+urlencode(data)))['data']['html'] def getPadLastEdited (self, padID): data = {'apikey': self.apikey, 'padID': padID.encode("utf-8")} raw = json.load(urlopen(self.apiurl+'getLastEdited?'+urlencode(data)))['data']['lastEdited'] return datetime.fromtimestamp(int(raw)/1000) def getPadURL (self, padID): group, name = pad_split_group(padID) if group: return self.protocol+"://"+self.hostname+"/p/"+padID else: return self.protocol+"://"+self.hostname+"/public_pad/"+padID def get_template_env (tpath=None): import jinja2 paths = [] if tpath and os.path.isdir(tpath): paths.append(tpath) paths.append(TEMPLATES_PATH) loader = jinja2.FileSystemLoader(paths) env = jinja2.Environment(loader=loader) return env # template = env.get_template('pad.html') # print template.render(the='variables', go='here').encode("utf-8") def dumpPads (padserver, padids, outputpath, pub_path, group_path, sleeptime=0.01, force=False, templates=None): template_env = get_template_env(templates) pad_template = template_env.get_template("pad.html") numpads = len(padids) for i, padid in enumerate(padids): group_id, pad_name = pad_split_group(padid) if group_id: try: os.mkdir(os.path.join(outputpath, group_path)) except OSError: pass try: os.mkdir(os.path.join(outputpath, group_path, group_id)) except OSError: pass fp = os.path.join(outputpath, group_path, group_id, pad_name) else: try: os.mkdir(os.path.join(outputpath, pub_path)) except OSError: pass fp = os.path.join(outputpath, pub_path, pad_name) if verbose: print (u"Saving to {0}".format(fp).encode("utf-8"), file=sys.stderr) else: sys.stderr.write("\rDumping pads... [{0}/{1}]".format(i+1, numpads)) sys.stderr.flush() textpath = fp + ".txt" htmlpath = fp+".html" metapath = fp+".json" last_edited = padserver.getPadLastEdited(padid).isoformat() if os.path.exists(metapath): with open(metapath) as f: meta = json.load(f) if not force and meta.get("last_edited") == last_edited: if verbose: print("Up to date, skipping", file=sys.stderr) continue meta = { 'pad_id': padid, 'group_id': group_id, 'pad_name': pad_name } meta['last_edited'] = last_edited # Write Text with open(textpath, "w") as f: try: text = padserver.getPadText(padid) f.write(text.encode("utf-8")) meta['text_path'] = os.path.relpath(textpath, outputpath) meta['text_length'] = len(text) meta['text_length_human'] = humanize_bytes(meta['text_length']) except (TypeError, HTTPError, ValueError) as e: print(u"Warning: unable to load text for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr) with open(htmlpath, "w") as f: html = padserver.getPadHTML(padid) meta['html_path'] = os.path.relpath(htmlpath, outputpath) meta['html_length'] = len(html) if pad_template: t = html5lib.parse(html, treebuilder="etree", namespaceHTMLElements=False) body = t.find(".//body") title = padid editurl = padserver.getPadURL(padid) meta['url'] = editurl json_dump = json.dumps(meta) f.write(pad_template.render( body=content(body), title=title, editurl=editurl, sourceurl=textpath, metadata_json=json_dump).encode("utf-8")) # unicode error HERE! else: f.write(html.encode("utf-8")) # except (TypeError, HTTPError, ValueError) as e: # print(u"Warning: unable to load HTML for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr) with open(metapath, "w") as f: f.write(json.dumps(meta)) if sleeptime: time.sleep(sleeptime) if not verbose: sys.stderr.write("\rDumping pads... [{0}] \n".format(numpads)) sys.stderr.flush() def humanize_bytes(bytes, precision=0): """Return a humanized string representation of a number of bytes. Assumes `from __future__ import division`. >>> humanize_bytes(1) '1 byte' >>> humanize_bytes(1024) '1.0 kB' >>> humanize_bytes(1024*123) '123.0 kB' >>> humanize_bytes(1024*12342) '12.1 MB' >>> humanize_bytes(1024*12342,2) '12.05 MB' >>> humanize_bytes(1024*1234,2) '1.21 MB' >>> humanize_bytes(1024*1234*1111,2) '1.31 GB' >>> humanize_bytes(1024*1234*1111,1) '1.3 GB' """ abbrevs = ( (1<<50L, 'Petabyte'), (1<<40L, 'Tb'), (1<<30L, 'Gb'), (1<<20L, 'Mb'), (1<<10L, 'kb'), (1, 'bytes') ) if bytes == 1: return '1 byte' for factor, suffix in abbrevs: if bytes >= factor: break return '%.*f %s' % (precision, bytes / factor, suffix) def padids_from_path (path): from glob import glob inputs = glob(os.path.join(path, "*.json")) inputs.sort() pads = [] for fp in inputs: with open(fp) as f: info = json.load(f) info['path'] = fp pads.append(info) return pads if __name__ == "__main__": parser = argparse.ArgumentParser() # command parser.add_argument('command', default="", help='command to perform: listpads, listgroups, dump, createindex') # padinfo parser.add_argument('--padinfo', default="padinfo.json", help='path to JSON file containing all pad login data, default padinfo.json, alternatively specify hostname, port, etc as separate options') parser.add_argument('--hostname', default="", help='the hostname of the etherpad server') parser.add_argument('--port', type=int, help='port of etherpad server') parser.add_argument('--apikey', help='API key') parser.add_argument('--apiversion', help='the version of the etherpad api') parser.add_argument('--apiurl', help='URL path to the API') parser.add_argument('--verbose', action="store_true", default=False, help='debug mode, verbose output') parser.add_argument('--outputpath', default=os.getcwd(), help='path for output, default is .') parser.add_argument('--pubpath', default="pub", help='path to dump public pads') parser.add_argument('--grouppath', default="priv", help='path to dump group pads') parser.add_argument('--templates', default=os.path.join(os.getcwd(), "templates"), help='(addition) templates path, default: ./templates') # listpads/groups-specific parser.add_argument('--lines', default=False, action="store_true", help='(listpads/groups) output one per line instead of JSON') # dump-specific parser.add_argument('--force', default=False, action="store_true", help='(dump) force dump even if up to date') parser.add_argument('--skip', default=None, type=int, help='(dump) skip this many (start at index)') parser.add_argument('--limit', default=None, type=int, help='(dump) stop after limit items') # index-specific parser.add_argument('--title', default="etherpad index & archive", help='(index) title') parser.add_argument('--exclude-groups', default=False, action="store_true", help='(index) ignore groups') parser.add_argument('--groupinfo', default=None, help='(index) groupinfo json file') parser.add_argument('--output', default=None, help='(index) path for output (default stdout)') args = parser.parse_args() verbose = args.verbose padinfo = PADINFO_DEFAULTS if args.padinfo: try: with open(args.padinfo) as f: for key, value in json.load(f).items(): padinfo[key] = value except IOError, e: print("WARNING: Tried to open {0}, but couldn't ({1})".format(args.padinfo, e), file=sys.stderr) except ValueError, e: print("WARNING: Error reading {0}, check the JSON syntax ({1})".format(args.padinfo, e)) # allow explicit opts to override if args.hostname: padinfo['hostname'] = args.hostname if args.port: padinfo['port'] = args.port if args.apikey: padinfo['apikey'] = args.apikey if args.apiversion: padinfo['apiversion'] = args.apiversion if args.apiurl: padinfo['apiurl'] = args.apiurl padserver = PadServer( hostname=padinfo.get("hostname"), port=padinfo.get("port"), apipath=padinfo.get("apiurl"), apiversion=padinfo.get("apiversion"), apikey=padinfo.get("apikey") ) if verbose: print ("Connecting to {0}".format(padserver.apiurl), file=sys.stderr) ############################### # Command Dispatch ############################### cmd = args.command.lower() if cmd == "listpads": padids = padserver.listAllPads() if not args.lines: json.dump(padids, sys.stdout) else: for padid in padids: print(padid) elif cmd == "listgroups": groupids = padserver.listAllGroups() if not args.lines: json.dump(groupids, sys.stdout) else: for gid in groupids: print(gid) elif cmd == "dump": start = time.time() padids = padserver.listAllPads() if args.skip: padids = padids[args.skip:] if args.limit: padids = padids[:args.limit] dumpPads( padserver, padids, args.outputpath, args.pubpath, args.grouppath, force=args.force, templates=args.templates) if verbose: print ("Completed in {0:0.0f} seconds".format(time.time()-start), file=sys.stderr) elif cmd == "index": def augment_info(info, groupinfo): if info.get("last_edited") != None: dt = datetime.strptime( info.get("last_edited"), "%Y-%m-%dT%H:%M:%S" ) info['last_edited_parsed'] = dt info['last_edited_str'] = str(dt) if groupinfo: gid = info.get("group_id") if gid.startswith("g."): gid = gid[2:] if gid in groupinfo: info[u"group_name"] = groupinfo[gid].get("name") # print (info, file=sys.stderr) return info def get_pads(groupinfo=None): pads = padids_from_path(args.pubpath) pads = [augment_info(x, groupinfo) for x in pads] # print (("padids_from_path", args.pubpath, pads), file=sys.stderr) if not args.exclude_groups and os.path.exists(args.grouppath): groups = [os.path.join(args.grouppath, x) for x in os.listdir(args.grouppath)] groups = [x for x in groups if os.path.isdir(x)] groups.sort() for gp in groups: if groupinfo: b = os.path.basename(gp) if b not in groupinfo: continue try: pad_infos = padids_from_path(gp) pad_infos = [augment_info(x, groupinfo) for x in pad_infos] pads.extend(pad_infos) except OSError: pass return pads groupinfo = None if args.groupinfo: with open(args.groupinfo) as gif: groupinfo = json.load(gif) if verbose: print ("Using groupinfo", file=sys.stderr) pads = get_pads(groupinfo) padids = [(x.get("pad_name").lower(), x) for x in pads] padids.sort() pads = [x[1] for x in padids] out = sys.stdout if args.output: out = open(args.output, "w") import jinja2 env = get_template_env(args.templates) index_template = env.get_template("index.html") out.write(index_template.render( pads = pads, title = args.title, timestamp = datetime.now() ).encode("utf-8")) if args.output: out.close() else: print ("Command '{0}' not understood, try: listpads, listgroups, dump".format(args.command), file=sys.stderr)