etherpump/etherdump

#!/usr/bin/env python

from __future__ import print_function
import sys, argparse, json, re, os, time
from urllib2 import urlopen, HTTPError, URLError
import html5lib, urllib2, urllib
from xml.etree import ElementTree as ET
from urllib import urlencode
from urlparse import urljoin
from datetime import datetime

# Fallback connection settings. In the __main__ section, values read from the
# padinfo JSON file and explicit command-line options are layered on top of
# these before the PadServer is constructed.
PADINFO_DEFAULTS = {
    "hostname": "",
    "apiversion": "1.2.9",
    "apiurl": "/api/"
}

# Module-wide flag: when true, progress messages are printed to stderr.
# It is set from the --verbose command-line option.
verbose = False

def pad_split_group (n):
    """Split a pad id into a ``(group_id, pad_name)`` pair.

    Ids shaped like ``g.<group>$<name>`` give back the group part (without
    the leading ``g.``) and the pad name.  Any other id is treated as a pad
    without a group and comes back as ``('', n)``.
    """
    match = re.match(r"g\.(\w+)\$(.+)$", n)
    return match.groups() if match else ('', n)

def content(tag):
    """Return the inner markup of an ElementTree element.

    The result is the element's leading text (if any) followed by the
    serialization of every child element, i.e. everything between the
    element's own opening and closing tags.
    """
    children = u''.join(ET.tostring(child) for child in tag)
    if tag.text is None:
        return children
    return tag.text + children

class PadServer (object):
    """Small client for the Etherpad HTTP API.

    The versioned API base URL is assembled once in the constructor; the
    remaining methods are thin wrappers around individual API functions.
    """

    def __init__ (self, hostname, port=9001, apipath="/api/", apiversion="1.2.9", apikey=None, secure=False):
        """Remember the connection settings and build ``self.apiurl``.

        A falsy ``port`` (None or 0) leaves the port out of the URL.
        ``secure`` selects https instead of http.
        """
        self.hostname = hostname
        self.protocol = "https" if secure else "http"

        base = self.protocol+"://"+hostname
        if port:
            base += ":{0}".format(port)
        self.apiurl = base + "{0}{1}/".format(apipath, apiversion)
        self.apikey = apikey

    def _api_data (self, function, **extra):
        """Call API ``function`` and return the ``data`` member of the JSON reply.

        The API key is always sent; ``extra`` supplies further query parameters.
        """
        query = {'apikey': self.apikey}
        query.update(extra)
        response = urlopen(self.apiurl+function+'?'+urlencode(query))
        return json.load(response)['data']

    def listAllPads (self):
        """Return the list of pad ids reported by the server."""
        return self._api_data('listAllPads')['padIDs']

    def listAllGroups (self):
        """Return the list of group ids reported by the server."""
        return self._api_data('listAllGroups')['groupIDs']

    def getPadText (self, padID):
        """Return the text of pad ``padID``."""
        return self._api_data('getText', padID=padID)['text']

    def getPadHTML (self, padID):
        """Return the HTML export of pad ``padID``."""
        return self._api_data('getHTML', padID=padID)['html']

    def getPadLastEdited (self, padID):
        """Return the pad's last-edited time as a naive local ``datetime``.

        The reported value is divided by 1000 before conversion, i.e. it is
        treated as milliseconds since the epoch.
        """
        millis = self._api_data('getLastEdited', padID=padID)['lastEdited']
        return datetime.fromtimestamp(int(millis)/1000)

    def getPadURL (self, padID):
        """Return the browser URL for ``padID``.

        Group pads live under ``/p/``, all other pads under ``/public_pad/``.
        Note that the URL is built from protocol and hostname only; the API
        port is not included.
        """
        group, _name = pad_split_group(padID)
        section = "/p/" if group else "/public_pad/"
        return self.protocol+"://"+self.hostname+section+padID


def _ensure_dir (path):
    """Create directory `path`, including missing parents, unless it exists."""
    # An empty path means "current directory"; nothing to create.
    if not path or os.path.isdir(path):
        return
    try:
        os.makedirs(path)
    except OSError:
        # Tolerate losing a creation race; anything else (permissions, a
        # regular file in the way, ...) is a real error and must surface.
        if not os.path.isdir(path):
            raise


def dumpPads (padserver, padids, pub_path, group_path, sleeptime=0.01, skip_existing=False, template=None):
    """Dump each pad in `padids` to disk as .txt, .html and .json files.

    Pads whose id carries a group go to ``<group_path>/<group_id>/<pad_name>``,
    all others to ``<pub_path>/<pad_name>``; the three files share that base
    path and differ only in extension.

    Parameters:
        padserver: object providing getPadLastEdited, getPadText, getPadHTML
            and getPadURL (e.g. a PadServer).
        padids: iterable of pad ids to dump.
        pub_path: directory for pads without a group.
        group_path: parent directory for per-group directories.
        sleeptime: seconds to pause after each pad; falsy disables the pause.
        skip_existing: when true, pads whose .json file already exists are
            skipped.
        template: optional path to a UTF-8 Jinja2 template used to wrap the
            pad HTML; when None the HTML is written as returned by the server.

    A failure to fetch the pad text is reported on stderr and the dump
    continues; failures fetching metadata or HTML are not caught here.
    """
    if template is not None:
        import jinja2
        with open(template) as f:
            template = jinja2.Template(f.read().decode("utf-8"))

    for padid in padids:
        group_id, pad_name = pad_split_group(padid)
        if group_id:
            target_dir = os.path.join(group_path, group_id)
        else:
            target_dir = pub_path
        _ensure_dir(target_dir)
        fp = os.path.join(target_dir, pad_name)

        if verbose:
            print (u"Saving to {0}".format(fp).encode("utf-8"), file=sys.stderr)

        # The .json file is written last, so its presence marks a finished dump.
        if skip_existing and os.path.exists(fp+".json"):
            continue

        # Collect metadata
        meta = {
            'pad_id': padid,
            'group_id': group_id,
            'pad_name': pad_name
        }
        meta['last_edited'] = padserver.getPadLastEdited(padid).isoformat()

        # Write text. The file is created before fetching so that the
        # template's source link always points at an existing file, even
        # when the fetch fails.
        textpath = fp + ".txt"
        with open(textpath, "w") as f:
            try:
                text = padserver.getPadText(padid)
                f.write(text.encode("utf-8"))
                meta['text_path'] = textpath
                meta['text_length'] = len(text)
                meta['text_length_human'] = humanize_bytes(meta['text_length'])

            except (TypeError, HTTPError, ValueError) as e:
                print(u"Warning: unable to load text for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr)

        # Write HTML, optionally wrapped in the template.
        htmlpath = fp+".html"
        with open(htmlpath, "w") as f:
            html = padserver.getPadHTML(padid)
            meta['html_path'] = htmlpath
            meta['html_length'] = len(html)
            if template:
                t = html5lib.parse(html, treebuilder="etree", namespaceHTMLElements=False)
                body = t.find(".//body")
                editurl = padserver.getPadURL(padid)
                meta['url'] = editurl
                rendered = template.render(
                    body=content(body),
                    title=padid,
                    editurl=editurl,
                    sourceurl=textpath,
                    metadata_json=json.dumps(meta))
                # render() yields unicode; encode explicitly, otherwise the
                # implicit ASCII conversion fails on any non-ASCII character.
                f.write(rendered.encode("utf-8"))
            else:
                f.write(html.encode("utf-8"))

        # Write metadata
        with open(fp+".json", "w") as f:
            f.write(json.dumps(meta))

        if sleeptime:
            time.sleep(sleeptime)

def humanize_bytes(bytes, precision=0):
    """Return a humanized string representation of a number of bytes.

    The value is scaled by the largest power-of-1024 unit that does not
    exceed it and formatted with `precision` decimal places.  Values below
    1 (including 0) are reported in bytes.

    >>> humanize_bytes(1)
    '1 byte'
    >>> humanize_bytes(0)
    '0 bytes'
    >>> humanize_bytes(1024)
    '1 kb'
    >>> humanize_bytes(1024*123)
    '123 kb'
    >>> humanize_bytes(1024*12342)
    '12 Mb'
    >>> humanize_bytes(1024*12342,2)
    '12.05 Mb'
    >>> humanize_bytes(1024*1234,2)
    '1.21 Mb'
    >>> humanize_bytes(1024*1234*1111,2)
    '1.31 Gb'
    >>> humanize_bytes(1024*1234*1111,1)
    '1.3 Gb'
    """
    abbrevs = (
        (1 << 50, 'Petabyte'),
        (1 << 40, 'Tb'),
        (1 << 30, 'Gb'),
        (1 << 20, 'Mb'),
        (1 << 10, 'kb'),
        (1, 'bytes')
    )
    if bytes == 1:
        return '1 byte'
    # Pick the first (largest) unit that fits; if none does, the loop ends on
    # the last entry and the value is reported in bytes.
    for factor, suffix in abbrevs:
        if bytes >= factor:
            break
    # float() forces true division regardless of any __future__ import, so
    # that `precision` actually yields fractional digits.
    return '%.*f %s' % (precision, float(bytes) / factor, suffix)

def padids_from_path (path):
    """Load every ``*.json`` file directly inside `path`.

    Returns a list with one parsed JSON object per file, ordered by file
    path; each object gets an extra ``'path'`` entry holding the path of
    the file it was read from.
    """
    from glob import glob
    pads = []
    for json_path in sorted(glob(os.path.join(path, "*.json"))):
        with open(json_path) as handle:
            record = json.load(handle)
            record['path'] = json_path
            pads.append(record)
    return pads


if __name__ == "__main__":
    # Command-line entry point: parse the options, merge the connection
    # settings (defaults < padinfo file < explicit options), build a
    # PadServer and dispatch on the requested command.
    parser = argparse.ArgumentParser()

    parser.add_argument('command', default="", help='command to perform: listpads, listgroups, dump, createindex')

    parser.add_argument('--padinfo', default="padinfo.json", help='path to JSON file containing all pad login data, default padinfo.json, alternatively specify hostname, port, etc as separate options')
    parser.add_argument('--hostname', default="", help='the hostname of the etherpad server')
    parser.add_argument('--port', type=int, help='port of etherpad server')
    parser.add_argument('--apikey', help='API key')
    parser.add_argument('--apiversion', help='the version of the etherpad api')
    parser.add_argument('--apiurl', help='URL path to the API')

    parser.add_argument('--verbose', action="store_true", default=False, help='debug mode, verbose output')
    parser.add_argument('--pubpath', default="pub", help='path to dump public pads')
    parser.add_argument('--grouppath', default="priv", help='path to dump group pads')
    parser.add_argument('--lines', default=False, action="store_true", help='output one per line instead of JSON')
    parser.add_argument('--skip-existing', default=False, action="store_true", help='skip existing files on dump')
    parser.add_argument('--limit', default=None, type=int, help='(dump) stop after limit items')

    # DUMP
    parser.add_argument('--template', default="templates/pad.html", help='path for (dump) template, default: templates/pad.html')

    # OPTIONS SPECIFIC TO CREATEINDEX
    parser.add_argument('--exclude-groups', default=False, action="store_true", help='(createindex) ignore groups')
    parser.add_argument('--groupinfo', default=None, help='(createindex) groupinfo json file')
    parser.add_argument('--indextemplate', default="templates/index.html", help='(createindex) path for template, default: templates/index.html')
    parser.add_argument('--indextitle', default="etherpad archive & index", help='(createindex) title')
    parser.add_argument('--indexcss', default="styles.css", help='(createindex) index: css url')
    parser.add_argument('--output', default=None, help='(createindex) path for output (default stdout)')


    args = parser.parse_args()

    verbose = args.verbose
    # Work on a copy so the module-level defaults are never mutated.
    padinfo = dict(PADINFO_DEFAULTS)
    if args.padinfo:
        try:
            with open(args.padinfo) as f:
                for key, value in json.load(f).items():
                    padinfo[key] = value
        except IOError as e:
            print("WARNING: Tried to open {0}, but couldn't ({1})".format(args.padinfo, e), file=sys.stderr)
        except ValueError as e:
            print("WARNING: Error reading {0}, check the JSON syntax ({1})".format(args.padinfo, e), file=sys.stderr)

    # allow explicit opts to override

    if args.hostname:
        padinfo['hostname'] = args.hostname
    if args.port:
        padinfo['port'] = args.port
    if args.apikey:
        padinfo['apikey'] = args.apikey
    if args.apiversion:
        padinfo['apiversion'] = args.apiversion
    if args.apiurl:
        padinfo['apiurl'] = args.apiurl

    # Settings that were never supplied come back as None from .get(); for
    # the port this means "leave the port out of the API URL".
    padserver = PadServer(
        hostname=padinfo.get("hostname"),
        port=padinfo.get("port"),
        apipath=padinfo.get("apiurl"),
        apiversion=padinfo.get("apiversion"),
        apikey=padinfo.get("apikey")
    )

    if verbose:
        print ("Connecting to {0}".format(padserver.apiurl), file=sys.stderr)

    ###############################
    # Command Dispatch
    ###############################

    cmd = args.command.lower()
    if cmd == "listpads":
        padids = padserver.listAllPads()
        if not args.lines:
            json.dump(padids, sys.stdout)
        else:
            for padid in padids:
                print(padid)

    elif cmd == "listgroups":
        groupids = padserver.listAllGroups()
        if not args.lines:
            json.dump(groupids, sys.stdout)
        else:
            for gid in groupids:
                print(gid)

    elif cmd == "dump":
        start = time.time()
        padids = padserver.listAllPads()
        if args.limit:
            padids = padids[:args.limit]
        # skip_existing must be passed by keyword: the fifth positional
        # parameter of dumpPads is sleeptime.
        dumpPads(
            padserver,
            padids,
            args.pubpath,
            args.grouppath,
            skip_existing=args.skip_existing,
            template=args.template)
        if verbose:
            print ("Completed in {0:0.0f} seconds".format(time.time()-start), file=sys.stderr)

    elif cmd == "createindex":

        def get_pads(groupinfo=None):
            """Collect dumped pad metadata from pubpath and the group directories.

            Group directories are skipped entirely with --exclude-groups;
            when `groupinfo` is given, only directories whose name appears
            in it are included.
            """
            pads = padids_from_path(args.pubpath)
            if verbose:
                print (("padids_from_path", args.pubpath, pads), file=sys.stderr)
            if not args.exclude_groups and os.path.exists(args.grouppath):
                groups = [os.path.join(args.grouppath, x) for x in os.listdir(args.grouppath)]
                groups = [x for x in groups if os.path.isdir(x)]
                groups.sort()
                for gp in groups:
                    if groupinfo:
                        b = os.path.basename(gp)
                        if b not in groupinfo:
                            continue
                    try:
                        pads.extend(padids_from_path(gp))
                    except OSError:
                        pass
            return pads

        groupinfo = None
        if args.groupinfo:
            with open(args.groupinfo) as gif:
                groupinfo = json.load(gif)

        pads = get_pads(groupinfo)
        # Case-insensitive ordering by pad name; entries without a name sort first.
        pads.sort(key=lambda pad: (pad.get("pad_name") or "").lower())

        # Render before opening the output so that a missing or broken
        # template does not truncate an existing index file.
        import jinja2
        with open(args.indextemplate) as f:
            template = jinja2.Template(f.read().decode("utf-8"))
        rendered = template.render(
            title=args.indextitle,
            css=args.indexcss,
            pads = pads
        )

        out = sys.stdout
        if args.output:
            out = open(args.output, "w")
        try:
            # render() yields unicode; encode explicitly so non-ASCII pad
            # names do not trigger an implicit ASCII conversion error.
            out.write(rendered.encode("utf-8"))
        finally:
            if args.output:
                out.close()


    else:
        print ("Command '{0}' not understood, try: listpads, listgroups, dump, createindex".format(args.command), file=sys.stderr)