etherpump/etherdump/commands/pull.py


								#!/usr/bin/env python

								from __future__ import print_function

								from argparse import ArgumentParser

								import sys, json, re, os

								from datetime import datetime

								from urllib import urlencode, quote

								from urllib2 import HTTPError

								from common import *

								from time import sleep

								from html5tidy import html5tidy

								import html5lib

								from xml.etree import ElementTree as ET

								# debugging

								# import ElementTree as ET


								"""

								pull(meta):

								    Update metadata files for the pads that have changed.

								    Check for changed pads by comparing the server's revision count with the one stored in the existing .meta.json file.


								todo...

								use/prefer public interfaces ? (export functions)


								"""


								def try_deleting (files):
								    """Remove every path in ``files``, ignoring the ones that cannot be removed.

								    files -- iterable of file paths handed to ``os.remove`` one by one.

								    An ``OSError`` raised for a path is swallowed so the remaining paths are
								    still attempted.  Returns None.
								    """
								    for path in files:
								        try:
								            os.remove(path)
								        except OSError:
								            # Best effort only: move on to the next path.
								            continue


								def main (args):
								    """Download the pads that changed since the last sync.

								    Parses ``args`` with an ArgumentParser, loads the API settings from the
								    ``--padinfo`` file and walks over the requested padids (or over every id
								    returned by ``listAllPads`` when none is given).  A pad whose revision
								    count equals the one stored in its existing ``.meta.json`` is skipped
								    unless ``--force`` is given; a pad with zero revisions is skipped unless
								    ``--zerorevs`` is given.  For each remaining pad the files selected by
								    ``--meta``, ``--text``, ``--html``, ``--dhtml`` or ``--all`` are written
								    using the path returned by ``padpath`` as base name.

								    args -- list of argument strings handed to ``ArgumentParser.parse_args``.

								    Returns None.  Padids of processed pads are printed on stdout when
								    ``--output`` is given; the final summary is printed on stderr.
								    """
								    parser = ArgumentParser("Check for pads that have changed since last sync (according to .meta.json)")
								    parser.add_argument("padid", nargs="*", default=[])
								    parser.add_argument("--padinfo", default=".etherdump/settings.json", help="settings, default: .etherdump/settings.json")
								    parser.add_argument("--zerorevs", default=False, action="store_true", help="include pads with zero revisions, default: False (i.e. pads with no revisions are skipped)")
								    parser.add_argument("--pub", default="p", help="folder to store files for public pads, default: p")
								    parser.add_argument("--group", default="g", help="folder to store files for group pads, default: g")
								    parser.add_argument("--skip", default=None, type=int, help="skip this many items, default: None")
								    parser.add_argument("--meta", default=False, action="store_true", help="download meta to PADID.meta.json, default: False")
								    parser.add_argument("--text", default=False, action="store_true", help="download text to PADID.txt, default: False")
								    parser.add_argument("--html", default=False, action="store_true", help="download html to PADID.html, default: False")
								    parser.add_argument("--dhtml", default=False, action="store_true", help="download dhtml to PADID.dhtml, default: False")
								    parser.add_argument("--all", default=False, action="store_true", help="download all files (meta, text, html, dhtml), default: False")
								    parser.add_argument("--folder", default=False, action="store_true", help="dump files in a folder named PADID (meta, text, html, dhtml), default: False")
								    parser.add_argument("--output", default=False, action="store_true", help="output changed padids on stdout")
								    parser.add_argument("--force", default=False, action="store_true", help="reload, even if revisions count matches previous")

								    parser.add_argument("--nopublish", default="__NOPUBLISH__", help="no publish magic word, default: __NOPUBLISH__")

								    args = parser.parse_args(args)

								    info = loadpadinfo(args.padinfo)
								    # Query parameters shared by every API request; 'padID' is replaced per pad.
								    data = {}
								    data['apikey'] = info['apikey']

								    if args.padid:
								        padids = args.padid
								    else:
								        padids = getjson(info['apiurl']+'listAllPads?'+urlencode(data))['data']['padIDs']
								    padids.sort()
								    numpads = len(padids)

								    # Base URL of the pads themselves, derived from the API URL by replacing
								    # the trailing "api/<version>/" with "p/".  Any dotted numeric version is
								    # accepted (not only 1.2.9) and the value is computed once because it does
								    # not depend on the pad.
								    padurlbase = re.sub(r"api/\d+(?:\.\d+)*/$", "p/", info["apiurl"])
								    if isinstance(padurlbase, unicode):
								        padurlbase = padurlbase.encode("utf-8")

								    count = 0
								    for i, padid in enumerate(padids):
								        if args.skip is not None and i < args.skip:
								            continue
								        progressbar(i, numpads, padid)

								        data['padID'] = padid.encode("utf-8")
								        # Base name (without extension) of every file written for this pad.
								        pathbase = padpath(padid, args.pub, args.group)
								        if args.folder:
								            pathbase = os.path.join(pathbase, padid.encode("utf-8"))

								        metapath = pathbase + ".meta.json"
								        revisions = None
								        tries = 1
								        skip = False
								        meta = {}
								        # Collect the metadata; an HTTPError restarts the whole attempt, and
								        # the pad is given up once the attempt counter exceeds 3.
								        while True:
								            try:
								                if os.path.exists(metapath):
								                    with open(metapath) as f:
								                        meta.update(json.load(f))
								                    revisions = getjson(info['apiurl']+'getRevisionsCount?'+urlencode(data))['data']['revisions']
								                    # .get(): a meta file without a stored revision count is
								                    # treated as changed instead of raising KeyError.
								                    if meta.get('revisions') == revisions and not args.force:
								                        skip = True
								                        break

								                meta['padid'] = padid.encode("utf-8")
								                versions = meta["versions"] = []
								                versions.append({
								                    "url": padurlbase + quote(padid.encode("utf-8")), # this quote was really important for dealing with rogue chars like \xa0 in a padid;
								                    "type": "pad",
								                    "code": 200
								                })

								                if revisions is None:
								                    meta['revisions'] = getjson(info['apiurl']+'getRevisionsCount?'+urlencode(data))['data']['revisions']
								                else:
								                    meta['revisions'] = revisions

								                if (meta['revisions'] == 0) and (not args.zerorevs):
								                    # print("Skipping zero revs", file=sys.stderr)
								                    skip = True
								                    break

								                # todo: load more metadata!
								                meta['group'], meta['pad'] = splitpadname(padid)
								                meta['pathbase'] = pathbase
								                meta['lastedited_raw'] = int(getjson(info['apiurl']+'getLastEdited?'+urlencode(data))['data']['lastEdited'])
								                # The raw value is treated as milliseconds; floor-divide to
								                # get the whole seconds expected by fromtimestamp.
								                meta['lastedited_iso'] = datetime.fromtimestamp(int(meta['lastedited_raw']) // 1000).isoformat()
								                meta['author_ids'] = getjson(info['apiurl']+'listAuthorsOfPad?'+urlencode(data))['data']['authorIDs']
								                break
								            except HTTPError:
								                tries += 1
								                if tries > 3:
								                    # Unicode format string: a byte-string format would raise
								                    # UnicodeEncodeError on Python 2 for a non-ASCII padid.
								                    print (u"Too many failures ({0}), skipping".format(padid).encode("utf-8"), file=sys.stderr)
								                    skip = True
								                    break
								                else:
								                    sleep(3)

								        if skip:
								            continue

								        count += 1

								        if args.output:
								            print (padid.encode("utf-8"))

								        if args.all or (args.meta or args.text or args.html or args.dhtml):
								            try:
								                os.makedirs(os.path.split(metapath)[0])
								            except OSError:
								                # Deliberately ignored (e.g. the folder already exists).
								                pass

								        if args.all or args.text:
								            text = getjson(info['apiurl']+'getText?'+urlencode(data))
								            ver = {"type": "text"}
								            versions.append(ver)
								            ver["code"] = text["_code"]
								            if text["_code"] == 200:
								                text = text['data']['text']

								                ##########################################
								                ## ENFORCE __NOPUBLISH__ MAGIC WORD
								                ##########################################
								                if args.nopublish and args.nopublish in text:
								                    # NEED TO PURGE ANY EXISTING DOCS
								                    try_deleting((pathbase+".raw.txt", pathbase+".raw.html", pathbase+".diff.html", pathbase+".meta.json"))
								                    continue

								                ver["path"] = pathbase+".raw.txt"
								                ver["url"] = quote(ver["path"])
								                with open(ver["path"], "w") as f:
								                    f.write(text.encode("utf-8"))
								                # once the content is settled, compute a hash
								                # and link it in the metadata!

								        # <link> entries handed to html5tidy for the generated HTML files.
								        # todo, make this process reflect which files actually were made
								        versionbaseurl = quote(padid.encode("utf-8"))
								        links = [
								            {"href":"/styles.css", "rel":"stylesheet"},
								            {"href":versions[0]["url"], "rel":"alternate", "type":"text/html", "title":"Etherpad"},
								            {"href":versionbaseurl+".raw.txt", "rel":"alternate", "type":"text/plain", "title":"Plain text"},
								            {"href":versionbaseurl+".raw.html", "rel":"alternate", "type":"text/html", "title":"HTML"},
								            {"href":versionbaseurl+".diff.html", "rel":"alternate", "type":"text/html", "title":"HTML with author colors"},
								            {"href":versionbaseurl+".meta.json", "rel":"alternate", "type":"application/json", "title":"Meta data"},
								            {"href":"/", "rel":"search", "type":"text/html", "title":"Index"},
								        ]

								        if args.all or args.dhtml:
								            # Use a copy so 'startRev' is sent with this request only and
								            # does not leak into the later requests made with ``data``.
								            diffdata = dict(data)
								            diffdata['startRev'] = "0"
								            html = getjson(info['apiurl']+'createDiffHTML?'+urlencode(diffdata))
								            ver = {"type": "diffhtml"}
								            versions.append(ver)
								            ver["code"] = html["_code"]
								            if html["_code"] == 200:
								                html = html['data']['html']
								                ver["path"] = pathbase+".diff.html"
								                ver["url"] = quote(ver["path"])
								                doc = html5lib.parse(html.encode("utf-8"), encoding="utf-8", namespaceHTMLElements=False)
								                html5tidy(doc, indent=True, title=padid, scripts="/versions.js", links=links)
								                with open(ver["path"], "w") as f:
								                    # f.write(html.encode("utf-8"))
								                    print(ET.tostring(doc, method="html", encoding="utf-8"), file=f)

								        # Process text, html, dhtml, all options
								        if args.all or args.html:
								            html = getjson(info['apiurl']+'getHTML?'+urlencode(data))
								            ver = {"type": "html"}
								            versions.append(ver)
								            ver["code"] = html["_code"]
								            if html["_code"] == 200:
								                html = html['data']['html']
								                ver["path"] = pathbase+".raw.html"
								                ver["url"] = quote(ver["path"])
								                doc = html5lib.parse(html, namespaceHTMLElements=False)
								                html5tidy(doc, indent=True, title=padid, scripts="/versions.js", links=links)
								                with open(ver["path"], "w") as f:
								                    # f.write(html.encode("utf-8"))
								                    print (ET.tostring(doc, method="html", encoding="utf-8"), file=f)

								        # output meta
								        if args.all or args.meta:
								            ver = {"type": "meta"}
								            versions.append(ver)
								            ver["path"] = metapath
								            ver["url"] = quote(metapath)
								            with open(metapath, "w") as f:
								                json.dump(meta, f, indent=2)

								    print("\n{0} pad(s) loaded".format(count), file=sys.stderr)