#!/usr/bin/env python
from __future__ import print_function
from argparse import ArgumentParser
import sys , json , re , os
from datetime import datetime
from urllib import urlencode , quote
from urllib2 import HTTPError
from common import *
from time import sleep
from html5tidy import html5tidy
import html5lib
from xml . etree import ElementTree as ET
# debugging
# import ElementTree as ET
"""
pull ( meta ) :
Update meta data files for those that have changed .
Check for changed pads by looking at revisions & comparing to existing
todo . . .
use / prefer public interfaces ? ( export functions )
"""
def main(args):
    """Sync changed pads from an Etherpad server to local files.

    Parses command-line ``args``, lists pads (or uses the given padids),
    and for each pad whose revision count differs from the locally stored
    .meta.json (or always, with --force) downloads the requested artifacts:
    meta (PADID.meta.json), text (.raw.txt), html (.raw.html) and
    dhtml (.diff.html).  Prints a progress bar to stderr and, with
    --output, the changed padids to stdout.

    Network errors (HTTPError) are retried up to 3 times with a 3 second
    pause before the pad is skipped.
    """
    p = ArgumentParser("Check for pads that have changed since last sync (according to .meta.json)")
    p.add_argument("padid", nargs="*", default=[])
    p.add_argument("--padinfo", default=".etherdump/settings.json", help="settings, default: .etherdump/settings.json")
    p.add_argument("--zerorevs", default=False, action="store_true", help="include pads with zero revisions, default: False (i.e. pads with no revisions are skipped)")
    p.add_argument("--pub", default="p", help="folder to store files for public pads, default: p")
    p.add_argument("--group", default="g", help="folder to store files for group pads, default: g")
    p.add_argument("--skip", default=None, type=int, help="skip this many items, default: None")
    p.add_argument("--meta", default=False, action="store_true", help="download meta to PADID.meta.json, default: False")
    p.add_argument("--text", default=False, action="store_true", help="download text to PADID.txt, default: False")
    p.add_argument("--html", default=False, action="store_true", help="download html to PADID.html, default: False")
    p.add_argument("--dhtml", default=False, action="store_true", help="download dhtml to PADID.dhtml, default: False")
    p.add_argument("--all", default=False, action="store_true", help="download all files (meta, text, html, dhtml), default: False")
    p.add_argument("--folder", default=False, action="store_true", help="dump files in a folder named PADID (meta, text, html, dhtml), default: False")
    p.add_argument("--output", default=False, action="store_true", help="output changed padids on stdout")
    p.add_argument("--force", default=False, action="store_true", help="reload, even if revisions count matches previous")
    args = p.parse_args(args)

    info = loadpadinfo(args.padinfo)
    data = {}
    data['apikey'] = info['apikey']

    # Either sync the explicitly named pads, or ask the server for all of them.
    if args.padid:
        padids = args.padid
    else:
        padids = getjson(info['apiurl'] + 'listAllPads?' + urlencode(data))['data']['padIDs']
    padids.sort()
    numpads = len(padids)
    count = 0

    for i, padid in enumerate(padids):
        # TODO...
        """
        Self-contained documents and/or document receipts
        storing enough information to reconstruct (or understand an error occurred)
        """
        if args.skip is not None and i < args.skip:
            continue
        progressbar(i, numpads, padid)

        data['padID'] = padid.encode("utf-8")
        p = padpath(padid, args.pub, args.group)
        if args.folder:
            p = os.path.join(p, padid.encode("utf-8"))
        metapath = p + ".meta.json"

        revisions = None
        tries = 1
        skip = False
        # Derive the public pad URL base from the API URL.
        padurlbase = re.sub(r"api/1.2.9/$", "p/", info["apiurl"])
        meta = {}
        if isinstance(padurlbase, unicode):
            padurlbase = padurlbase.encode("utf-8")

        # Fetch the pad's meta data, retrying on HTTP errors (max 3 tries).
        while True:
            try:
                if os.path.exists(metapath):
                    with open(metapath) as f:
                        meta.update(json.load(f))
                    revisions = getjson(info['apiurl'] + 'getRevisionsCount?' + urlencode(data))['data']['revisions']
                    # Unchanged since last sync: nothing to do (unless --force).
                    if meta['revisions'] == revisions and not args.force:
                        skip = True
                        break

                ## TODO: OUTPUT TO DIRECTORIES with DATA EMBEDDED IN DOCUMENTS
                ## (or else in surrounding meta data!!)
                meta['padid'] = padid.encode("utf-8")
                versions = meta["versions"] = []
                versions.append({
                    # quote() is really important for dealing with rogue chars
                    # like \xa0 in a padid.
                    "url": padurlbase + quote(padid.encode("utf-8")),
                    "type": "pad",
                    "code": 200
                })

                if revisions is None:
                    meta['revisions'] = getjson(info['apiurl'] + 'getRevisionsCount?' + urlencode(data))['data']['revisions']
                else:
                    meta['revisions'] = revisions

                if (meta['revisions'] == 0) and (not args.zerorevs):
                    # print("Skipping zero revs", file=sys.stderr)
                    skip = True
                    break

                # todo: load more metadata!
                meta['group'], meta['pad'] = splitpadname(padid)
                meta['pathbase'] = p
                # lastEdited comes back in milliseconds since the epoch.
                meta['lastedited_raw'] = int(getjson(info['apiurl'] + 'getLastEdited?' + urlencode(data))['data']['lastEdited'])
                meta['lastedited_iso'] = datetime.fromtimestamp(int(meta['lastedited_raw']) / 1000).isoformat()
                meta['author_ids'] = getjson(info['apiurl'] + 'listAuthorsOfPad?' + urlencode(data))['data']['authorIDs']
                break
            except HTTPError:
                tries += 1
                if tries > 3:
                    print("Too many failures ({0}), skipping".format(padid).encode("utf-8"), file=sys.stderr)
                    skip = True
                    break
                else:
                    sleep(3)

        if skip:
            continue
        count += 1

        if args.output:
            print(padid.encode("utf-8"))

        # Make sure the target directory exists before writing any file.
        if args.all or (args.meta or args.text or args.html or args.dhtml):
            try:
                os.makedirs(os.path.split(metapath)[0])
            except OSError:
                # Already exists.
                pass

        if args.all or args.text:
            text = getjson(info['apiurl'] + 'getText?' + urlencode(data))
            ver = {"type": "text"}
            versions.append(ver)
            ver["code"] = text["_code"]
            if text["_code"] == 200:
                text = text['data']['text']
                ver["path"] = p + ".raw.txt"
                ver["url"] = quote(ver["path"])
                with open(ver["path"], "w") as f:
                    f.write(text.encode("utf-8"))
                # once the content is settled, compute a hash
                # and link it in the metadata!

        # <link> elements injected into the tidied HTML outputs.
        # todo: make this process reflect which files actually were made
        links = []
        links.append({"href": "/styles.css", "rel": "stylesheet"})
        versionbaseurl = quote(padid.encode("utf-8"))
        links.append({"href": versions[0]["url"], "rel": "alternate", "type": "text/html", "title": "Etherpad"})
        links.append({"href": versionbaseurl + ".raw.txt", "rel": "alternate", "type": "text/plain", "title": "Plain text"})
        links.append({"href": versionbaseurl + ".raw.html", "rel": "alternate", "type": "text/html", "title": "HTML"})
        links.append({"href": versionbaseurl + ".diff.html", "rel": "alternate", "type": "text/html", "title": "HTML with author colors"})
        links.append({"href": versionbaseurl + ".meta.json", "rel": "alternate", "type": "application/json", "title": "Meta data"})
        links.append({"href": "/", "rel": "search", "type": "text/html", "title": "Index"})

        if args.all or args.dhtml:
            data['startRev'] = "0"
            html = getjson(info['apiurl'] + 'createDiffHTML?' + urlencode(data))
            ver = {"type": "diffhtml"}
            versions.append(ver)
            ver["code"] = html["_code"]
            if html["_code"] == 200:
                html = html['data']['html']
                ver["path"] = p + ".diff.html"
                ver["url"] = quote(ver["path"])
                doc = html5lib.parse(html.encode("utf-8"), encoding="utf-8", namespaceHTMLElements=False)
                html5tidy(doc, indent=True, title=padid, scripts="/versions.js", links=links)
                with open(ver["path"], "w") as f:
                    # f.write(html.encode("utf-8"))
                    print(ET.tostring(doc, method="html", encoding="utf-8"), file=f)

        # Process text, html, dhtml, all options
        if args.all or args.html:
            html = getjson(info['apiurl'] + 'getHTML?' + urlencode(data))
            ver = {"type": "html"}
            versions.append(ver)
            ver["code"] = html["_code"]
            if html["_code"] == 200:
                html = html['data']['html']
                ver["path"] = p + ".raw.html"
                ver["url"] = quote(ver["path"])
                doc = html5lib.parse(html, namespaceHTMLElements=False)
                html5tidy(doc, indent=True, title=padid, scripts="/versions.js", links=links)
                with open(ver["path"], "w") as f:
                    # f.write(html.encode("utf-8"))
                    print(ET.tostring(doc, method="html", encoding="utf-8"), file=f)

        # output meta (written last so it records every version produced above)
        if args.all or args.meta:
            ver = {"type": "meta"}
            versions.append(ver)
            ver["path"] = metapath
            ver["url"] = quote(metapath)
            with open(metapath, "w") as f:
                json.dump(meta, f, indent=2)

    print("\n{0} pad(s) loaded".format(count), file=sys.stderr)