etherpump/etherdump/commands/pull.py
Michael Murtaugh 111ab6bfde merge
2016-01-08 12:10:58 +01:00

189 lines
7.6 KiB
Python

#!/usr/bin/env python
from __future__ import print_function
from argparse import ArgumentParser
import sys, json, re, os
from datetime import datetime
from urllib import urlencode, quote
from urllib2 import HTTPError
from common import *
from time import sleep
"""
pull(meta):
    Update the metadata files of the pads that have changed.
    Changed pads are detected by comparing a pad's current revision count
    with the count stored in its existing .meta.json file.

todo...
    use/prefer public interfaces? (export functions)
"""
def main (args):
    """Download pads whose revision count changed since the last sync.

    For every selected pad the stored ``<pathbase>.meta.json`` (if any) is
    loaded and its ``revisions`` value is compared with the count reported
    by the ``getRevisionsCount`` API call.  Unchanged pads are skipped unless
    ``--force`` is given; pads with zero revisions are skipped unless
    ``--zerorevs`` is given.  For the remaining pads the metadata is refreshed
    and, depending on the options, the html / text / diff-html exports and
    the metadata itself are written next to each other on disk.

    Args:
        args: list of command line arguments (without the program name),
            handed to ``ArgumentParser.parse_args``.

    Returns:
        None.  The number of processed pads is reported on stderr and, with
        ``--output``, the ids of the processed pads are printed on stdout.
    """
    # NOTE: the first positional parameter of ArgumentParser is ``prog``, so
    # this sentence is what shows up as the program name in usage output.
    parser = ArgumentParser("Check for pads that have changed since last sync (according to .meta.json)")
    parser.add_argument("padid", nargs="*", default=[])
    parser.add_argument("--padinfo", default=".etherdump/settings.json", help="settings, default: .etherdump/settings.json")
    parser.add_argument("--zerorevs", default=False, action="store_true", help="include pads with zero revisions, default: False (i.e. pads with no revisions are skipped)")
    parser.add_argument("--pub", default="p", help="folder to store files for public pads, default: p")
    parser.add_argument("--group", default="g", help="folder to store files for group pads, default: g")
    parser.add_argument("--skip", default=None, type=int, help="skip this many items, default: None")
    parser.add_argument("--meta", default=False, action="store_true", help="download meta to PADID.meta.json, default: False")
    parser.add_argument("--text", default=False, action="store_true", help="download text to PADID.txt, default: False")
    parser.add_argument("--html", default=False, action="store_true", help="download html to PADID.html, default: False")
    parser.add_argument("--dhtml", default=False, action="store_true", help="download dhtml to PADID.dhtml, default: False")
    parser.add_argument("--all", default=False, action="store_true", help="download all files (meta, text, html, dhtml), default: False")
    parser.add_argument("--folder", default=False, action="store_true", help="dump files in a folder named PADID (meta, text, html, dhtml), default: False")
    parser.add_argument("--output", default=False, action="store_true", help="output changed padids on stdout")
    parser.add_argument("--force", default=False, action="store_true", help="reload, even if revisions count matches previous")
    args = parser.parse_args(args)

    info = loadpadinfo(args.padinfo)
    apiurl = info['apiurl']
    # Query parameters shared by every API call; 'padID' is set per pad below.
    data = {}
    data['apikey'] = info['apikey']

    if args.padid:
        padids = args.padid
    else:
        padids = getjson(apiurl+'listAllPads?'+urlencode(data))['data']['padIDs']
    padids.sort()
    numpads = len(padids)

    # Derive the pad base URL (".../p/") from the API URL (".../api/<version>/").
    # Any dotted numeric version is accepted, not only 1.2.9, and the value is
    # the same for every pad, so it is computed once outside the loop.
    padurlbase = re.sub(r"api/\d+(?:\.\d+)*/$", "p/", apiurl)
    if isinstance(padurlbase, unicode):
        padurlbase = padurlbase.encode("utf-8")

    count = 0
    for i, padid in enumerate(padids):
        # TODO: self-contained documents and/or document receipts, storing
        # enough information to reconstruct (or understand an error occurred)
        if args.skip is not None and i < args.skip:
            continue
        progressbar(i, numpads, padid)

        padid_utf8 = padid.encode("utf-8")
        data['padID'] = padid_utf8
        pathbase = padpath(padid, args.pub, args.group)
        if args.folder:
            pathbase = os.path.join(pathbase, padid_utf8)
        metapath = pathbase + ".meta.json"
        revisions = None
        tries = 1
        skip = False
        meta = {}

        # Collect the metadata; an HTTPError restarts the whole collection,
        # and the pad is given up on after the third failure.
        while True:
            try:
                if os.path.exists(metapath):
                    with open(metapath) as f:
                        meta.update(json.load(f))
                    revisions = getjson(apiurl+'getRevisionsCount?'+urlencode(data))['data']['revisions']
                    # .get(): a stored meta file without a 'revisions' key is
                    # treated as changed instead of raising KeyError.
                    if meta.get('revisions') == revisions and not args.force:
                        skip = True
                        break

                ## TODO: OUTPUT TO DIRECTORIES with DATA EMBEDDED IN DOCUMENTS
                ## (or else in surrounding meta data!!)

                meta['padid'] = padid_utf8
                # The versions list is rebuilt from scratch on every pull.
                versions = meta["versions"] = []
                versions.append({
                    "url": padurlbase + padid_utf8,
                    "type": "pad",
                    "code": 200
                })

                # Only ask for the revision count if it was not already
                # fetched for the comparison above.
                if revisions is None:
                    meta['revisions'] = getjson(apiurl+'getRevisionsCount?'+urlencode(data))['data']['revisions']
                else:
                    meta['revisions'] = revisions

                if (meta['revisions'] == 0) and (not args.zerorevs):
                    skip = True
                    break

                # todo: load more metadata!
                meta['group'], meta['pad'] = splitpadname(padid)
                meta['pathbase'] = pathbase
                meta['lastedited_raw'] = int(getjson(apiurl+'getLastEdited?'+urlencode(data))['data']['lastEdited'])
                # The raw value is divided by 1000 before being interpreted
                # as a timestamp in seconds.
                meta['lastedited_iso'] = datetime.fromtimestamp(int(meta['lastedited_raw'])/1000).isoformat()
                meta['author_ids'] = getjson(apiurl+'listAuthorsOfPad?'+urlencode(data))['data']['authorIDs']
                break
            except HTTPError:
                tries += 1
                if tries > 3:
                    # Format with the already-encoded id: formatting a byte
                    # string with a unicode argument fails on non-ASCII ids.
                    print ("Too many failures ({0}), skipping".format(padid_utf8), file=sys.stderr)
                    skip = True
                    break
                else:
                    sleep(3)

        if skip:
            continue

        count += 1

        if args.output:
            print (padid_utf8)

        if args.all or (args.meta or args.text or args.html or args.dhtml):
            # Best effort: the target directory usually exists already.
            try:
                os.makedirs(os.path.split(metapath)[0])
            except OSError:
                pass

        # Process text, html, dhtml, all options
        if args.all or args.html:
            _download_version(apiurl, data, 'getHTML', 'html', "html", pathbase+".raw.html", versions)

        if args.all or args.text:
            _download_version(apiurl, data, 'getText', 'text', "text", pathbase+".raw.txt", versions)
            # once the content is settled, compute a hash
            # and link it in the metadata!

        if args.all or args.dhtml:
            # Work on a copy so that 'startRev' does not leak into the
            # requests made for the following pads.
            diffquery = dict(data)
            diffquery['startRev'] = "0"
            _download_version(apiurl, diffquery, 'createDiffHTML', 'html', "diffhtml", pathbase+".diff.html", versions)

        # output meta
        if args.all or args.meta:
            ver = {"type": "meta"}
            versions.append(ver)
            ver["path"] = metapath
            ver["url"] = quote(metapath)
            with open(metapath, "w") as f:
                json.dump(meta, f, indent=2)

    print("\n{0} pad(s) loaded".format(count), file=sys.stderr)


def _download_version(apiurl, query, call, field, vertype, path, versions):
    """Fetch one export of a pad, record it in *versions* and save it.

    Args:
        apiurl: base URL of the API, ending in a slash.
        query: dict of query parameters for the call.
        call: name of the API function, e.g. ``getHTML``.
        field: key under ``response['data']`` that holds the content.
        vertype: value stored as ``"type"`` of the version record.
        path: file the content is written to when the call succeeded.
        versions: list the new version record is appended to.

    Returns:
        The version record (dict) that was appended to *versions*.  It always
        carries ``type`` and ``code``; ``path`` and ``url`` are only set, and
        the file is only written, when the response's ``_code`` is 200.
    """
    response = getjson(apiurl + call + '?' + urlencode(query))
    ver = {"type": vertype}
    versions.append(ver)
    ver["code"] = response["_code"]
    if response["_code"] == 200:
        content = response['data'][field]
        ver["path"] = path
        ver["url"] = quote(path)
        with open(path, "w") as f:
            f.write(content.encode("utf-8"))
    return ver