diff --git a/README.md b/README.md index 36d82f1..22ebd71 100644 --- a/README.md +++ b/README.md @@ -43,3 +43,11 @@ subcommands To get help on a subcommand: etherdump revisionscount --help + +TODO +-------- +* Modify tools to work with make +** Sync command +** Dump command that works on a single page +** Post processing as separable filters (such as linkify) +* Support for migrating (what dump formats exist that would allow pushing to another instance?) diff --git a/etherdump/commands/common.py b/etherdump/commands/common.py new file mode 100644 index 0000000..b8d6194 --- /dev/null +++ b/etherdump/commands/common.py @@ -0,0 +1,25 @@ +import re, os +from urllib import quote_plus, unquote_plus + + +groupnamepat = re.compile(r"^g\.(\w+)\$") +def splitpadname (padid): + m = groupnamepat.match(padid) + if m: + return(m.group(1), padid[m.end():]) + else: + return (u"", padid) + +def padpath (padid, pub_path=u"", group_path=u""): + g, p = splitpadname(padid) + if type(g) == unicode: + g = g.encode("utf-8") + if type(p) == unicode: + p = p.encode("utf-8") + p = quote_plus(p) + # p = p.replace(" ", "_") + # p = p.replace("*", "-") + if g: + return os.path.join(group_path, g, p) + else: + return os.path.join(pub_path, p) diff --git a/etherdump/commands/creatediffhtml.py b/etherdump/commands/creatediffhtml.py new file mode 100644 index 0000000..66be578 --- /dev/null +++ b/etherdump/commands/creatediffhtml.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python + +from argparse import ArgumentParser +import json +from urllib import urlencode +from urllib2 import urlopen, HTTPError, URLError + + +def main(args): + p = ArgumentParser("") + p.add_argument("padid", help="the padid") + p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json") + p.add_argument("--showurl", default=False, action="store_true") + p.add_argument("--format", default="text", help="output format, can be: text, json; default: text") + p.add_argument("--rev", type=int, default=None, 
help="revision, default: latest") + args = p.parse_args(args) + + with open(args.padinfo) as f: + info = json.load(f) + apiurl = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info) + data = {} + data['apikey'] = info['apikey'] + data['padID'] = args.padid + data['startRev'] = "0" + if args.rev != None: + data['rev'] = args.rev + requesturl = apiurl+'createDiffHTML?'+urlencode(data) + if args.showurl: + print requesturl + else: + try: + results = json.load(urlopen(requesturl))['data'] + if args.format == "json": + print json.dumps(results) + else: + print results['html'].encode("utf-8") + except HTTPError as e: + pass \ No newline at end of file diff --git a/etherdump/commands/dumpcsv.py b/etherdump/commands/dumpcsv.py index d390b94..4343ecb 100644 --- a/etherdump/commands/dumpcsv.py +++ b/etherdump/commands/dumpcsv.py @@ -6,6 +6,7 @@ from datetime import datetime from urllib import urlencode from urllib2 import urlopen, HTTPError, URLError from csv import writer +from math import ceil, floor """ Dumps a CSV of all pads with columns @@ -32,6 +33,7 @@ def main (args): p = ArgumentParser("") p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json") p.add_argument("--format", default="csv", help="output format: csv (default), json") + p.add_argument("--zerorevs", default=False, action="store_true", help="include pads with zero revisions, default: False") args = p.parse_args(args) with open(args.padinfo) as f: @@ -41,11 +43,23 @@ def main (args): data['apikey'] = info['apikey'] requesturl = apiurl+'listAllPads?'+urlencode(data) - results = jsonload(requesturl)['data']['padIDs'] - results.sort() + padids = jsonload(requesturl)['data']['padIDs'] + padids.sort() + numpads = len(padids) + maxmsglen = 0 + count = 0 out.writerow(("padid", "groupid", "lastedited", "revisions", "author_ids")) - for padid in results: - print (u"{0}".format(padid), file=sys.stderr) + for i, padid in enumerate(padids): + p = (float(i) 
/ numpads) + percentage = int(floor(p*100)) + bars = int(ceil(p*20)) + bar = ("*"*bars) + ("-"*(20-bars)) + msg = u"\r{0} {1}/{2} {3}... ".format(bar, (i+1), numpads, padid) + if len(msg) > maxmsglen: + maxmsglen = len(msg) + sys.stderr.write("\r{0}".format(" "*maxmsglen)) + sys.stderr.write(msg.encode("utf-8")) + sys.stderr.flush() m = groupnamepat.match(padid) if m: groupname = m.group(1) @@ -56,10 +70,16 @@ def main (args): data['padID'] = padid.encode("utf-8") revisions = jsonload(apiurl+'getRevisionsCount?'+urlencode(data))['data']['revisions'] + if (revisions == 0) and not args.zerorevs: + continue + + lastedited_raw = jsonload(apiurl+'getLastEdited?'+urlencode(data))['data']['lastEdited'] lastedited_iso = datetime.fromtimestamp(int(lastedited_raw)/1000).isoformat() author_ids = jsonload(apiurl+'listAuthorsOfPad?'+urlencode(data))['data']['authorIDs'] author_ids = u" ".join(author_ids).encode("utf-8") out.writerow((padidnogroup.encode("utf-8"), groupname.encode("utf-8"), revisions, lastedited_iso, author_ids)) + count += 1 + print("\nWrote {0} rows...".format(count), file=sys.stderr) diff --git a/etherdump/commands/gethtml.py b/etherdump/commands/gethtml.py new file mode 100644 index 0000000..419c629 --- /dev/null +++ b/etherdump/commands/gethtml.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python + +from argparse import ArgumentParser +import json +from urllib import urlencode +from urllib2 import urlopen, HTTPError, URLError + + +def main(args): + p = ArgumentParser("") + p.add_argument("padid", help="the padid") + p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json") + p.add_argument("--showurl", default=False, action="store_true") + p.add_argument("--format", default="text", help="output format, can be: text, json; default: text") + p.add_argument("--rev", type=int, default=None, help="revision, default: latest") + args = p.parse_args(args) + + with open(args.padinfo) as f: + info = json.load(f) + apiurl = 
"{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info) + data = {} + data['apikey'] = info['apikey'] + data['padID'] = args.padid + if args.rev != None: + data['rev'] = args.rev + requesturl = apiurl+'getHTML?'+urlencode(data) + if args.showurl: + print requesturl + else: + results = json.load(urlopen(requesturl))['data'] + if args.format == "json": + print json.dumps(results) + else: + print results['html'].encode("utf-8") diff --git a/etherdump/commands/text.py b/etherdump/commands/gettext.py similarity index 84% rename from etherdump/commands/text.py rename to etherdump/commands/gettext.py index c0f50dc..7e806e5 100644 --- a/etherdump/commands/text.py +++ b/etherdump/commands/gettext.py @@ -12,6 +12,7 @@ def main(args): p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json") p.add_argument("--showurl", default=False, action="store_true") p.add_argument("--format", default="text", help="output format, can be: text, json; default: text") + p.add_argument("--rev", type=int, default=None, help="revision, default: latest") args = p.parse_args(args) with open(args.padinfo) as f: @@ -19,7 +20,9 @@ def main(args): apiurl = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info) data = {} data['apikey'] = info['apikey'] - data['padID'] = args.padid.encode("utf-8") + data['padID'] = args.padid # is utf-8 encoded + if args.rev != None: + data['rev'] = args.rev requesturl = apiurl+'getText?'+urlencode(data) if args.showurl: print requesturl @@ -29,6 +32,3 @@ def main(args): print json.dumps(results) else: print results['text'].encode("utf-8") - - # To save to file run: - # python gettext.py > copy.txt diff --git a/etherdump/commands/showmeta.py b/etherdump/commands/showmeta.py new file mode 100644 index 0000000..9d0053a --- /dev/null +++ b/etherdump/commands/showmeta.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python +from __future__ import print_function +from argparse import ArgumentParser 
+import json, sys, re +from common import * + +""" +Extract and output selected fields of metadata +""" + +def main (args): + p = ArgumentParser("") + p.add_argument("--path", default=None, help="read from a meta.json file") + p.add_argument("--padid", default=None, help="read meta for this padid") + p.add_argument("--format", default="{padid}", help="format str, default: {padid}") + args = p.parse_args(args) + + path = args.path + if not path and args.padid: + path = padpath(args.padid) + ".meta.json" + + if not path: + print ("Must specify either --path or --padid") + sys.exit(-1) + + with open(path) as f: + meta = json.load(f) + + formatstr = args.format.decode("utf-8") + formatstr = re.sub(ur"{(\w+)}", r"{0[\1]}", formatstr) + print (formatstr.format(meta).encode("utf-8")) + diff --git a/etherdump/commands/sync.py b/etherdump/commands/sync.py new file mode 100644 index 0000000..ba69f9c --- /dev/null +++ b/etherdump/commands/sync.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python +from __future__ import print_function +from argparse import ArgumentParser +import sys, json, re, os +from datetime import datetime +from urllib import urlencode +from urllib2 import urlopen, HTTPError, URLError +from math import ceil, floor +from common import * + +""" +sync(meta): + Update meta data files for those that have changed. 
+ Check for changed pads by looking at revisions & comparing to existing + +""" + +def jsonload (url): + f = urlopen(url) + data = f.read() + f.close() + return json.loads(data) + +def load_padinfo(p): + with open(p) as f: + info = json.load(f) + info['api'] = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info) + return info + + +def main (args): + p = ArgumentParser("") + p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json") + p.add_argument("--zerorevs", default=False, action="store_true", help="include pads with zero revisions, default: False") + p.add_argument("--pub", default="pub", help="pub path for output, default: pub") + p.add_argument("--group", default="g", help="group path for output, default: g") + p.add_argument("--skip", default=None, type=int, help="skip this many items, default: None") + args = p.parse_args(args) + + info = load_padinfo(args.padinfo) + data = {} + data['apikey'] = info['apikey'] + padids = jsonload(info['api']+'listAllPads?'+urlencode(data))['data']['padIDs'] + padids.sort() + numpads = len(padids) + maxmsglen = 0 + count = 0 + for i, padid in enumerate(padids): + if args.skip != None and i<args.skip: + continue + p = (float(i) / numpads) + percentage = int(floor(p*100)) + bars = int(ceil(p*20)) + bar = ("*"*bars) + ("-"*(20-bars)) + msg = u"\r{0} {1}/{2} {3}... ".format(bar, (i+1), numpads, padid) + if len(msg) > maxmsglen: + maxmsglen = len(msg) + sys.stderr.write("\r{0}".format(" "*maxmsglen)) + sys.stderr.write(msg.encode("utf-8")) + sys.stderr.flush() + data['padID'] = padid.encode("utf-8") + p = padpath(padid, args.pub, args.group) + metapath = p + ".meta.json" + revisions = None + if os.path.exists(metapath): + with open(metapath) as f: + meta = json.load(f) + revisions = jsonload(info['api']+'getRevisionsCount?'+urlencode(data))['data']['revisions'] + if meta['revisions'] == revisions: + continue + + meta = {'padid': padid.encode("utf-8")} + if revisions == None: + meta['revisions'] = jsonload(info['api']+'getRevisionsCount?'+urlencode(data))['data']['revisions'] + else: + meta['revisions' ] = revisions + + if (meta['revisions'] == 0) and (not args.zerorevs): + # print("Skipping
zero revs", file=sys.stderr) + continue + + count += 1 + # todo: load more metadata! + meta['lastedited_raw'] = int(jsonload(info['api']+'getLastEdited?'+urlencode(data))['data']['lastEdited']) + meta['lastedited_iso'] = datetime.fromtimestamp(int(meta['lastedited_raw'])/1000).isoformat() + meta['author_ids'] = jsonload(info['api'] +'listAuthorsOfPad?'+urlencode(data))['data']['authorIDs'] + + # save it + try: + os.makedirs(os.path.split(metapath)[0]) + except OSError: + pass + with open(metapath, "w") as f: + json.dump(meta, f) + + print("\nWrote {0} files...".format(count), file=sys.stderr)