etherpump/etherdump/commands/dumpcsv.py
2015-11-22 21:59:52 +01:00

85 lines
2.9 KiB
Python

#!/usr/bin/env python
from __future__ import print_function
from argparse import ArgumentParser
import sys, json, re
from datetime import datetime
from urllib import urlencode
from urllib2 import urlopen, HTTPError, URLError
from csv import writer
from math import ceil, floor
"""
Dumps a CSV of all pads with columns
padid, groupid, revisions, lastedited, author_ids
padids have their group name trimmed
groupid is without (g. $)
revisions is an integral number of edits
lastedited is ISO8601 formatted
author_ids is a space delimited list of internal author IDs
"""
# Matches the "g.<groupname>$" prefix at the start of a group pad ID;
# group(1) captures the group name between "g." and "$".
groupnamepat = re.compile(r"^g\.(\w+)\$")
# Module-wide CSV writer; every row is written to standard output.
out = writer(sys.stdout)
def jsonload (url):
    """
    Fetch url and return its response body parsed as JSON.

    url -- the complete URL to request (query string included).

    Errors raised by urlopen (e.g. HTTPError, URLError) propagate to the
    caller, as does the ValueError raised by json.loads when the body is
    not valid JSON.
    """
    f = urlopen(url)
    try:
        # Close the response even when read() fails, so the handle is not leaked.
        data = f.read()
    finally:
        f.close()
    return json.loads(data)
def main (args):
    """
    Write a CSV row for every pad to standard output.

    args -- list of command line arguments (without the program name):
        --padinfo   path of the JSON settings file; its "apiurl" and
                    "apikey" values are used to build the API requests
        --zerorevs  also output pads that have zero revisions

    The columns are padid, groupid, revisions, lastedited, author_ids.
    A progress line is rewritten on stderr for each pad, followed by a
    final count of the rows written.
    """
    # The sentence is passed as description=: ArgumentParser's first
    # positional parameter is prog (the program name shown in usage).
    parser = ArgumentParser(description="outputs a CSV of information on all pads")
    parser.add_argument("--padinfo", default=".etherdump/settings.json", help="settings, default: .etherdump/settings.json")
    parser.add_argument("--zerorevs", default=False, action="store_true", help="include pads with zero revisions, default: False")
    args = parser.parse_args(args)

    with open(args.padinfo) as f:
        info = json.load(f)
    # apiurl is used as a plain prefix: the API method name is appended directly.
    apiurl = info.get("apiurl")
    data = {}
    data['apikey'] = info['apikey']
    requesturl = apiurl+'listAllPads?'+urlencode(data)
    padids = jsonload(requesturl)['data']['padIDs']
    padids.sort()
    numpads = len(padids)
    maxmsglen = 0
    count = 0

    # Header order must match the order of the values written for each pad below.
    out.writerow(("padid", "groupid", "revisions", "lastedited", "author_ids"))
    for i, padid in enumerate(padids):
        # Progress display: a 20 character bar plus the current pad ID.
        progress = float(i) / numpads
        bars = int(ceil(progress*20))
        bar = ("*"*bars) + ("-"*(20-bars))
        msg = u"\r{0} {1}/{2} {3}... ".format(bar, (i+1), numpads, padid)
        # Pad to the widest message seen so far, so that a shorter message
        # fully overwrites the remains of a longer previous one.
        maxmsglen = max(maxmsglen, len(msg))
        sys.stderr.write(msg.ljust(maxmsglen).encode("utf-8"))
        sys.stderr.flush()

        # Split a "g.<groupname>$<name>" pad ID into group name and bare name.
        m = groupnamepat.match(padid)
        if m:
            groupname = m.group(1)
            padidnogroup = padid[m.end():]
        else:
            groupname = u""
            padidnogroup = padid

        # data keeps the apikey; only padID changes between pads.
        data['padID'] = padid.encode("utf-8")
        revisions = jsonload(apiurl+'getRevisionsCount?'+urlencode(data))['data']['revisions']
        if (revisions == 0) and not args.zerorevs:
            # Skip before making the remaining two API requests for this pad.
            continue
        lastedited_raw = jsonload(apiurl+'getLastEdited?'+urlencode(data))['data']['lastEdited']
        # The value is divided by 1000 before conversion -- presumably the
        # API reports milliseconds since the epoch; confirm against the API docs.
        lastedited_iso = datetime.fromtimestamp(int(lastedited_raw)/1000).isoformat()
        author_ids = jsonload(apiurl+'listAuthorsOfPad?'+urlencode(data))['data']['authorIDs']
        author_ids = u" ".join(author_ids).encode("utf-8")
        out.writerow((padidnogroup.encode("utf-8"), groupname.encode("utf-8"), revisions, lastedited_iso, author_ids))
        count += 1

    print("\nWrote {0} rows...".format(count), file=sys.stderr)