etherpump/etherdump
2015-03-05 16:04:59 +01:00

426 lines
15 KiB
Python
Executable File

#!/usr/bin/env python
from __future__ import print_function
import sys, argparse, json, re, os, time
from urllib2 import urlopen, HTTPError, URLError
import html5lib, urllib2, urllib
from xml.etree import ElementTree as ET
from urllib import urlencode
from urlparse import urljoin
from datetime import datetime
PADINFO_DEFAULTS = {
"hostname": "",
"apiversion": "1.2.9",
"apiurl": "/api/"
}
MODULE_PATH = (os.path.dirname(__file__))
TEMPLATES_PATH = os.path.join(MODULE_PATH, "templates")
verbose = False
def pad_split_group (n):
m = re.match(r"g\.(\w+)\$(.+)$", n)
if m:
return m.groups()
else:
return ('', n)
def content(tag):
if tag.text == None:
return u''.join(ET.tostring(e) for e in tag)
else:
return tag.text + u''.join(ET.tostring(e) for e in tag)
class PadServer (object):
def __init__ (self, hostname, port=9001, apipath="/api/", apiversion="1.2.9", apikey=None, secure=False):
self.hostname = hostname
if secure:
self.protocol = "https"
else:
self.protocol = "http"
self.apiurl = self.protocol+"://"+hostname
if port:
self.apiurl += ":{0}".format(port)
self.apiurl += "{0}{1}/".format(apipath, apiversion)
self.apikey = apikey
def listAllPads (self):
data = {'apikey': self.apikey}
url = self.apiurl+'listAllPads?'+urlencode(data)
return json.load(urlopen(url))['data']['padIDs']
def listAllGroups (self):
data = {'apikey': self.apikey}
url = self.apiurl+'listAllGroups?'+urlencode(data)
return json.load(urlopen(url))['data']['groupIDs']
def getPadText (self, padID):
data = {'apikey': self.apikey, 'padID': padID.encode("utf-8")}
return json.load(urlopen(self.apiurl+'getText?'+urlencode(data)))['data']['text']
def getPadHTML (self, padID):
data = {'apikey': self.apikey, 'padID': padID.encode("utf-8")}
return json.load(urlopen(self.apiurl+'getHTML?'+urlencode(data)))['data']['html']
def getPadLastEdited (self, padID):
data = {'apikey': self.apikey, 'padID': padID.encode("utf-8")}
raw = json.load(urlopen(self.apiurl+'getLastEdited?'+urlencode(data)))['data']['lastEdited']
return datetime.fromtimestamp(int(raw)/1000)
def getPadURL (self, padID):
group, name = pad_split_group(padID)
if group:
return self.protocol+"://"+self.hostname+"/p/"+padID
else:
return self.protocol+"://"+self.hostname+"/public_pad/"+padID
def get_template_env (tpath=None):
import jinja2
paths = []
if tpath and os.path.isdir(tpath):
paths.append(tpath)
paths.append(TEMPLATES_PATH)
loader = jinja2.FileSystemLoader(paths)
env = jinja2.Environment(loader=loader)
return env
# template = env.get_template('pad.html')
# print template.render(the='variables', go='here').encode("utf-8")
def dumpPads (padserver, padids, outputpath, pub_path, group_path, sleeptime=0.01, force=False, templates=None):
template_env = get_template_env(templates)
pad_template = template_env.get_template("pad.html")
numpads = len(padids)
for i, padid in enumerate(padids):
group_id, pad_name = pad_split_group(padid)
if group_id:
try:
os.mkdir(group_path)
except OSError:
pass
try:
os.mkdir(os.path.join(group_path, group_id))
except OSError:
pass
fp = os.path.join(outputpath, group_path, group_id, pad_name)
else:
try:
os.mkdir(pub_path)
except OSError:
pass
fp = os.path.join(outputpath, pub_path, pad_name)
if verbose:
print (u"Saving to {0}".format(fp).encode("utf-8"), file=sys.stderr)
else:
sys.stderr.write("\rDumping pads... [{0}/{1}]".format(i+1, numpads))
sys.stderr.flush()
textpath = fp + ".txt"
htmlpath = fp+".html"
metapath = fp+".json"
last_edited = padserver.getPadLastEdited(padid).isoformat()
if os.path.exists(metapath):
with open(metapath) as f:
meta = json.load(f)
if not force and meta.get("last_edited") == last_edited:
if verbose:
print("Up to date, skipping", file=sys.stderr)
continue
meta = {
'pad_id': padid,
'group_id': group_id,
'pad_name': pad_name
}
meta['last_edited'] = last_edited
# Write Text
with open(textpath, "w") as f:
try:
text = padserver.getPadText(padid)
f.write(text.encode("utf-8"))
meta['text_path'] = os.path.relpath(textpath, outputpath)
meta['text_length'] = len(text)
meta['text_length_human'] = humanize_bytes(meta['text_length'])
except (TypeError, HTTPError, ValueError) as e:
print(u"Warning: unable to load text for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr)
with open(htmlpath, "w") as f:
html = padserver.getPadHTML(padid)
meta['html_path'] = os.path.relpath(htmlpath, outputpath)
meta['html_length'] = len(html)
if pad_template:
t = html5lib.parse(html, treebuilder="etree", namespaceHTMLElements=False)
body = t.find(".//body")
title = padid
editurl = padserver.getPadURL(padid)
meta['url'] = editurl
json_dump = json.dumps(meta)
f.write(pad_template.render(
body=content(body),
title=title,
editurl=editurl,
sourceurl=textpath,
metadata_json=json_dump).encode("utf-8")) # unicode error HERE!
else:
f.write(html.encode("utf-8"))
# except (TypeError, HTTPError, ValueError) as e:
# print(u"Warning: unable to load HTML for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr)
with open(metapath, "w") as f:
f.write(json.dumps(meta))
if sleeptime:
time.sleep(sleeptime)
if not verbose:
sys.stderr.write("\rDumping pads... [{0}] \n".format(numpads))
sys.stderr.flush()
def humanize_bytes(bytes, precision=0):
"""Return a humanized string representation of a number of bytes.
Assumes `from __future__ import division`.
>>> humanize_bytes(1)
'1 byte'
>>> humanize_bytes(1024)
'1.0 kB'
>>> humanize_bytes(1024*123)
'123.0 kB'
>>> humanize_bytes(1024*12342)
'12.1 MB'
>>> humanize_bytes(1024*12342,2)
'12.05 MB'
>>> humanize_bytes(1024*1234,2)
'1.21 MB'
>>> humanize_bytes(1024*1234*1111,2)
'1.31 GB'
>>> humanize_bytes(1024*1234*1111,1)
'1.3 GB'
"""
abbrevs = (
(1<<50L, 'Petabyte'),
(1<<40L, 'Tb'),
(1<<30L, 'Gb'),
(1<<20L, 'Mb'),
(1<<10L, 'kb'),
(1, 'bytes')
)
if bytes == 1:
return '1 byte'
for factor, suffix in abbrevs:
if bytes >= factor:
break
return '%.*f %s' % (precision, bytes / factor, suffix)
def padids_from_path (path):
from glob import glob
inputs = glob(os.path.join(path, "*.json"))
inputs.sort()
pads = []
for fp in inputs:
with open(fp) as f:
info = json.load(f)
info['path'] = fp
pads.append(info)
return pads
if __name__ == "__main__":
parser = argparse.ArgumentParser()
# command
parser.add_argument('command', default="", help='command to perform: listpads, listgroups, dump, createindex')
# padinfo
parser.add_argument('--padinfo', default="padinfo.json", help='path to JSON file containing all pad login data, default padinfo.json, alternatively specify hostname, port, etc as separate options')
parser.add_argument('--hostname', default="", help='the hostname of the etherpad server')
parser.add_argument('--port', type=int, help='port of etherpad server')
parser.add_argument('--apikey', help='API key')
parser.add_argument('--apiversion', help='the version of the etherpad api')
parser.add_argument('--apiurl', help='URL path to the API')
parser.add_argument('--verbose', action="store_true", default=False, help='debug mode, verbose output')
parser.add_argument('--outputpath', default=os.getcwd(), help='path for output, default is .')
parser.add_argument('--pubpath', default="pub", help='path to dump public pads')
parser.add_argument('--grouppath', default="priv", help='path to dump group pads')
parser.add_argument('--templates', default=os.path.join(os.getcwd(), "templates"), help='(addition) templates path, default: ./templates')
# listpads/groups-specific
parser.add_argument('--lines', default=False, action="store_true", help='(listpads/groups) output one per line instead of JSON')
# dump-specific
parser.add_argument('--force', default=False, action="store_true", help='(dump) force dump even if up to date')
parser.add_argument('--skip', default=None, type=int, help='(dump) skip this many (start at index)')
parser.add_argument('--limit', default=None, type=int, help='(dump) stop after limit items')
# index-specific
parser.add_argument('--title', default="etherpad index &amp; archive", help='(index) title')
parser.add_argument('--exclude-groups', default=False, action="store_true", help='(index) ignore groups')
parser.add_argument('--groupinfo', default=None, help='(index) groupinfo json file')
parser.add_argument('--output', default=None, help='(index) path for output (default stdout)')
args = parser.parse_args()
verbose = args.verbose
padinfo = PADINFO_DEFAULTS
if args.padinfo:
try:
with open(args.padinfo) as f:
for key, value in json.load(f).items():
padinfo[key] = value
except IOError, e:
print("WARNING: Tried to open {0}, but couldn't ({1})".format(args.padinfo, e), file=sys.stderr)
except ValueError, e:
print("WARNING: Error reading {0}, check the JSON syntax ({1})".format(args.padinfo, e))
# allow explicit opts to override
if args.hostname:
padinfo['hostname'] = args.hostname
if args.port:
padinfo['port'] = args.port
if args.apikey:
padinfo['apikey'] = args.apikey
if args.apiversion:
padinfo['apiversion'] = args.apiversion
if args.apiurl:
padinfo['apiurl'] = args.apiurl
padserver = PadServer(
hostname=padinfo.get("hostname"),
port=padinfo.get("port"),
apipath=padinfo.get("apiurl"),
apiversion=padinfo.get("apiversion"),
apikey=padinfo.get("apikey")
)
if verbose:
print ("Connecting to {0}".format(padserver.apiurl), file=sys.stderr)
###############################
# Command Dispatch
###############################
cmd = args.command.lower()
if cmd == "listpads":
padids = padserver.listAllPads()
if not args.lines:
json.dump(padids, sys.stdout)
else:
for padid in padids:
print(padid)
elif cmd == "listgroups":
groupids = padserver.listAllGroups()
if not args.lines:
json.dump(groupids, sys.stdout)
else:
for gid in groupids:
print(gid)
elif cmd == "dump":
start = time.time()
padids = padserver.listAllPads()
if args.skip:
padids = padids[args.skip:]
if args.limit:
padids = padids[:args.limit]
dumpPads(
padserver,
padids,
args.outputpath,
args.pubpath,
args.grouppath,
force=args.force,
templates=args.templates)
if verbose:
print ("Completed in {0:0.0f} seconds".format(time.time()-start), file=sys.stderr)
elif cmd == "index":
def augment_info(info, groupinfo):
if info.get("last_edited") != None:
dt = datetime.strptime( info.get("last_edited"), "%Y-%m-%dT%H:%M:%S" )
info['last_edited_parsed'] = dt
info['last_edited_str'] = str(dt)
if groupinfo:
gid = info.get("group_id")
if gid.startswith("g."):
gid = gid[2:]
if gid in groupinfo:
info[u"group_name"] = groupinfo[gid].get("name")
# print (info, file=sys.stderr)
return info
def get_pads(groupinfo=None):
pads = padids_from_path(args.pubpath)
pads = [augment_info(x, groupinfo) for x in pads]
# print (("padids_from_path", args.pubpath, pads), file=sys.stderr)
if not args.exclude_groups and os.path.exists(args.grouppath):
groups = [os.path.join(args.grouppath, x) for x in os.listdir(args.grouppath)]
groups = [x for x in groups if os.path.isdir(x)]
groups.sort()
for gp in groups:
if groupinfo:
b = os.path.basename(gp)
if b not in groupinfo:
continue
try:
pad_infos = padids_from_path(gp)
pad_infos = [augment_info(x, groupinfo) for x in pad_infos]
pads.extend(pad_infos)
except OSError:
pass
return pads
groupinfo = None
if args.groupinfo:
with open(args.groupinfo) as gif:
groupinfo = json.load(gif)
if verbose:
print ("Using groupinfo", file=sys.stderr)
pads = get_pads(groupinfo)
padids = [(x.get("pad_name").lower(), x) for x in pads]
padids.sort()
pads = [x[1] for x in padids]
out = sys.stdout
if args.output:
out = open(args.output, "w")
import jinja2
env = get_template_env(args.templates)
index_template = env.get_template("index.html")
out.write(index_template.render(
pads = pads,
title = args.title,
timestamp = datetime.now()
).encode("utf-8"))
if args.output:
out.close()
else:
print ("Command '{0}' not understood, try: listpads, listgroups, dump".format(args.command), file=sys.stderr)