initial commit

This commit is contained in:
Michael Murtaugh 2015-02-26 13:54:26 +01:00
commit 771d76f67c
3 changed files with 214 additions and 0 deletions

4
.gitignore vendored Normal file
View File

@ -0,0 +1,4 @@
venv/
sites/
*.pyc
*~

4
README.md Normal file
View File

@ -0,0 +1,4 @@
etherdump
=========
Tool to make archival dumps of etherpad pages.

206
etherdump Executable file
View File

@ -0,0 +1,206 @@
#!/usr/bin/python
from __future__ import print_function
import sys, argparse, json, re, os, time
from urllib2 import urlopen, HTTPError, URLError
import html5lib, urllib2, urllib
from urllib import urlencode
from urlparse import urljoin
# Connection settings used for any key that neither the padinfo JSON file
# nor a command-line option supplies.
PADINFO_DEFAULTS = dict(
    hostname="",
    apiversion="1.2.9",
    apiurl="/api/",
)

# Module-wide debug switch: when true, the API helpers echo what they are
# doing to stderr.
verbose = False
def listAllPads (apiURL, apikey):
    """Return the pad IDs reported by the ``listAllPads`` API call.

    apiURL -- base API URL; the call name and query string are appended
              directly to it.
    apikey -- value sent as the ``apikey`` query parameter.

    The request URL is echoed to stderr when the module-level ``verbose``
    flag is set.
    """
    endpoint = apiURL + 'listAllPads?' + urlencode({'apikey': apikey})
    if verbose:
        print (endpoint, file=sys.stderr)
    payload = json.load(urlopen(endpoint))
    return payload['data']['padIDs']
def listAllGroups (apiURL, apikey):
    """Return the group IDs reported by the ``listAllGroups`` API call.

    apiURL -- base API URL; the call name and query string are appended
              directly to it.
    apikey -- value sent as the ``apikey`` query parameter.

    The request URL is echoed to stderr when the module-level ``verbose``
    flag is set.
    """
    endpoint = apiURL + 'listAllGroups?' + urlencode({'apikey': apikey})
    if verbose:
        print (endpoint, file=sys.stderr)
    payload = json.load(urlopen(endpoint))
    return payload['data']['groupIDs']
def getPadText (padID, apiURL, apikey):
    """Return the ``text`` field of the ``getText`` API response for padID.

    apiURL is the base API URL and apikey the key sent with the request.
    """
    query = urlencode({'apikey': apikey, 'padID': padID})
    response = urlopen(apiURL + 'getText?' + query)
    payload = json.load(response)
    return payload['data']['text']
def getPadHTML (padID, apiURL, apikey):
    """Return the ``html`` field of the ``getHTML`` API response for padID.

    apiURL is the base API URL and apikey the key sent with the request.
    """
    query = urlencode({'apikey': apikey, 'padID': padID})
    response = urlopen(apiURL + 'getHTML?' + query)
    payload = json.load(response)
    return payload['data']['html']
def getPadLastEdited (padID, apiURL, apikey):
    """Return the ``lastEdited`` field of the ``getLastEdited`` API response.

    padID  -- ID of the pad to query.
    apiURL -- base API URL; the call name and query string are appended
              directly to it.
    apikey -- value sent as the ``apikey`` query parameter.
    """
    # The call must be getLastEdited: the getHTML response carries the pad's
    # HTML, not a 'lastEdited' entry, so querying getHTML here could never
    # return the value this function promises.
    query = urlencode({'apikey': apikey, 'padID': padID})
    r = json.load(urlopen(apiURL + 'getLastEdited?' + query))
    return r['data']['lastEdited']
def pad_split_group (n):
    """Split a pad ID into a ``(group_id, pad_name)`` pair.

    IDs of the form ``g.<group>$<name>`` yield their two parts; any other
    ID is returned unchanged as the name, with an empty group ID.
    """
    match = re.match(r"g\.(\w+)\$(.+)$", n)
    if match is None:
        return ('', n)
    return match.groups()
def _ensure_dir (path):
    """Try to create directory path; an OSError (e.g. it already exists) is ignored."""
    try:
        os.mkdir(path)
    except OSError:
        pass


def _pad_call_url (apiurl, apikey, call, padid):
    """Build the URL of API function `call` for pad `padid` (UTF-8 encoded)."""
    return apiurl + call + "?" + urlencode({'apikey':apikey, 'padID':padid.encode("utf-8")})


def dumpPads (pub_path, group_path, apiurl, apikey, padids, sleeptime=0.01, skip_existing=False):
    """Dump every pad in padids to disk as text, HTML and JSON metadata.

    Group pads (IDs of the form ``g.<group>$<name>``) are written under
    ``group_path/<group>/<name>``, all other pads under ``pub_path/<name>``.
    Three files are produced per pad: ``<name>.utf8.txt``,
    ``<name>.utf8.html`` and ``<name>.json`` (the metadata).

    pub_path, group_path -- output directories, created on demand.
    apiurl, apikey       -- base API URL and key used for every request.
    padids               -- iterable of pad IDs (each must support
                            ``.encode("utf-8")``).
    sleeptime            -- seconds to pause after each dumped pad; a falsy
                            value disables the pause.
    skip_existing        -- when true, a pad whose ``.json`` file already
                            exists is skipped entirely.

    A failed API call (TypeError, HTTPError or ValueError) only produces a
    warning on stderr; the dump continues, and the corresponding metadata
    key is simply absent from the ``.json`` file.
    """
    for padid in padids:
        group_id, pad_name = pad_split_group(padid)
        if group_id:
            _ensure_dir(group_path)
            _ensure_dir(os.path.join(group_path, group_id))
            fp = os.path.join(group_path, group_id, pad_name)
        else:
            _ensure_dir(pub_path)
            fp = os.path.join(pub_path, pad_name)

        if verbose:
            print (u"Saving to {0}".format(fp).encode("utf-8"), file=sys.stderr)

        # The .json file is written last, so its presence marks a pad that
        # a previous run already went through.
        if skip_existing and os.path.exists(fp+".json"):
            continue

        # Write Metadata
        meta = {
            'padid': padid,
            'groupID': group_id,
            'padname': pad_name
        }
        url = _pad_call_url(apiurl, apikey, "getLastEdited", padid)
        try:
            resp = json.load(urlopen(url))
            meta['lastEdited'] = resp['data']['lastEdited']
        except (TypeError, HTTPError, ValueError) as e:
            # This warning used to claim the pad *text* failed to load.
            print(u"Warning: unable to load last edited time for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr)

        # Write Text
        with open(fp+".utf8.txt", "w") as f:
            url = _pad_call_url(apiurl, apikey, "getText", padid)
            try:
                resp = json.load(urlopen(url))
                text = resp['data']['text'].encode("utf-8")
                f.write(text)
                meta['text_length'] = len(text)
            except (TypeError, HTTPError, ValueError) as e:
                print(u"Warning: unable to load text for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr)

        # Write HTML
        with open(fp+ ".utf8.html", "w") as f:
            url = _pad_call_url(apiurl, apikey, "getHTML", padid)
            try:
                resp = json.load(urlopen(url))
                text = resp['data']['html'].encode("utf-8")
                f.write(text)
                meta['html_length'] = len(text)
            except (TypeError, HTTPError, ValueError) as e:
                print(u"Warning: unable to load HTML for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr)

        with open(fp+".json", "w") as f:
            f.write(json.dumps(meta))

        if sleeptime:
            time.sleep(sleeptime)
# Command-line entry point: parse options, work out the API base URL and
# key, then run the requested command (listpads, listgroups or dump).
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('command', default="", help='command to perform: listpads, listgroups, dump, createindex')
    parser.add_argument('--padinfo', default="padinfo.json", help='path to JSON file containing all pad login data, default padinfo.json, alternatively specify hostname, port, etc as separate options')
    parser.add_argument('--hostname', default="", help='the hostname of the etherpad server')
    parser.add_argument('--port', type=int, help='port of etherpad server')
    parser.add_argument('--apikey', help='API key')
    parser.add_argument('--apiversion', help='the version of the etherpad api')
    parser.add_argument('--apiurl', help='URL path to the API')
    parser.add_argument('--verbose', action="store_true", default=False, help='debug mode, verbose output')
    parser.add_argument('--pubpath', default="pub", help='path to dump public pads')
    parser.add_argument('--grouppath', default="priv", help='path to dump group pads')
    parser.add_argument('--human', default=False, action="store_true", help='output for reading')
    parser.add_argument('--skip-existing', default=False, action="store_true", help='skip existing files on dump')
    args = parser.parse_args()

    # Module-level flag read by the API helper functions.
    verbose = args.verbose

    # Work on a copy so the module-level defaults are never modified.
    padinfo = dict(PADINFO_DEFAULTS)
    if args.padinfo:
        try:
            with open(args.padinfo) as f:
                for key, value in json.load(f).items():
                    padinfo[key] = value
        except IOError as e:
            print("WARNING: Tried to open {0}, but couldn't ({1})".format(args.padinfo, e), file=sys.stderr)
        except ValueError as e:
            # Warnings go to stderr so they cannot end up inside the JSON
            # that listpads/listgroups write to stdout.
            print("WARNING: Error reading {0}, check the JSON syntax ({1})".format(args.padinfo, e), file=sys.stderr)

    # allow explicit opts to override
    if args.hostname:
        padinfo['hostname'] = args.hostname
    if args.port:
        padinfo['port'] = args.port
    if args.apikey:
        padinfo['apikey'] = args.apikey
    if args.apiversion:
        padinfo['apiversion'] = args.apiversion
    if args.apiurl:
        padinfo['apiurl'] = args.apiurl

    # Construct the base API URL
    apiurl = "http://" + padinfo.get("hostname")
    if padinfo.get("port"):
        apiurl += ":{0}".format(padinfo['port'])
    apiurl += "{0}{1}/".format(padinfo['apiurl'], padinfo['apiversion'])
    apikey = padinfo.get("apikey")
    if verbose:
        print ("Connecting to {0}".format(apiurl), file=sys.stderr)

    ###############################
    # Command Dispatch
    ###############################
    cmd = args.command.lower()
    if cmd == "listpads":
        padids = listAllPads(apiurl, apikey)
        if not args.human:
            json.dump(padids, sys.stdout)
        else:
            for padid in padids:
                print(padid)
    elif cmd == "listgroups":
        groupids = listAllGroups(apiurl, apikey)
        if not args.human:
            json.dump(groupids, sys.stdout)
        else:
            for gid in groupids:
                print(gid)
    elif cmd == "dump":
        start = time.time()
        padids = listAllPads(apiurl, apikey)
        # skip_existing must be passed by keyword: positionally it would
        # land in dumpPads' sleeptime parameter and the flag would be lost.
        dumpPads(args.pubpath, args.grouppath, apiurl, apikey, padids, skip_existing=args.skip_existing)
        if verbose:
            print ("Completed in {0:0.0f} seconds".format(time.time()-start), file=sys.stderr)
    else:
        # List the command names the dispatch above actually accepts.
        print ("Command '{0}' not understood, try: listpads, listgroups, dump".format(args.command), file=sys.stderr)