Michael Murtaugh
10 years ago
commit
771d76f67c
3 changed files with 214 additions and 0 deletions
@ -0,0 +1,4 @@ |
|||||
|
venv/ |
||||
|
sites/ |
||||
|
*.pyc |
||||
|
*~ |
@ -0,0 +1,4 @@ |
|||||
|
etherdump |
||||
|
========= |
||||
|
|
||||
|
Tool to make archival dumps of etherpad pages. |
@ -0,0 +1,206 @@ |
|||||
|
#!/usr/bin/python |
||||
|
|
||||
|
from __future__ import print_function |
||||
|
import sys, argparse, json, re, os, time |
||||
|
from urllib2 import urlopen, HTTPError, URLError |
||||
|
import html5lib, urllib2, urllib |
||||
|
from urllib import urlencode |
||||
|
from urlparse import urljoin |
||||
|
|
||||
|
# Fallback connection settings; entries are overridden by the padinfo JSON
# file and then by explicit command-line options in the __main__ section.
PADINFO_DEFAULTS = dict(
    hostname="",
    apiversion="1.2.9",
    apiurl="/api/",
)

# Module-wide switch read by the API helpers to echo diagnostics to stderr.
verbose = False
||||
|
|
||||
|
def listAllPads(apiURL, apikey):
    """Return the pad IDs reported by the server's ``listAllPads`` API call.

    apiURL -- base API URL; the method name and query string are appended to it
    apikey -- API key sent as the ``apikey`` query parameter

    The requested URL is echoed to stderr when the module-level ``verbose``
    flag is set. The value returned is ``['data']['padIDs']`` of the decoded
    JSON response.
    """
    query = urlencode({'apikey': apikey})
    endpoint = apiURL + 'listAllPads?' + query
    if verbose:
        print(endpoint, file=sys.stderr)
    payload = json.load(urlopen(endpoint))
    return payload['data']['padIDs']
||||
|
|
||||
|
def listAllGroups(apiURL, apikey):
    """Return the group IDs reported by the server's ``listAllGroups`` API call.

    apiURL -- base API URL; the method name and query string are appended to it
    apikey -- API key sent as the ``apikey`` query parameter

    The requested URL is echoed to stderr when the module-level ``verbose``
    flag is set. The value returned is ``['data']['groupIDs']`` of the decoded
    JSON response.
    """
    query = urlencode({'apikey': apikey})
    endpoint = apiURL + 'listAllGroups?' + query
    if verbose:
        print(endpoint, file=sys.stderr)
    payload = json.load(urlopen(endpoint))
    return payload['data']['groupIDs']
||||
|
|
||||
|
def getPadText(padID, apiURL, apikey):
    """Fetch a pad's text through the ``getText`` API call.

    padID  -- identifier of the pad, sent as the ``padID`` query parameter
    apiURL -- base API URL; the method name and query string are appended to it
    apikey -- API key sent as the ``apikey`` query parameter

    Returns ``['data']['text']`` of the decoded JSON response.
    """
    query = urlencode({'apikey': apikey, 'padID': padID})
    payload = json.load(urlopen(apiURL + 'getText?' + query))
    return payload['data']['text']
||||
|
|
||||
|
def getPadHTML(padID, apiURL, apikey):
    """Fetch a pad's HTML rendering through the ``getHTML`` API call.

    padID  -- identifier of the pad, sent as the ``padID`` query parameter
    apiURL -- base API URL; the method name and query string are appended to it
    apikey -- API key sent as the ``apikey`` query parameter

    Returns ``['data']['html']`` of the decoded JSON response.
    """
    query = urlencode({'apikey': apikey, 'padID': padID})
    payload = json.load(urlopen(apiURL + 'getHTML?' + query))
    return payload['data']['html']
||||
|
|
||||
|
def getPadLastEdited (padID, apiURL, apikey):
    """Return the last-edited value of a pad via the ``getLastEdited`` API call.

    padID  -- identifier of the pad, sent as the ``padID`` query parameter
    apiURL -- base API URL; the method name and query string are appended to it
    apikey -- API key sent as the ``apikey`` query parameter

    Returns ``['data']['lastEdited']`` of the decoded JSON response.
    """
    # The original queried 'getHTML', whose response carries no 'lastEdited'
    # member, so the lookup below could never succeed.
    query = urlencode({'apikey': apikey, 'padID': padID})
    r = json.load(urlopen(apiURL + 'getLastEdited?' + query))
    return r['data']['lastEdited']
||||
|
|
||||
|
def pad_split_group(n):
    """Split a pad ID of the form ``g.<group>$<name>`` into its two parts.

    Returns the tuple ``(group, name)`` when *n* matches that pattern, and
    ``('', n)`` otherwise.
    """
    found = re.match(r"g\.(\w+)\$(.+)$", n)
    if found is None:
        # Not a group pad: empty group, whole ID is the pad name.
        return ('', n)
    return found.groups()
||||
|
|
||||
|
def _ensure_dir(path):
    """Create directory *path* (with parents) unless it already exists."""
    if not path or os.path.isdir(path):
        return
    try:
        os.makedirs(path)
    except OSError:
        # Tolerate only "someone created it meanwhile"; any other failure
        # (e.g. permissions) is re-raised instead of being silently dropped.
        if not os.path.isdir(path):
            raise


def _fetch_pad_field(apiurl, apikey, padid, method, field):
    """Call API *method* for *padid* and return ``['data'][field]`` of the reply."""
    query = urlencode({'apikey': apikey, 'padID': padid.encode("utf-8")})
    resp = json.load(urlopen(apiurl + method + "?" + query))
    return resp['data'][field]


def dumpPads (pub_path, group_path, apiurl, apikey, padids, sleeptime=0.01, skip_existing=False):
    """Write text, HTML and JSON metadata files for every pad in *padids*.

    pub_path      -- directory receiving pads without a group prefix
    group_path    -- directory receiving group pads, one sub-directory per group
    apiurl        -- base API URL; method names and query strings are appended
    apikey        -- API key sent with every request
    padids        -- iterable of pad IDs (split with ``pad_split_group``)
    sleeptime     -- seconds to sleep after each dumped pad; falsy disables it
    skip_existing -- when true, pads whose ``.json`` file already exists are
                     skipped

    For each pad the files ``<name>.utf8.txt``, ``<name>.utf8.html`` and
    ``<name>.json`` are produced. A request failing with TypeError, HTTPError
    or ValueError is reported on stderr and the pad's remaining files are
    still attempted.
    """
    # (API method, response field, file suffix, metadata key, label in warning)
    exports = (
        ("getText", "text", ".utf8.txt", "text_length", "text"),
        ("getHTML", "html", ".utf8.html", "html_length", "HTML"),
    )

    for padid in padids:
        group_id, pad_name = pad_split_group(padid)
        if group_id:
            target_dir = os.path.join(group_path, group_id)
        else:
            target_dir = pub_path
        _ensure_dir(target_dir)
        fp = os.path.join(target_dir, pad_name)

        if verbose:
            print (u"Saving to {0}".format(fp).encode("utf-8"), file=sys.stderr)

        if skip_existing and os.path.exists(fp+".json"):
            continue

        # Metadata written alongside the pad contents
        meta = {
            'padid': padid,
            'groupID': group_id,
            'padname': pad_name
        }
        try:
            meta['lastEdited'] = _fetch_pad_field(apiurl, apikey, padid, "getLastEdited", "lastEdited")
        except (TypeError, HTTPError, ValueError) as e:
            print(u"Warning: unable to load last edited time for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr)

        for method, field, suffix, meta_key, label in exports:
            # Fetch first and open the file only on success, so a failed
            # request cannot truncate a copy archived by an earlier run.
            try:
                content = _fetch_pad_field(apiurl, apikey, padid, method, field).encode("utf-8")
            except (TypeError, HTTPError, ValueError) as e:
                print(u"Warning: unable to load {0} for pad {1}, {2}".format(label, padid, e).encode("utf-8"), file=sys.stderr)
                continue
            # Content is already UTF-8 encoded bytes: write it in binary mode
            # so the size on disk equals the length recorded in the metadata.
            with open(fp + suffix, "wb") as f:
                f.write(content)
            meta[meta_key] = len(content)

        with open(fp+".json", "w") as f:
            f.write(json.dumps(meta))

        if sleeptime:
            time.sleep(sleeptime)
||||
|
|
||||
|
if __name__ == "__main__":
    # Command-line entry point: parse options, work out the API base URL from
    # defaults + padinfo file + explicit options, then run the chosen command.
    parser = argparse.ArgumentParser()

    parser.add_argument('command', default="", help='command to perform: listpads, listgroups, dump, createindex')

    parser.add_argument('--padinfo', default="padinfo.json", help='path to JSON file containing all pad login data, default padinfo.json, alternatively specify hostname, port, etc as separate options')
    parser.add_argument('--hostname', default="", help='the hostname of the etherpad server')
    parser.add_argument('--port', type=int, help='port of etherpad server')
    parser.add_argument('--apikey', help='API key')
    parser.add_argument('--apiversion', help='the version of the etherpad api')
    parser.add_argument('--apiurl', help='URL path to the API')

    parser.add_argument('--verbose', action="store_true", default=False, help='debug mode, verbose output')
    parser.add_argument('--pubpath', default="pub", help='path to dump public pads')
    parser.add_argument('--grouppath', default="priv", help='path to dump group pads')
    parser.add_argument('--human', default=False, action="store_true", help='output for reading')
    parser.add_argument('--skip-existing', default=False, action="store_true", help='skip existing files on dump')

    args = parser.parse_args()

    verbose = args.verbose
    # Work on a copy so the module-level defaults are never mutated.
    padinfo = dict(PADINFO_DEFAULTS)
    if args.padinfo:
        try:
            with open(args.padinfo) as f:
                for key, value in json.load(f).items():
                    padinfo[key] = value
        except IOError as e:
            print("WARNING: Tried to open {0}, but couldn't ({1})".format(args.padinfo, e), file=sys.stderr)
        except ValueError as e:
            # Goes to stderr like every other diagnostic; stdout is reserved
            # for the JSON / list output of the commands below.
            print("WARNING: Error reading {0}, check the JSON syntax ({1})".format(args.padinfo, e), file=sys.stderr)

    # allow explicit opts to override
    for option in ('hostname', 'port', 'apikey', 'apiversion', 'apiurl'):
        value = getattr(args, option)
        if value:
            padinfo[option] = value

    # Construct the base API URL
    apiurl = "http://" + padinfo.get("hostname")
    if padinfo.get("port"):
        apiurl += ":{0}".format(padinfo['port'])
    apiurl += "{0}{1}/".format(padinfo['apiurl'], padinfo['apiversion'])
    apikey = padinfo.get("apikey")

    if verbose:
        print ("Connecting to {0}".format(apiurl), file=sys.stderr)

    ###############################
    # Command Dispatch
    ###############################

    cmd = args.command.lower()
    if cmd == "listpads":
        padids = listAllPads(apiurl, apikey)
        if not args.human:
            json.dump(padids, sys.stdout)
        else:
            for padid in padids:
                print(padid)

    elif cmd == "listgroups":
        groupids = listAllGroups(apiurl, apikey)
        if not args.human:
            json.dump(groupids, sys.stdout)
        else:
            for gid in groupids:
                print(gid)

    elif cmd == "dump":
        start = time.time()
        padids = listAllPads(apiurl, apikey)
        # skip_existing must be passed by keyword: the sixth positional
        # parameter of dumpPads is sleeptime, not skip_existing.
        dumpPads(args.pubpath, args.grouppath, apiurl, apikey, padids, skip_existing=args.skip_existing)
        if verbose:
            print ("Completed in {0:0.0f} seconds".format(time.time()-start), file=sys.stderr)

    else:
        # Hint lists the command names this dispatch actually accepts.
        print ("Command '{0}' not understood, try: listpads, listgroups, dump".format(args.command), file=sys.stderr)
Loading…
Reference in new issue