Michael Murtaugh
10 years ago
commit
771d76f67c
3 changed files with 214 additions and 0 deletions
@@ -0,0 +1,4 @@
|||
venv/ |
|||
sites/ |
|||
*.pyc |
|||
*~ |
@@ -0,0 +1,4 @@
|||
etherdump |
|||
========= |
|||
|
|||
Tool to make archival dumps of etherpad pages. |
@@ -0,0 +1,206 @@
|||
#!/usr/bin/python |
|||
|
|||
from __future__ import print_function |
|||
import sys, argparse, json, re, os, time |
|||
from urllib2 import urlopen, HTTPError, URLError |
|||
import html5lib, urllib2, urllib |
|||
from urllib import urlencode |
|||
from urlparse import urljoin |
|||
|
|||
# Default connection settings; overlaid by padinfo.json and then by
# explicit command-line options in __main__.
PADINFO_DEFAULTS = {
    "hostname": "",
    "apiversion": "1.2.9",
    "apiurl": "/api/"
}
# Module-level verbosity flag; rebound from args.verbose in __main__ and
# read by listAllPads/listAllGroups/dumpPads.
verbose = False
|||
|
|||
def listAllPads (apiURL, apikey):
    """Return the list of all pad IDs known to the etherpad server.

    apiURL is the base API URL (ending in a slash); apikey authenticates
    the request. Raises on HTTP or JSON errors.
    """
    query = urlencode({'apikey': apikey})
    url = apiURL + 'listAllPads?' + query
    if verbose:
        print (url, file=sys.stderr)
    response = json.load(urlopen(url))
    return response['data']['padIDs']
|||
|
|||
def listAllGroups (apiURL, apikey):
    """Return the list of all group IDs known to the etherpad server.

    Mirrors listAllPads but hits the listAllGroups endpoint.
    """
    query = urlencode({'apikey': apikey})
    url = apiURL + 'listAllGroups?' + query
    if verbose:
        print (url, file=sys.stderr)
    response = json.load(urlopen(url))
    return response['data']['groupIDs']
|||
|
|||
def getPadText (padID, apiURL, apikey):
    """Fetch the plain-text contents of one pad via the getText API call."""
    query = urlencode({'apikey': apikey, 'padID': padID})
    response = json.load(urlopen(apiURL + 'getText?' + query))
    return response['data']['text']
|||
|
|||
def getPadHTML (padID, apiURL, apikey):
    """Fetch the HTML rendering of one pad via the getHTML API call."""
    query = urlencode({'apikey': apikey, 'padID': padID})
    response = json.load(urlopen(apiURL + 'getHTML?' + query))
    return response['data']['html']
|||
|
|||
def getPadLastEdited (padID, apiURL, apikey):
    """Return the pad's lastEdited value as reported by the etherpad API.

    BUG FIX: the original queried the getHTML endpoint and then read
    ['data']['lastEdited'], a key the getHTML response does not carry,
    so this always raised KeyError. Query the getLastEdited endpoint
    instead, matching what dumpPads does inline.
    """
    data = {'apikey': apikey, 'padID': padID}
    r = json.load(urlopen(apiURL + 'getLastEdited?' + urlencode(data)))
    return r['data']['lastEdited']
|||
|
|||
def pad_split_group (n):
    """Split a pad ID into (group_id, pad_name).

    Group pads look like "g.<groupid>$<name>"; for those the two parts
    are returned. For public pads the group id is the empty string and
    the name is the full ID unchanged.
    """
    match = re.match(r"g\.(\w+)\$(.+)$", n)
    if match is None:
        return ('', n)
    return match.groups()
|||
|
|||
def dumpPads (pub_path, group_path, apiurl, apikey, padids, sleeptime=0.01, skip_existing=False):
    """Download text, HTML and metadata for every pad in padids.

    Public pads are written under pub_path; group pads under
    group_path/<group_id>/. For each pad three files are written:
    <name>.utf8.txt, <name>.utf8.html and <name>.json (metadata).

    pub_path: directory for public pads (created on demand)
    group_path: directory for group pads (created on demand)
    apiurl: base API URL, e.g. "http://host:port/api/1.2.9/"
    apikey: etherpad API key
    padids: iterable of pad IDs, possibly in "g.<group>$<name>" form
    sleeptime: seconds to pause between pads (throttle); falsy disables
    skip_existing: skip pads whose .json metadata file already exists
    """
    for padid in padids:
        group_id, pad_name = pad_split_group(padid)
        if group_id:
            _mkdir_quiet(group_path)
            _mkdir_quiet(os.path.join(group_path, group_id))
            fp = os.path.join(group_path, group_id, pad_name)
        else:
            _mkdir_quiet(pub_path)
            fp = os.path.join(pub_path, pad_name)

        if verbose:
            print (u"Saving to {0}".format(fp).encode("utf-8"), file=sys.stderr)

        if skip_existing:
            if os.path.exists(fp+".json"):
                continue

        # Metadata: pad identity plus the last-edited value from the API.
        meta = {
            'padid': padid,
            'groupID': group_id,
            'padname': pad_name
        }
        url = apiurl + "getLastEdited?" + urlencode({'apikey':apikey, 'padID':padid.encode("utf-8")})
        try:
            resp = json.load(urlopen(url))
            meta['lastEdited'] = resp['data']['lastEdited']
        except (TypeError, HTTPError, ValueError) as e:
            # BUG FIX: the original message said "unable to load text",
            # but this request is for the last-edited time.
            print(u"Warning: unable to load last edited time for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr)

        # Plain-text dump, written as UTF-8 encoded bytes (Python 2 pattern).
        with open(fp+".utf8.txt", "w") as f:
            url = apiurl + "getText?" + urlencode({'apikey':apikey, 'padID':padid.encode("utf-8")})
            try:
                resp = json.load(urlopen(url))
                text = resp['data']['text'].encode("utf-8")
                f.write(text)
                meta['text_length'] = len(text)
            except (TypeError, HTTPError, ValueError) as e:
                print(u"Warning: unable to load text for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr)

        # HTML dump, same best-effort handling as the text dump.
        with open(fp+".utf8.html", "w") as f:
            url = apiurl + "getHTML?" + urlencode({'apikey':apikey, 'padID':padid.encode("utf-8")})
            try:
                resp = json.load(urlopen(url))
                text = resp['data']['html'].encode("utf-8")
                f.write(text)
                meta['html_length'] = len(text)
            except (TypeError, HTTPError, ValueError) as e:
                print(u"Warning: unable to load HTML for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr)

        # Metadata written last, so its presence marks a completed dump
        # (this is what skip_existing checks for).
        with open(fp+".json", "w") as f:
            f.write(json.dumps(meta))

        if sleeptime:
            time.sleep(sleeptime)


def _mkdir_quiet (path):
    # Best-effort single-level mkdir; swallow OSError (typically "already
    # exists"), matching the original inline try/except-pass behaviour.
    try:
        os.mkdir(path)
    except OSError:
        pass
|||
|
|||
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument('command', default="", help='command to perform: listpads, listgroups, dump, createindex')

    parser.add_argument('--padinfo', default="padinfo.json", help='path to JSON file containing all pad login data, default padinfo.json, alternatively specify hostname, port, etc as separate options')
    parser.add_argument('--hostname', default="", help='the hostname of the etherpad server')
    parser.add_argument('--port', type=int, help='port of etherpad server')
    parser.add_argument('--apikey', help='API key')
    parser.add_argument('--apiversion', help='the version of the etherpad api')
    parser.add_argument('--apiurl', help='URL path to the API')

    parser.add_argument('--verbose', action="store_true", default=False, help='debug mode, verbose output')
    parser.add_argument('--pubpath', default="pub", help='path to dump public pads')
    parser.add_argument('--grouppath', default="priv", help='path to dump group pads')
    parser.add_argument('--human', default=False, action="store_true", help='output for reading')
    parser.add_argument('--skip-existing', default=False, action="store_true", help='skip existing files on dump')

    args = parser.parse_args()

    verbose = args.verbose

    # Settings resolution: built-in defaults, overlaid by the padinfo JSON
    # file (when readable), overlaid by explicit command-line options.
    padinfo = PADINFO_DEFAULTS
    if args.padinfo:
        try:
            with open(args.padinfo) as f:
                for key, value in json.load(f).items():
                    padinfo[key] = value
        # FIX: "except IOError, e" is Python-2-only syntax; "as" works on
        # Python 2.6+ as well.
        except IOError as e:
            print("WARNING: Tried to open {0}, but couldn't ({1})".format(args.padinfo, e), file=sys.stderr)
        except ValueError as e:
            # consistency fix: warnings go to stderr (this one went to stdout)
            print("WARNING: Error reading {0}, check the JSON syntax ({1})".format(args.padinfo, e), file=sys.stderr)

    # allow explicit opts to override
    if args.hostname:
        padinfo['hostname'] = args.hostname
    if args.port:
        padinfo['port'] = args.port
    if args.apikey:
        padinfo['apikey'] = args.apikey
    if args.apiversion:
        padinfo['apiversion'] = args.apiversion
    if args.apiurl:
        padinfo['apiurl'] = args.apiurl

    # Construct the base API URL, e.g. http://host:port/api/1.2.9/
    apiurl = "http://" + padinfo.get("hostname")
    if padinfo.get("port"):
        apiurl += ":{0}".format(padinfo['port'])
    apiurl += "{0}{1}/".format(padinfo['apiurl'], padinfo['apiversion'])
    apikey = padinfo.get("apikey")

    if verbose:
        print ("Connecting to {0}".format(apiurl), file=sys.stderr)

    ###############################
    # Command Dispatch
    ###############################

    cmd = args.command.lower()
    if cmd == "listpads":
        padids = listAllPads(apiurl, apikey)
        if not args.human:
            json.dump(padids, sys.stdout)
        else:
            for padid in padids:
                print(padid)

    elif cmd == "listgroups":
        groupids = listAllGroups(apiurl, apikey)
        if not args.human:
            json.dump(groupids, sys.stdout)
        else:
            for gid in groupids:
                print(gid)

    elif cmd == "dump":
        start = time.time()
        padids = listAllPads(apiurl, apikey)
        # BUG FIX: args.skip_existing was passed positionally and landed in
        # dumpPads' sleeptime parameter, so --skip-existing never skipped
        # anything and made the loop sleep 1s per pad; pass it by keyword.
        dumpPads(args.pubpath, args.grouppath, apiurl, apikey, padids, skip_existing=args.skip_existing)
        if verbose:
            print ("Completed in {0:0.0f} seconds".format(time.time()-start), file=sys.stderr)

    else:
        # BUG FIX: the original hint listed commands that don't exist
        # (listallpads/listallgroups/dumpallpads); name the real ones.
        print ("Command '{0}' not understood, try: listpads, listgroups, dump".format(args.command), file=sys.stderr)
Loading…
Reference in new issue