initial commit
This commit is contained in:
commit
771d76f67c
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
@ -0,0 +1,4 @@
|
||||
venv/
|
||||
sites/
|
||||
*.pyc
|
||||
*~
|
4
README.md
Normal file
4
README.md
Normal file
@ -0,0 +1,4 @@
|
||||
etherdump
|
||||
=========
|
||||
|
||||
Tool to make archival dumps of etherpad pages.
|
206
etherdump
Executable file
206
etherdump
Executable file
@ -0,0 +1,206 @@
|
||||
#!/usr/bin/python
|
||||
|
||||
from __future__ import print_function
|
||||
import sys, argparse, json, re, os, time
|
||||
from urllib2 import urlopen, HTTPError, URLError
|
||||
import html5lib, urllib2, urllib
|
||||
from urllib import urlencode
|
||||
from urlparse import urljoin
|
||||
|
||||
# Fallback connection settings, overridden by padinfo.json and/or CLI options.
PADINFO_DEFAULTS = {
    "hostname": "",
    "apiversion": "1.2.9",
    "apiurl": "/api/",
}

# Global debug flag; set from --verbose in the __main__ block.
verbose = False
|
||||
|
||||
def listAllPads (apiURL, apikey):
    """Query the etherpad listAllPads API and return the list of pad IDs."""
    query = urlencode({'apikey': apikey})
    url = apiURL + 'listAllPads?' + query
    if verbose:
        print (url, file=sys.stderr)
    response = json.load(urlopen(url))
    return response['data']['padIDs']
|
||||
|
||||
def listAllGroups (apiURL, apikey):
    """Query the etherpad listAllGroups API and return the list of group IDs."""
    query = urlencode({'apikey': apikey})
    url = apiURL + 'listAllGroups?' + query
    if verbose:
        print (url, file=sys.stderr)
    response = json.load(urlopen(url))
    return response['data']['groupIDs']
|
||||
|
||||
def getPadText (padID, apiURL, apikey):
    """Fetch and return the plain-text contents of a single pad."""
    params = urlencode({'apikey': apikey, 'padID': padID})
    response = json.load(urlopen(apiURL + 'getText?' + params))
    return response['data']['text']
|
||||
|
||||
def getPadHTML (padID, apiURL, apikey):
    """Fetch and return the HTML contents of a single pad."""
    params = urlencode({'apikey': apikey, 'padID': padID})
    response = json.load(urlopen(apiURL + 'getHTML?' + params))
    return response['data']['html']
|
||||
|
||||
def getPadLastEdited (padID, apiURL, apikey):
    """Return the lastEdited timestamp (ms since epoch) for a pad.

    Bug fix: the original called the getHTML endpoint, whose response
    carries no 'lastEdited' field, so this function always raised
    KeyError.  The correct endpoint is getLastEdited (compare the URL
    built inside dumpPads, which uses it correctly).
    """
    data = {'apikey': apikey, 'padID': padID}
    r = json.load(urlopen(apiURL + 'getLastEdited?' + urlencode(data)))
    return r['data']['lastEdited']
|
||||
|
||||
def pad_split_group (n):
    """Split a pad ID into a (group_id, pad_name) pair.

    Group pads are named "g.<groupid>$<padname>"; for those the group ID
    and name are returned separately.  Public pads have no group prefix
    and come back with an empty group ID.
    """
    match = re.match(r"g\.(\w+)\$(.+)$", n)
    if match is None:
        return ('', n)
    return match.groups()
|
||||
|
||||
def dumpPads (pub_path, group_path, apiurl, apikey, padids, sleeptime=0.01, skip_existing=False):
    """Dump every pad in padids to disk.

    For each pad, three sibling files are written:
      <name>.utf8.txt   plain-text contents (getText)
      <name>.utf8.html  HTML contents (getHTML)
      <name>.json       metadata: padid, groupID, padname, lastEdited,
                        text_length, html_length

    Group pads ("g.<id>$<name>") land in group_path/<id>/, public pads in
    pub_path; directories are created on demand.  Fetch failures are
    warnings, not fatal: the metadata file is still written with whatever
    fields succeeded.

    sleeptime: seconds to pause between pads (politeness to the server).
    skip_existing: skip any pad whose .json metadata file already exists.
    """
    for padid in padids:
        group_id, pad_name = pad_split_group(padid)
        if group_id:
            # Group pad: make sure group_path/<group_id>/ exists.
            try:
                os.mkdir(group_path)
            except OSError:
                pass  # already exists
            try:
                os.mkdir(os.path.join(group_path, group_id))
            except OSError:
                pass
            fp = os.path.join(group_path, group_id, pad_name)
        else:
            try:
                os.mkdir(pub_path)
            except OSError:
                pass
            fp = os.path.join(pub_path, pad_name)

        if verbose:
            print (u"Saving to {0}".format(fp).encode("utf-8"), file=sys.stderr)

        if skip_existing:
            if os.path.exists(fp+".json"):
                continue

        # Write Metadata
        meta = {
            'padid': padid,
            'groupID': group_id,
            'padname': pad_name
        }
        url = apiurl + "getLastEdited?" + urlencode({'apikey':apikey, 'padID':padid.encode("utf-8")})
        try:
            resp = json.load(urlopen(url))
            meta['lastEdited'] = resp['data']['lastEdited']
        except (TypeError, HTTPError, ValueError) as e:
            # Bug fix: the original warning said "unable to load text" here,
            # copy-pasted from the text section below; this fetch is for
            # the lastEdited timestamp.
            print(u"Warning: unable to load lastEdited for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr)

        # Write Text
        with open(fp+".utf8.txt", "w") as f:
            url = apiurl + "getText?" + urlencode({'apikey':apikey, 'padID':padid.encode("utf-8")})
            try:
                resp = json.load(urlopen(url))
                text = resp['data']['text'].encode("utf-8")
                f.write(text)
                meta['text_length'] = len(text)

            except (TypeError, HTTPError, ValueError) as e:
                print(u"Warning: unable to load text for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr)

        # Write HTML
        with open(fp+ ".utf8.html", "w") as f:
            url = apiurl + "getHTML?" + urlencode({'apikey':apikey, 'padID':padid.encode("utf-8")})
            try:
                resp = json.load(urlopen(url))
                text = resp['data']['html'].encode("utf-8")
                f.write(text)
                meta['html_length'] = len(text)

            except (TypeError, HTTPError, ValueError) as e:
                print(u"Warning: unable to load HTML for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr)

        # Write the metadata last so skip_existing only skips pads whose
        # dump actually completed.
        with open(fp+".json", "w") as f:
            f.write(json.dumps(meta))

        if sleeptime:
            time.sleep(sleeptime)
|
||||
|
||||
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument('command', default="", help='command to perform: listpads, listgroups, dump, createindex')

    parser.add_argument('--padinfo', default="padinfo.json", help='path to JSON file containing all pad login data, default padinfo.json, alternatively specify hostname, port, etc as separate options')
    parser.add_argument('--hostname', default="", help='the hostname of the etherpad server')
    parser.add_argument('--port', type=int, help='port of etherpad server')
    parser.add_argument('--apikey', help='API key')
    parser.add_argument('--apiversion', help='the version of the etherpad api')
    parser.add_argument('--apiurl', help='URL path to the API')

    parser.add_argument('--verbose', action="store_true", default=False, help='debug mode, verbose output')
    parser.add_argument('--pubpath', default="pub", help='path to dump public pads')
    parser.add_argument('--grouppath', default="priv", help='path to dump group pads')
    parser.add_argument('--human', default=False, action="store_true", help='output for reading')
    parser.add_argument('--skip-existing', default=False, action="store_true", help='skip existing files on dump')

    args = parser.parse_args()

    verbose = args.verbose
    # Copy the defaults so we never mutate the module-level constant
    # (the original aliased it and wrote through the alias).
    padinfo = dict(PADINFO_DEFAULTS)
    if args.padinfo:
        try:
            with open(args.padinfo) as f:
                for key, value in json.load(f).items():
                    padinfo[key] = value
        # Bug fix: "except IOError, e" is Python-2-only syntax and was
        # inconsistent with the "as e" form used elsewhere in this file.
        except IOError as e:
            print("WARNING: Tried to open {0}, but couldn't ({1})".format(args.padinfo, e), file=sys.stderr)
        except ValueError as e:
            print("WARNING: Error reading {0}, check the JSON syntax ({1})".format(args.padinfo, e))

    # allow explicit opts to override
    if args.hostname:
        padinfo['hostname'] = args.hostname
    if args.port:
        padinfo['port'] = args.port
    if args.apikey:
        padinfo['apikey'] = args.apikey
    if args.apiversion:
        padinfo['apiversion'] = args.apiversion
    if args.apiurl:
        padinfo['apiurl'] = args.apiurl

    # Construct the base API URL, e.g. http://host:port/api/1.2.9/
    apiurl = "http://" + padinfo.get("hostname")
    if padinfo.get("port"):
        apiurl += ":{0}".format(padinfo['port'])
    apiurl += "{0}{1}/".format(padinfo['apiurl'], padinfo['apiversion'])
    apikey = padinfo.get("apikey")

    if verbose:
        print ("Connecting to {0}".format(apiurl), file=sys.stderr)

    ###############################
    # Command Dispatch
    ###############################

    cmd = args.command.lower()
    if cmd == "listpads":
        padids = listAllPads(apiurl, apikey)
        if not args.human:
            json.dump(padids, sys.stdout)
        else:
            for padid in padids:
                print(padid)

    elif cmd == "listgroups":
        groupids = listAllGroups(apiurl, apikey)
        if not args.human:
            json.dump(groupids, sys.stdout)
        else:
            for gid in groupids:
                print(gid)

    elif cmd == "dump":
        start = time.time()
        padids = listAllPads(apiurl, apikey)
        # Bug fix: the original passed args.skip_existing positionally,
        # where it landed in the sleeptime parameter -- so --skip-existing
        # never took effect and the inter-pad sleep was clobbered.
        dumpPads(args.pubpath, args.grouppath, apiurl, apikey, padids, skip_existing=args.skip_existing)
        if verbose:
            print ("Completed in {0:0.0f} seconds".format(time.time()-start), file=sys.stderr)

    else:
        # Bug fix: the suggestion list named commands that don't exist
        # (listallpads/listallgroups/dumpallpads); these are the real ones.
        print ("Command '{0}' not understood, try: listpads, listgroups, dump".format(args.command), file=sys.stderr)
|
Loading…
Reference in New Issue
Block a user