initial commit
This commit is contained in:
commit
771d76f67c
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
venv/
|
||||||
|
sites/
|
||||||
|
*.pyc
|
||||||
|
*~
|
4
README.md
Normal file
4
README.md
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
etherdump
|
||||||
|
=========
|
||||||
|
|
||||||
|
Tool to make archival dumps of etherpad pages.
|
206
etherdump
Executable file
206
etherdump
Executable file
@ -0,0 +1,206 @@
|
|||||||
|
#!/usr/bin/python
|
||||||
|
|
||||||
|
from __future__ import print_function
|
||||||
|
import sys, argparse, json, re, os, time
|
||||||
|
from urllib2 import urlopen, HTTPError, URLError
|
||||||
|
import html5lib, urllib2, urllib
|
||||||
|
from urllib import urlencode
|
||||||
|
from urlparse import urljoin
|
||||||
|
|
||||||
|
# Fallback connection settings; values are copied into `padinfo` in the
# __main__ block and may be overridden by padinfo.json and/or CLI options.
PADINFO_DEFAULTS = {
    "hostname": "",
    "apiversion": "1.2.9",
    "apiurl": "/api/"
}

# Global debug flag; set from --verbose in the __main__ block.
verbose = False
|
||||||
|
|
||||||
|
def listAllPads (apiURL, apikey):
    """Return the list of pad IDs on the server (listAllPads API call)."""
    url = apiURL + 'listAllPads?' + urlencode({'apikey': apikey})
    if verbose:
        print (url, file=sys.stderr)
    response = json.load(urlopen(url))
    return response['data']['padIDs']
|
||||||
|
|
||||||
|
def listAllGroups (apiURL, apikey):
    """Return the list of group IDs on the server (listAllGroups API call)."""
    url = apiURL + 'listAllGroups?' + urlencode({'apikey': apikey})
    if verbose:
        print (url, file=sys.stderr)
    response = json.load(urlopen(url))
    return response['data']['groupIDs']
|
||||||
|
|
||||||
|
def getPadText (padID, apiURL, apikey):
    """Return the plain-text contents of a pad (getText API call)."""
    query = urlencode({'apikey': apikey, 'padID': padID})
    response = json.load(urlopen(apiURL + 'getText?' + query))
    return response['data']['text']
|
||||||
|
|
||||||
|
def getPadHTML (padID, apiURL, apikey):
    """Return the HTML contents of a pad (getHTML API call)."""
    query = urlencode({'apikey': apikey, 'padID': padID})
    response = json.load(urlopen(apiURL + 'getHTML?' + query))
    return response['data']['html']
|
||||||
|
|
||||||
|
def getPadLastEdited (padID, apiURL, apikey):
    """Return the lastEdited timestamp of a pad (getLastEdited API call).

    Fix: the original queried the getHTML endpoint, whose response carries
    only 'html' and no 'lastEdited' key, so this always raised KeyError.
    Use the getLastEdited endpoint (as the dump code already does).
    """
    data = {'apikey': apikey, 'padID': padID}
    r = json.load(urlopen(apiURL + 'getLastEdited?' + urlencode(data)))
    return r['data']['lastEdited']
|
||||||
|
|
||||||
|
def pad_split_group (n):
    """Split a pad ID of the form "g.<groupID>$<padname>" into
    (groupID, padname); for a public (ungrouped) pad ID return ('', n).
    """
    match = re.match(r"g\.(\w+)\$(.+)$", n)
    if match is None:
        return ('', n)
    return match.groups()
|
||||||
|
|
||||||
|
def dumpPads (pub_path, group_path, apiurl, apikey, padids, sleeptime=0.01, skip_existing=False):
    """Download text, HTML and metadata for every pad in padids.

    Public pads are written under pub_path, group pads under
    group_path/<groupID>/.  Each pad produces three files:
    <name>.utf8.txt, <name>.utf8.html and <name>.json (metadata).

    pub_path      -- directory for public pads (created on demand)
    group_path    -- directory for group pads (created on demand)
    apiurl        -- base API URL, e.g. http://host:port/api/1.2.9/
    apikey        -- Etherpad API key
    padids        -- iterable of pad IDs as returned by listAllPads
    sleeptime     -- seconds to pause between pads (0/None disables)
    skip_existing -- skip pads whose .json metadata file already exists
    """

    def mkdir_quiet (path):
        # Best-effort mkdir: ignore OSError (typically "already exists"),
        # matching the original try/except-pass behaviour.
        try:
            os.mkdir(path)
        except OSError:
            pass

    for padid in padids:
        group_id, pad_name = pad_split_group(padid)
        if group_id:
            mkdir_quiet(group_path)
            mkdir_quiet(os.path.join(group_path, group_id))
            fp = os.path.join(group_path, group_id, pad_name)
        else:
            mkdir_quiet(pub_path)
            fp = os.path.join(pub_path, pad_name)

        if verbose:
            print (u"Saving to {0}".format(fp).encode("utf-8"), file=sys.stderr)

        # The .json file is written last, so its presence marks a
        # previously completed dump of this pad.
        if skip_existing:
            if os.path.exists(fp+".json"):
                continue

        # Write Metadata
        meta = {
            'padid': padid,
            'groupID': group_id,
            'padname': pad_name
        }
        url = apiurl + "getLastEdited?" + urlencode({'apikey':apikey, 'padID':padid.encode("utf-8")})
        try:
            resp = json.load(urlopen(url))
            meta['lastEdited'] = resp['data']['lastEdited']
        except (TypeError, HTTPError, ValueError) as e:
            # Fix: this warning previously said "unable to load text"
            # (copy-paste from the getText handler below).
            print(u"Warning: unable to load lastEdited for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr)

        # Write Text
        with open(fp+".utf8.txt", "w") as f:
            url = apiurl + "getText?" + urlencode({'apikey':apikey, 'padID':padid.encode("utf-8")})
            try:
                resp = json.load(urlopen(url))
                text = resp['data']['text'].encode("utf-8")
                f.write(text)
                meta['text_length'] = len(text)
            except (TypeError, HTTPError, ValueError) as e:
                print(u"Warning: unable to load text for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr)

        # Write HTML
        with open(fp+".utf8.html", "w") as f:
            url = apiurl + "getHTML?" + urlencode({'apikey':apikey, 'padID':padid.encode("utf-8")})
            try:
                resp = json.load(urlopen(url))
                text = resp['data']['html'].encode("utf-8")
                f.write(text)
                meta['html_length'] = len(text)
            except (TypeError, HTTPError, ValueError) as e:
                print(u"Warning: unable to load HTML for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr)

        # Metadata last: acts as the completion marker checked by
        # skip_existing above.
        with open(fp+".json", "w") as f:
            f.write(json.dumps(meta))

        if sleeptime:
            time.sleep(sleeptime)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Fix: help no longer advertises "createindex", which has no handler
    # in the dispatch below.
    parser.add_argument('command', default="", help='command to perform: listpads, listgroups, dump')

    parser.add_argument('--padinfo', default="padinfo.json", help='path to JSON file containing all pad login data, default padinfo.json, alternatively specify hostname, port, etc as separate options')
    parser.add_argument('--hostname', default="", help='the hostname of the etherpad server')
    parser.add_argument('--port', type=int, help='port of etherpad server')
    parser.add_argument('--apikey', help='API key')
    parser.add_argument('--apiversion', help='the version of the etherpad api')
    parser.add_argument('--apiurl', help='URL path to the API')

    parser.add_argument('--verbose', action="store_true", default=False, help='debug mode, verbose output')
    parser.add_argument('--pubpath', default="pub", help='path to dump public pads')
    parser.add_argument('--grouppath', default="priv", help='path to dump group pads')
    parser.add_argument('--human', default=False, action="store_true", help='output for reading')
    parser.add_argument('--skip-existing', default=False, action="store_true", help='skip existing files on dump')

    args = parser.parse_args()

    verbose = args.verbose

    # Settings precedence: defaults < padinfo.json < explicit CLI options.
    padinfo = PADINFO_DEFAULTS
    if args.padinfo:
        try:
            with open(args.padinfo) as f:
                for key, value in json.load(f).items():
                    padinfo[key] = value
        # Fix: "except X, e" is Python-2-only syntax; "as" works on both
        # 2.6+ and 3 (the file already requires 2.6+ for print_function).
        except IOError as e:
            print("WARNING: Tried to open {0}, but couldn't ({1})".format(args.padinfo, e), file=sys.stderr)
        except ValueError as e:
            print("WARNING: Error reading {0}, check the JSON syntax ({1})".format(args.padinfo, e))

    # allow explicit opts to override
    if args.hostname:
        padinfo['hostname'] = args.hostname
    if args.port:
        padinfo['port'] = args.port
    if args.apikey:
        padinfo['apikey'] = args.apikey
    if args.apiversion:
        padinfo['apiversion'] = args.apiversion
    if args.apiurl:
        padinfo['apiurl'] = args.apiurl

    # Construct the base API URL, e.g. http://host:port/api/1.2.9/
    apiurl = "http://" + padinfo.get("hostname")
    if padinfo.get("port"):
        apiurl += ":{0}".format(padinfo['port'])
    apiurl += "{0}{1}/".format(padinfo['apiurl'], padinfo['apiversion'])
    apikey = padinfo.get("apikey")

    if verbose:
        print ("Connecting to {0}".format(apiurl), file=sys.stderr)

    ###############################
    # Command Dispatch
    ###############################

    cmd = args.command.lower()
    if cmd == "listpads":
        padids = listAllPads(apiurl, apikey)
        if not args.human:
            json.dump(padids, sys.stdout)
        else:
            for padid in padids:
                print(padid)

    elif cmd == "listgroups":
        groupids = listAllGroups(apiurl, apikey)
        if not args.human:
            json.dump(groupids, sys.stdout)
        else:
            for gid in groupids:
                print(gid)

    elif cmd == "dump":
        start = time.time()
        padids = listAllPads(apiurl, apikey)
        # Fix: args.skip_existing was previously passed positionally into
        # the sleeptime parameter, so --skip-existing was silently ignored
        # (and toggled the inter-pad sleep instead).  Pass it by keyword.
        dumpPads(args.pubpath, args.grouppath, apiurl, apikey, padids, skip_existing=args.skip_existing)
        if verbose:
            print ("Completed in {0:0.0f} seconds".format(time.time()-start), file=sys.stderr)

    else:
        # Fix: previously suggested "listallpads, listallgroups,
        # dumpallpads", none of which exist in the dispatch above.
        print ("Command '{0}' not understood, try: listpads, listgroups, dump".format(args.command), file=sys.stderr)
|
Loading…
Reference in New Issue
Block a user