Pumping pads as files into publishing frameworks!
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

363 lines
12 KiB

#!/usr/bin/env python
10 years ago
from __future__ import print_function
import sys, argparse, json, re, os, time
from urllib2 import urlopen, HTTPError, URLError
import html5lib, urllib2, urllib
from xml.etree import ElementTree as ET
10 years ago
from urllib import urlencode
from urlparse import urljoin
from datetime import datetime
10 years ago
# Baseline connection settings; the __main__ section layers the padinfo JSON
# file and then explicit command-line options on top of these values.
PADINFO_DEFAULTS = dict(
    hostname="",
    apiversion="1.2.9",
    apiurl="/api/",
)

# Module-wide switch for progress output on stderr; set from --verbose when
# the file is run as a script and read by dumpPads.
verbose = False
10 years ago
def pad_split_group (n):
    """Split a pad id into a (group_id, pad_name) pair.

    Ids of the form ``g.<group>$<name>`` yield ``(<group>, <name>)``; any
    other id is returned whole as the name with an empty group:
    ``('', n)``.
    """
    matched = re.match(r"g\.(\w+)\$(.+)$", n)
    if not matched:
        return ('', n)
    return matched.groups()
def content(tag):
    """Return the inner markup of *tag* as a text string.

    The result is the element's leading text (if any) followed by the
    serialization of each child element; ``ET.tostring`` includes each
    child's tail text, so text between children is preserved.

    ``ET.tostring`` is called with its default encoding, which emits pure
    ASCII (non-ASCII characters become numeric character references), so the
    output is decoded explicitly instead of relying on implicit byte/text
    coercion when joining.
    """
    children = u''.join(ET.tostring(child).decode("ascii") for child in tag)
    if tag.text is None:
        return children
    return tag.text + children
class PadServer (object):
    """Minimal client for an Etherpad server's HTTP API.

    The API base URL is assembled once in the constructor as
    ``<protocol>://<hostname>[:<port>]<apipath><apiversion>/`` and every
    request is a GET carrying the API key as a query parameter.
    """

    def __init__ (self, hostname, port=9001, apipath="/api/", apiversion="1.2.9", apikey=None, secure=False):
        """Store connection settings and build ``self.apiurl``.

        A falsy *port* (``None`` or ``0``) leaves the port out of the URL;
        *secure* selects https instead of http.
        """
        self.hostname = hostname
        self.protocol = "https" if secure else "http"
        self.apiurl = self.protocol+"://"+hostname
        if port:
            self.apiurl += ":{0}".format(port)
        self.apiurl += "{0}{1}/".format(apipath, apiversion)
        self.apikey = apikey

    def _call (self, function, **params):
        """Call API *function* with *params* and return the reply's 'data' member.

        The HTTP response is always closed, even when reading or JSON
        decoding fails, so connections are not leaked across the many
        requests a dump performs.
        """
        params['apikey'] = self.apikey
        response = urlopen(self.apiurl + function + '?' + urlencode(params))
        try:
            return json.load(response)['data']
        finally:
            response.close()

    def listAllPads (self):
        """Return the list of pad ids reported by the server."""
        return self._call('listAllPads')['padIDs']

    def listAllGroups (self):
        """Return the list of group ids reported by the server."""
        return self._call('listAllGroups')['groupIDs']

    def getPadText (self, padID):
        """Return the text of pad *padID*."""
        return self._call('getText', padID=padID)['text']

    def getPadHTML (self, padID):
        """Return the HTML of pad *padID*."""
        return self._call('getHTML', padID=padID)['html']

    def getPadLastEdited (self, padID):
        """Return the last-edited time of pad *padID* as a local-time datetime."""
        raw = self._call('getLastEdited', padID=padID)['lastEdited']
        # The raw value is divided by 1000 before conversion (presumably it
        # is in milliseconds); floor division keeps whole seconds on every
        # Python version.
        return datetime.fromtimestamp(int(raw) // 1000)

    def getPadURL (self, padID):
        """Return the browser URL for pad *padID*.

        Group pads are addressed under ``/p/``, all others under
        ``/public_pad/``; the port is not included in either form.
        """
        group, name = pad_split_group(padID)
        if group:
            return self.protocol+"://"+self.hostname+"/p/"+padID
        else:
            return self.protocol+"://"+self.hostname+"/public_pad/"+padID
def dumpPads (padserver, padids, pub_path, group_path, sleeptime=0.01, skip_existing=False, template=None):
    """Write each pad in *padids* to disk as .txt, .html and .json files.

    Group pads (ids that pad_split_group resolves to a group) are written to
    ``<group_path>/<group_id>/<pad_name>.*``; all other pads are written to
    ``<pub_path>/<pad_name>.*``.

    padserver     -- object providing getPadText, getPadHTML,
                     getPadLastEdited and getPadURL
    sleeptime     -- seconds to pause after each pad; falsy disables the pause
    skip_existing -- when true, pads whose .json file already exists are skipped
    template      -- optional path to a UTF-8 Jinja2 template; when given, the
                     .html file is the rendered template instead of the raw
                     pad HTML

    A failure to fetch a pad's text is reported on stderr and the dump
    continues; a failure to fetch its HTML is not caught and propagates to
    the caller.
    """
    if template is not None:
        import jinja2
        with open(template, "rb") as f:
            template = jinja2.Template(f.read().decode("utf-8"))

    for padid in padids:
        group_id, pad_name = pad_split_group(padid)
        if group_id:
            target_dir = os.path.join(group_path, group_id)
        else:
            target_dir = pub_path
        _ensure_dir(target_dir)
        fp = os.path.join(target_dir, pad_name)

        if verbose:
            print (u"Saving to {0}".format(fp).encode("utf-8"), file=sys.stderr)

        # The .json file is written last, so its presence marks a pad that
        # was dumped completely.
        if skip_existing and os.path.exists(fp+".json"):
            continue

        # Metadata, extended below and written out at the end
        meta = {
            'pad_id': padid,
            'group_id': group_id,
            'pad_name': pad_name
        }
        meta['last_edited'] = padserver.getPadLastEdited(padid).isoformat()

        # Text. The file is created before fetching, so it exists (empty)
        # even when the fetch fails.
        textpath = fp + ".txt"
        with open(textpath, "wb") as f:
            try:
                text = padserver.getPadText(padid)
                f.write(text.encode("utf-8"))
                meta['text_path'] = textpath
                meta['text_length'] = len(text)
                meta['text_length_human'] = humanize_bytes(meta['text_length'])
            except (TypeError, HTTPError, ValueError) as e:
                print(u"Warning: unable to load text for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr)

        # HTML (errors here are deliberately not caught)
        htmlpath = fp+".html"
        with open(htmlpath, "wb") as f:
            html = padserver.getPadHTML(padid)
            meta['html_path'] = htmlpath
            meta['html_length'] = len(html)
            if template:
                t = html5lib.parse(html, treebuilder="etree", namespaceHTMLElements=False)
                body = t.find(".//body")
                title = padid
                editurl = padserver.getPadURL(padid)
                meta['url'] = editurl
                rendered = template.render(
                    body=content(body),
                    title=title,
                    editurl=editurl,
                    sourceurl=textpath,
                    metadata_json=json.dumps(meta))
                # render() returns text; encode it explicitly, otherwise any
                # non-ASCII character in the pad fails on write.
                f.write(rendered.encode("utf-8"))
            else:
                f.write(html.encode("utf-8"))

        with open(fp+".json", "w") as f:
            f.write(json.dumps(meta))

        if sleeptime:
            time.sleep(sleeptime)


def _ensure_dir (path):
    """Create directory *path* (and missing parents); an existing directory is fine."""
    try:
        os.makedirs(path)
    except OSError:
        # Only "already there" is acceptable; permission problems and the
        # like must surface instead of failing later on open().
        if not os.path.isdir(path):
            raise
def humanize_bytes(bytes, precision=0):
    """Return a humanized string representation of a number of bytes.

    The value is scaled by the largest power-of-1024 unit that does not
    exceed it and formatted with *precision* decimal places. Exactly 1 is
    special-cased as '1 byte'; values below 1 are reported in 'bytes'.

    True division is forced inside the function, so the result does not
    depend on ``from __future__ import division`` being in effect.

    >>> humanize_bytes(1)
    '1 byte'
    >>> humanize_bytes(0)
    '0 bytes'
    >>> humanize_bytes(1024)
    '1 kb'
    >>> humanize_bytes(1024*123)
    '123 kb'
    >>> humanize_bytes(1024*12342)
    '12 Mb'
    >>> humanize_bytes(1024*12342,2)
    '12.05 Mb'
    >>> humanize_bytes(1024*1234,2)
    '1.21 Mb'
    >>> humanize_bytes(1024*1234*1111,2)
    '1.31 Gb'
    >>> humanize_bytes(1024*1234*1111,1)
    '1.3 Gb'
    """
    abbrevs = (
        (1 << 50, 'Petabyte'),
        (1 << 40, 'Tb'),
        (1 << 30, 'Gb'),
        (1 << 20, 'Mb'),
        (1 << 10, 'kb'),
        (1, 'bytes'),
    )
    if bytes == 1:
        return '1 byte'
    # Pick the first (largest) unit the value reaches; if none matches, the
    # loop falls through with the last entry, i.e. plain bytes.
    for factor, suffix in abbrevs:
        if bytes >= factor:
            break
    return '%.*f %s' % (precision, float(bytes) / factor, suffix)
def padids_from_path (path):
    """Load every ``*.json`` file directly inside *path*.

    Files are read in sorted filename order. Each parsed object gets a
    'path' key holding the file it was loaded from. Returns the list of
    parsed objects (empty when nothing matches).
    """
    from glob import glob

    def load (json_path):
        with open(json_path) as handle:
            record = json.load(handle)
        record['path'] = json_path
        return record

    return [load(json_path) for json_path in sorted(glob(os.path.join(path, "*.json")))]
10 years ago
if __name__ == "__main__":
    # Command-line entry point: parse options, resolve the server connection
    # settings, then dispatch on the requested command.
    parser = argparse.ArgumentParser()
    parser.add_argument('command', default="", help='command to perform: listpads, listgroups, dump, createindex')

    parser.add_argument('--padinfo', default="padinfo.json", help='path to JSON file containing all pad login data, default padinfo.json, alternatively specify hostname, port, etc as separate options')
    parser.add_argument('--hostname', default="", help='the hostname of the etherpad server')
    parser.add_argument('--port', type=int, help='port of etherpad server')
    parser.add_argument('--apikey', help='API key')
    parser.add_argument('--apiversion', help='the version of the etherpad api')
    parser.add_argument('--apiurl', help='URL path to the API')

    parser.add_argument('--verbose', action="store_true", default=False, help='debug mode, verbose output')
    parser.add_argument('--pubpath', default="pub", help='path to dump public pads')
    parser.add_argument('--grouppath', default="priv", help='path to dump group pads')
    parser.add_argument('--lines', default=False, action="store_true", help='output one per line instead of JSON')
    parser.add_argument('--skip-existing', default=False, action="store_true", help='skip existing files on dump')
    parser.add_argument('--limit', default=None, type=int, help='(dump) stop after limit items')

    # DUMP
    parser.add_argument('--template', default="templates/pad.html", help='path for (dump) template, default: templates/pad.html')

    # OPTIONS SPECIFIC TO CREATEINDEX
    parser.add_argument('--exclude-groups', default=False, action="store_true", help='(createindex) ignore groups')
    parser.add_argument('--groupinfo', default=None, help='(createindex) groupinfo json file')
    parser.add_argument('--indextemplate', default="templates/index.html", help='(createindex) path for template, default: templates/index.html')
    parser.add_argument('--indextitle', default="etherpad archive & index", help='(createindex) title')
    parser.add_argument('--indexcss', default="styles.css", help='(createindex) index: css url')
    parser.add_argument('--output', default=None, help='(createindex) path for output (default stdout)')

    args = parser.parse_args()

    verbose = args.verbose
    # Work on a copy so the module-level defaults are never mutated.
    padinfo = dict(PADINFO_DEFAULTS)
    if args.padinfo:
        try:
            with open(args.padinfo) as f:
                for key, value in json.load(f).items():
                    padinfo[key] = value
        except IOError as e:
            print("WARNING: Tried to open {0}, but couldn't ({1})".format(args.padinfo, e), file=sys.stderr)
        except ValueError as e:
            # Warnings go to stderr so they cannot corrupt JSON/list output
            # written to stdout by the commands below.
            print("WARNING: Error reading {0}, check the JSON syntax ({1})".format(args.padinfo, e), file=sys.stderr)

    # allow explicit opts to override
    if args.hostname:
        padinfo['hostname'] = args.hostname
    if args.port:
        padinfo['port'] = args.port
    if args.apikey:
        padinfo['apikey'] = args.apikey
    if args.apiversion:
        padinfo['apiversion'] = args.apiversion
    if args.apiurl:
        padinfo['apiurl'] = args.apiurl

    padserver = PadServer(
        hostname=padinfo.get("hostname"),
        port=padinfo.get("port"),
        apipath=padinfo.get("apiurl"),
        apiversion=padinfo.get("apiversion"),
        apikey=padinfo.get("apikey")
    )

    if verbose:
        print ("Connecting to {0}".format(padserver.apiurl), file=sys.stderr)

    ###############################
    # Command Dispatch
    ###############################

    cmd = args.command.lower()
    if cmd == "listpads":
        padids = padserver.listAllPads()
        if not args.lines:
            json.dump(padids, sys.stdout)
        else:
            for padid in padids:
                print(padid)

    elif cmd == "listgroups":
        groupids = padserver.listAllGroups()
        if not args.lines:
            json.dump(groupids, sys.stdout)
        else:
            for gid in groupids:
                print(gid)

    elif cmd == "dump":
        start = time.time()
        padids = padserver.listAllPads()
        if args.limit:
            padids = padids[:args.limit]

        # skip_existing must be passed by keyword: the fifth positional
        # parameter of dumpPads is sleeptime, not skip_existing.
        dumpPads(
            padserver,
            padids,
            args.pubpath,
            args.grouppath,
            skip_existing=args.skip_existing,
            template=args.template)

        if verbose:
            print ("Completed in {0:0.0f} seconds".format(time.time()-start), file=sys.stderr)

    elif cmd == "createindex":

        def get_pads(groupinfo=None):
            """Collect dumped pad metadata from the public path and group directories.

            Group directories are skipped entirely with --exclude-groups;
            when *groupinfo* is given, only directories whose name appears
            in it are included.
            """
            pads = padids_from_path(args.pubpath)
            if verbose:
                print (("padids_from_path", args.pubpath, pads), file=sys.stderr)
            if not args.exclude_groups and os.path.exists(args.grouppath):
                groups = [os.path.join(args.grouppath, x) for x in os.listdir(args.grouppath)]
                groups = [x for x in groups if os.path.isdir(x)]
                groups.sort()
                for gp in groups:
                    if groupinfo:
                        b = os.path.basename(gp)
                        if b not in groupinfo:
                            continue
                    try:
                        pads.extend(padids_from_path(gp))
                    except OSError:
                        pass
            return pads

        groupinfo = None
        if args.groupinfo:
            with open(args.groupinfo) as gif:
                groupinfo = json.load(gif)

        # Sort by lower-cased pad name with a key function, so entries with
        # equal names never fall back to comparing the metadata dicts.
        pads = sorted(get_pads(groupinfo), key=lambda x: x.get("pad_name").lower())

        import jinja2
        with open(args.indextemplate, "rb") as f:
            template = jinja2.Template(f.read().decode("utf-8"))
        # Render before opening the output so a template error cannot leave
        # a truncated index file behind.
        rendered = template.render(
            title=args.indextitle,
            css=args.indexcss,
            pads=pads
        )

        out = sys.stdout
        if args.output:
            out = open(args.output, "w")
        try:
            # render() returns text; encode explicitly so non-ASCII pad
            # names or titles do not fail on write.
            out.write(rendered.encode("utf-8"))
        finally:
            if args.output:
                out.close()

    else:
        print ("Command '{0}' not understood, try: listpads, listgroups, dump, createindex".format(args.command), file=sys.stderr)