Pumping pads as files into publishing frameworks!
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

454 lines
16 KiB

#!/usr/bin/env python
10 years ago
from __future__ import print_function
import sys, argparse, json, re, os, time
from urllib2 import urlopen, HTTPError, URLError
import html5lib, urllib2, urllib
from xml.etree import ElementTree as ET
10 years ago
from urllib import urlencode
from urlparse import urljoin
from datetime import datetime
10 years ago
# Connection settings used for any value that neither the padinfo JSON file
# nor the command-line options provide.
PADINFO_DEFAULTS = dict(
    hostname="",
    port=9001,
    apiversion="1.2.9",
    apiurl="/api/",
)

# Directory containing this module; the bundled templates live in its
# "templates" sub-directory.
MODULE_PATH = os.path.dirname(__file__)
TEMPLATES_PATH = os.path.join(MODULE_PATH, "templates")

# Module-wide verbosity flag; overwritten from --verbose when run as a script.
verbose = False
10 years ago
def pad_split_group (n):
    """Split a pad ID into a ``(group, name)`` pair.

    IDs of the form ``g.<group>$<name>`` yield the group (without the
    ``g.`` prefix) and the pad name. Any other ID is returned unchanged as
    the name, paired with an empty group string.
    """
    matched = re.match(r"g\.(\w+)\$(.+)$", n)
    return matched.groups() if matched else ('', n)
def content(tag):
    """Return the inner markup of *tag* as a text string.

    The result is the element's leading text (if any) followed by the
    serialization of each child element. ``ET.tostring`` includes a child's
    tail text, so text between and after children is preserved.
    """
    # ET.tostring() defaults to ASCII output with character references for
    # anything else, so decoding as ASCII is lossless. Decoding explicitly
    # keeps the result a text string even where tostring() returns bytes.
    children = u''.join(ET.tostring(child).decode("ascii") for child in tag)
    if tag.text is None:
        return children
    return tag.text + children
class PadServer (object):
    """Small client for the HTTP API of an etherpad server.

    The constructor assembles the API base URL as
    ``<protocol>://<hostname>[:<port>]<apipath><apiversion>/``. Every API
    method issues a GET request with the API key as a query parameter and
    returns a value taken from the ``data`` member of the JSON response.
    """

    def __init__ (self, hostname, port=9001, apipath="/api/", apiversion="1.2.9", apikey=None, secure=False):
        """Store the connection settings and build ``self.apiurl``.

        hostname   -- server host name (no protocol, no port)
        port       -- port appended to the API URL; a falsy value omits it
        apipath    -- URL path of the API, e.g. "/api/"
        apiversion -- API version segment appended after *apipath*
        apikey     -- key sent with every request
        secure     -- use "https" instead of "http" when true
        """
        self.hostname = hostname
        self.protocol = "https" if secure else "http"
        self.apiurl = self.protocol+"://"+hostname
        if port:
            self.apiurl += ":{0}".format(port)
        self.apiurl += "{0}{1}/".format(apipath, apiversion)
        self.apikey = apikey

    def _call (self, function, **params):
        """Call API *function* with *params* and return the response's ``data``.

        The API key is added to the query automatically and the HTTP
        response is always closed, even when JSON decoding fails.
        """
        params['apikey'] = self.apikey
        response = urlopen(self.apiurl + function + '?' + urlencode(params))
        try:
            return json.load(response)['data']
        finally:
            response.close()

    def listAllPads (self):
        """Return the list of pad IDs reported by the server."""
        return self._call('listAllPads')['padIDs']

    def listAllGroups (self):
        """Return the list of group IDs reported by the server."""
        return self._call('listAllGroups')['groupIDs']

    def getPadText (self, padID):
        """Return the plain-text contents of pad *padID*."""
        return self._call('getText', padID=padID.encode("utf-8"))['text']

    def getPadHTML (self, padID):
        """Return the HTML export of pad *padID*."""
        return self._call('getHTML', padID=padID.encode("utf-8"))['html']

    def getPadLastEdited (self, padID):
        """Return when pad *padID* was last edited, as a local ``datetime``.

        The API value is treated as milliseconds since the epoch. Returns
        None when that value cannot be converted to an integer (TypeError).
        """
        raw = self._call('getLastEdited', padID=padID.encode("utf-8"))['lastEdited']
        try:
            # Floor division keeps whole seconds only, so isoformat() on the
            # result never contains a fractional-seconds part.
            return datetime.fromtimestamp(int(raw) // 1000)
        except TypeError:
            return None

    def getPadURL (self, padID, groupinfo=None):
        """Return the browser URL of pad *padID*.

        Pads without a group map to ``/public_pad/<padID>``. Group pads map
        to ``/group.html/<groupID>/pad.html/<padID>``, where the numeric
        group ID is looked up as ``groupinfo[<group>]["id"]`` and defaults
        to 0 when *groupinfo* is missing or has no entry for the group.
        Note that the URL uses the bare hostname, without the API port.
        """
        group = pad_split_group(padID)[0]
        base = self.protocol+"://"+self.hostname
        if not group:
            return base+"/public_pad/"+padID

        gid = group[2:] if group.startswith("g.") else group
        ginfo = groupinfo.get(gid) if groupinfo else None
        groupID = ginfo.get("id", 0) if ginfo else 0
        return base+"/group.html/"+str(groupID)+"/pad.html/"+padID
10 years ago
def get_template_env (tpath=None):
    """Return a jinja2 environment that loads templates from disk.

    Templates are searched first in *tpath* (only when it is an existing
    directory) and then in the module's bundled TEMPLATES_PATH.

    Example:
        template = env.get_template('pad.html')
        template.render(the='variables', go='here').encode("utf-8")
    """
    import jinja2

    search_paths = [tpath] if tpath and os.path.isdir(tpath) else []
    search_paths.append(TEMPLATES_PATH)
    return jinja2.Environment(loader=jinja2.FileSystemLoader(search_paths))
def _ensure_dir (path):
    """Create *path* (including missing parents) unless it is already a directory."""
    if not os.path.isdir(path):
        os.makedirs(path)


def dumpPads (padserver, padids, outputpath, pub_path, group_path, sleeptime=0.01, force=False, templates=None, groupinfo=None):
    """Dump each pad in *padids* to disk as text, HTML and JSON metadata.

    padserver  -- PadServer used to fetch text, HTML and last-edited time
    padids     -- iterable of pad IDs (must support len())
    outputpath -- root directory for all output
    pub_path   -- sub-directory of *outputpath* for pads without a group
    group_path -- sub-directory of *outputpath* for group pads; each group
                  gets its own directory below it
    sleeptime  -- seconds to sleep after each pad (falsy disables)
    force      -- dump even when the stored metadata says the pad is current
    templates  -- extra template directory passed to get_template_env()
    groupinfo  -- optional mapping passed to PadServer.getPadURL()

    For every pad three files are written next to each other:
    ``<name>.txt``, ``<name>.html`` (rendered through the "pad.html"
    template) and ``<name>.json`` (the metadata). A pad is skipped when its
    existing metadata records the same non-empty ``last_edited`` value,
    unless *force* is set.

    Failures while fetching the pad text are reported as a warning and the
    pad is still processed; failures while fetching or rendering the HTML
    propagate to the caller. Output files are only opened once their
    content is ready, so a failed fetch does not truncate an earlier dump.
    """
    template_env = get_template_env(templates)
    pad_template = template_env.get_template("pad.html")
    numpads = len(padids)

    for i, padid in enumerate(padids):
        group_id, pad_name = pad_split_group(padid)
        if group_id:
            pad_dir = os.path.join(outputpath, group_path, group_id)
        else:
            pad_dir = os.path.join(outputpath, pub_path)
        _ensure_dir(pad_dir)
        fp = os.path.join(pad_dir, pad_name)

        if verbose:
            print (u"Saving to {0}".format(fp).encode("utf-8"), file=sys.stderr)
        else:
            sys.stderr.write("\rDumping pads... [{0}/{1}]".format(i+1, numpads))
            sys.stderr.flush()

        textpath = fp + ".txt"
        htmlpath = fp + ".html"
        metapath = fp + ".json"

        last_edited = padserver.getPadLastEdited(padid)
        last_edited = last_edited.isoformat() if last_edited else ''

        # Skip pads whose stored metadata already records this edit time.
        # With force set, the old metadata is not consulted at all.
        if not force and os.path.exists(metapath):
            with open(metapath) as f:
                previous = json.load(f)
            if previous.get("last_edited") and previous.get("last_edited") == last_edited:
                if verbose:
                    print("Up to date, skipping", file=sys.stderr)
                continue

        meta = {
            'pad_id': padid,
            'group_id': group_id,
            'pad_name': pad_name,
            'last_edited': last_edited,
        }

        # Write Text
        try:
            text = padserver.getPadText(padid)
            encoded_text = text.encode("utf-8")
            with open(textpath, "w") as f:
                f.write(encoded_text)
            meta['text_path'] = os.path.relpath(textpath, outputpath)
            meta['text_length'] = len(text)
            meta['text_length_human'] = humanize_bytes(meta['text_length'])
        except (TypeError, HTTPError, ValueError) as e:
            print(u"Warning: unable to load text for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr)

        # Write HTML. Unlike the text above, errors here are not caught.
        html = padserver.getPadHTML(padid)
        meta['html_path'] = os.path.relpath(htmlpath, outputpath)
        meta['html_length'] = len(html)
        if pad_template:
            t = html5lib.parse(html, treebuilder="etree", namespaceHTMLElements=False)
            body = t.find(".//body")
            title = padid
            editurl = padserver.getPadURL(padid, groupinfo)
            meta['url'] = editurl
            json_dump = json.dumps(meta)
            # NOTE: the original author flagged a unicode error at this
            # render/encode step; it is done before the file is opened so
            # such a failure leaves any existing HTML file intact.
            page = pad_template.render(
                body=content(body),
                title=title,
                editurl=editurl,
                sourceurl=textpath,
                metadata_json=json_dump).encode("utf-8")
        else:
            page = html.encode("utf-8")
        with open(htmlpath, "w") as f:
            f.write(page)

        # Write metadata last so it only describes files that exist.
        with open(metapath, "w") as f:
            f.write(json.dumps(meta))

        if sleeptime:
            time.sleep(sleeptime)

    if not verbose:
        sys.stderr.write("\rDumping pads... [{0}] \n".format(numpads))
        sys.stderr.flush()
def humanize_bytes(bytes, precision=0):
    """Return a humanized string representation of a number of bytes.

    bytes     -- the size to format (parameter name kept for compatibility,
                 although it shadows the builtin)
    precision -- number of digits after the decimal point

    The value is divided by the largest unit (powers of 1024) that does not
    exceed it, using true division regardless of the module's division
    mode. Values below 1 (including 0) are reported in "bytes".

    >>> humanize_bytes(1)
    '1 byte'
    >>> humanize_bytes(1024)
    '1 kb'
    >>> humanize_bytes(1024*123)
    '123 kb'
    >>> humanize_bytes(1024*12342)
    '12 Mb'
    >>> humanize_bytes(1024*12342,2)
    '12.05 Mb'
    >>> humanize_bytes(1024*1234,2)
    '1.21 Mb'
    >>> humanize_bytes(1024*1234*1111,2)
    '1.31 Gb'
    >>> humanize_bytes(1024*1234*1111,1)
    '1.3 Gb'
    """
    abbrevs = (
        (1 << 50, 'Petabyte'),
        (1 << 40, 'Tb'),
        (1 << 30, 'Gb'),
        (1 << 20, 'Mb'),
        (1 << 10, 'kb'),
        (1, 'bytes'),
    )
    if bytes == 1:
        return '1 byte'
    # Pick the first (largest) unit that fits. If none does, the loop runs
    # to completion and leaves the last entry, plain "bytes", selected.
    for factor, suffix in abbrevs:
        if bytes >= factor:
            break
    # float() forces true division so the fractional part survives.
    return '%.*f %s' % (precision, float(bytes) / factor, suffix)
def padids_from_path (path):
    """Load every ``*.json`` file directly inside *path*.

    Files are read in sorted filename order. Each decoded object gets a
    ``'path'`` entry holding the file it came from, and the objects are
    returned as a list (empty when nothing matches).
    """
    from glob import glob

    records = []
    for json_path in sorted(glob(os.path.join(path, "*.json"))):
        with open(json_path) as handle:
            record = json.load(handle)
        record['path'] = json_path
        records.append(record)
    return records
10 years ago
if __name__ == "__main__":
    # Command-line entry point. Parses the options, builds a PadServer from
    # the padinfo file plus explicit overrides, then runs one command:
    # listpads, listgroups, dump or index.
    parser = argparse.ArgumentParser()

    # command
    parser.add_argument('command', default="", help='command to perform: listpads, listgroups, dump, index')

    # padinfo
    parser.add_argument('--padinfo', default="padinfo.json", help='path to JSON file containing all pad login data, default padinfo.json, alternatively specify hostname, port, etc as separate options')
    parser.add_argument('--hostname', default="", help='the hostname of the etherpad server')
    parser.add_argument('--port', type=int, help='port of etherpad server')
    parser.add_argument('--apikey', help='API key')
    parser.add_argument('--apiversion', help='the version of the etherpad api')
    parser.add_argument('--apiurl', help='URL path to the API')

    parser.add_argument('--verbose', action="store_true", default=False, help='debug mode, verbose output')
    parser.add_argument('--outputpath', default=os.getcwd(), help='path for output, default is .')
    parser.add_argument('--pubpath', default="pub", help='path to dump public pads')
    parser.add_argument('--grouppath', default="priv", help='path to dump group pads')
    parser.add_argument('--templates', default=os.path.join(os.getcwd(), "templates"), help='(addition) templates path, default: ./templates')

    # listpads/groups-specific
    parser.add_argument('--lines', default=False, action="store_true", help='(listpads/groups) output one per line instead of JSON')

    # dump-specific
    parser.add_argument('--force', default=False, action="store_true", help='(dump) force dump even if up to date')
    parser.add_argument('--skip', default=None, type=int, help='(dump) skip this many (start at index)')
    parser.add_argument('--limit', default=None, type=int, help='(dump) stop after limit items')

    # index-specific
    parser.add_argument('--title', default="etherpad index &amp; archive", help='(index) title')
    parser.add_argument('--exclude-groups', default=False, action="store_true", help='(index) ignore groups')
    parser.add_argument('--groupinfo', default=None, help='(index) groupinfo json file')
    parser.add_argument('--output', default=None, help='(index) path for output (default stdout)')

    args = parser.parse_args()

    # dumpPads() reads this module-level flag.
    verbose = args.verbose

    # Work on a copy so the module-level defaults are never mutated.
    padinfo = dict(PADINFO_DEFAULTS)
    if args.padinfo:
        try:
            with open(args.padinfo) as f:
                padinfo.update(json.load(f))
        except IOError as e:
            print("WARNING: Tried to open {0}, but couldn't ({1})".format(args.padinfo, e), file=sys.stderr)
        except ValueError as e:
            # Warnings go to stderr so stdout stays clean for JSON output.
            print("WARNING: Error reading {0}, check the JSON syntax ({1})".format(args.padinfo, e), file=sys.stderr)

    # allow explicit opts to override
    for option in ("hostname", "port", "apikey", "apiversion", "apiurl"):
        value = getattr(args, option)
        if value:
            padinfo[option] = value

    padserver = PadServer(
        hostname=padinfo.get("hostname"),
        port=padinfo.get("port"),
        apipath=padinfo.get("apiurl"),
        apiversion=padinfo.get("apiversion"),
        apikey=padinfo.get("apikey")
    )

    print ("Connecting to {0}".format(padserver.apiurl), file=sys.stderr)

    ###############################
    # Command Dispatch
    ###############################

    cmd = args.command.lower()
    if cmd == "listpads":
        padids = padserver.listAllPads()
        if not args.lines:
            json.dump(padids, sys.stdout)
        else:
            for padid in padids:
                print(padid)

    elif cmd == "listgroups":
        groupids = padserver.listAllGroups()
        if not args.lines:
            json.dump(groupids, sys.stdout)
        else:
            for gid in groupids:
                print(gid)

    elif cmd == "dump":
        groupinfo = None
        if args.groupinfo:
            with open(args.groupinfo) as gif:
                groupinfo = json.load(gif)
            if verbose:
                print ("Using groupinfo", file=sys.stderr)

        start = time.time()
        padids = padserver.listAllPads()
        if args.skip:
            padids = padids[args.skip:]
        if args.limit:
            padids = padids[:args.limit]

        dumpPads(
            padserver,
            padids,
            args.outputpath,
            args.pubpath,
            args.grouppath,
            force=args.force,
            templates=args.templates,
            groupinfo=groupinfo)

        if verbose:
            print ("Completed in {0:0.0f} seconds".format(time.time()-start), file=sys.stderr)

    elif cmd == "index":

        def augment_info(info, groupinfo):
            """Add parsed timestamps and, when known, the group name to *info*."""
            if info.get("last_edited"):
                dt = datetime.strptime(info.get("last_edited"), "%Y-%m-%dT%H:%M:%S")
                info['last_edited_parsed'] = dt
                info['last_edited_str'] = str(dt)
            if groupinfo:
                # Metadata without a group_id is treated like a public pad.
                gid = info.get("group_id") or ""
                if gid.startswith("g."):
                    gid = gid[2:]
                if gid in groupinfo:
                    info[u"group_name"] = groupinfo[gid].get("name")
            return info

        def get_pads(groupinfo=None):
            """Collect the metadata of all dumped public and group pads.

            Group pads are included unless --exclude-groups is given; when
            *groupinfo* is supplied, only groups listed in it are read.
            """
            pads = padids_from_path(os.path.join(args.outputpath, args.pubpath))
            pads = [augment_info(x, groupinfo) for x in pads]

            groups_root = os.path.join(args.outputpath, args.grouppath)
            # The group directory only exists once a group pad was dumped.
            if not args.exclude_groups and os.path.isdir(groups_root):
                candidates = [os.path.join(groups_root, x) for x in os.listdir(groups_root)]
                group_dirs = sorted(x for x in candidates if os.path.isdir(x))
                for group_dir in group_dirs:
                    if groupinfo and os.path.basename(group_dir) not in groupinfo:
                        continue
                    try:
                        pad_infos = padids_from_path(group_dir)
                    except OSError as e:
                        print("WARNING: unable to read {0} ({1})".format(group_dir, e), file=sys.stderr)
                        continue
                    pads.extend(augment_info(x, groupinfo) for x in pad_infos)
            return pads

        groupinfo = None
        if args.groupinfo:
            with open(args.groupinfo) as gif:
                groupinfo = json.load(gif)
            if verbose:
                print ("Using groupinfo", file=sys.stderr)

        pads = get_pads(groupinfo)
        # Case-insensitive sort by pad name. Using a key avoids comparing
        # the metadata dicts themselves when two names are equal.
        pads.sort(key=lambda x: (x.get("pad_name") or "").lower())

        env = get_template_env(args.templates)
        index_template = env.get_template("index.html")
        # Render before opening the output so a template error cannot leave
        # a truncated index file behind.
        rendered = index_template.render(
            pads = pads,
            title = args.title,
            timestamp = datetime.now()
        ).encode("utf-8")

        if args.output:
            with open(os.path.join(args.outputpath, args.output), "w") as out:
                out.write(rendered)
        else:
            sys.stdout.write(rendered)

    else:
        print ("Command '{0}' not understood, try: listpads, listgroups, dump, index".format(args.command), file=sys.stderr)