moved html / history dump to main etherdump

This commit is contained in:
Michael Murtaugh 2015-08-24 16:06:50 +02:00
parent d125f809fc
commit 24a3f4ac12
4 changed files with 550 additions and 548 deletions

View File

@ -1,164 +0,0 @@
#!/usr/bin/env python
from __future__ import print_function
from argparse import ArgumentParser
import json, sys, os, re
from datetime import datetime
import html5lib
from urllib import urlencode
from urllib2 import urlopen, HTTPError, URLError
from xml.etree import cElementTree as ET
from trim import trim_removed_spans, contents, set_text_contents, text_contents
from linkify import linkify, urlify, filename_to_padid
import jinja2
def get_template_env(tpath=None):
    """Build a jinja2 environment that loads templates from *tpath*.

    The directory is only put on the loader's search path when it is given
    and exists on disk; otherwise the loader is created with an empty
    search path.
    """
    search_paths = [tpath] if tpath and os.path.isdir(tpath) else []
    return jinja2.Environment(loader=jinja2.FileSystemLoader(search_paths))
# Spider an etherpad: starting from one pad id, fetch each pad's full-history
# diff HTML through the HTTP API, rewrite its links, render it through a
# jinja2 template into the output directory, and queue every pad it links to.
# NOTE(review): block nesting was reconstructed from the control flow because
# the indentation of this listing was lost -- confirm against the repository.
p = ArgumentParser("")
p.add_argument("padid", help="the padid")
p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json")
p.add_argument("--output", default="output", help="path to save files, default: output")
p.add_argument("--verbose", default=False, action="store_true")
p.add_argument("--limit", type=int, default=None)
p.add_argument("--templates", default="templates")
p.add_argument("--template", default="pad_html.html")
# Raw string: same runtime text as before, without relying on invalid escapes.
p.add_argument("--padlink", default=[], action="append", help=r"give a pad link pattern, example: 'http\:\/\/10\.1\.10\.1/p/(.*)'")
p.add_argument("--linksearch", default=[], action="append", help="specify a link pattern to search for")
p.add_argument("--linkreplace", default=[], action="append", help="specify a replacement pattern to replace preceding linksearch")
args = p.parse_args()

with open(args.padinfo) as f:
    info = json.load(f)

apiurl = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info)

# Pad link patterns: "padlink" in the padinfo file may be a single pattern or
# a list; patterns given with --padlink are appended after those.
padlinkpats = []
if "padlink" in info:
    if isinstance(info['padlink'], list):
        padlinkpats.extend(info['padlink'])
    else:
        padlinkpats.append(info['padlink'])
padlinkpats.extend(args.padlink)

# Generic (search, replace) href rewrites, paired positionally.
linkpats = list(zip(args.linksearch, args.linkreplace))

if args.verbose:
    # Diagnostics go to stderr like every other message of this script.
    print ("using padlinkpats", padlinkpats, file=sys.stderr)

todo = [args.padid]
done = set()
count = 0
env = get_template_env(args.templates)
template = env.get_template(args.template)

while todo:
    padid = todo.pop(0)
    done.add(padid)
    data = {}
    data['apikey'] = info['apikey']
    data['padID'] = padid.encode("utf-8")
    if args.verbose:
        print ("PADID \"{0}\"".format(padid).encode("utf-8"), file=sys.stderr)
    out = "{0}/{1}".format(args.output, urlify(padid))
    print ("{0}".format(out), file=sys.stderr)

    revisions_url = apiurl+'getRevisionsCount?'+urlencode(data)
    total_revisions = json.load(urlopen(revisions_url))['data']['revisions']
    if args.verbose:
        print (u"{0} has {1} total revisions...".format(padid, total_revisions).encode("utf-8"), file=sys.stderr)

    data['startRev'] = "0"
    requesturl = apiurl+'createDiffHTML?'+urlencode(data)
    html = json.load(urlopen(requesturl))['data']['html']
    t = html5lib.parse(html, namespaceHTMLElements=False)
    trim_removed_spans(t)
    html = ET.tostring(t, method="html")

    # Stage 1: Process as text
    # Process [[wikilink]] style links
    # and add linked page names to spider todo list
    html, links = linkify(html)
    for l in links:
        if l not in todo and l not in done:
            if args.verbose:
                print (" link: {0}".format(l), file=sys.stderr)
            todo.append(l)

    # Stage 2: Process as ElementTree
    t = html5lib.parse(html, namespaceHTMLElements=False)
    for a in t.findall(".//a"):
        href = a.attrib.get("href")
        original_href = href
        if href:
            # Links matching a pad link pattern become local "<pad>.html"
            # links and the linked pad is queued for spidering.
            for pat in padlinkpats:
                if re.search(pat, href) is not None:
                    href = re.sub(pat, "\\1.html", href)
                    # Use a separate name here: reusing `padid` would clobber
                    # the id of the pad being rendered below.
                    linked_padid = filename_to_padid(href)
                    set_text_contents(a, "[[{0}]]".format(linked_padid))
                    if linked_padid not in todo and linked_padid not in done:
                        if args.verbose:
                            print (" link: {0}".format(linked_padid), file=sys.stderr)
                        todo.append(linked_padid)
            # apply linkpats
            for s, r in linkpats:
                href = re.sub(s, r, href)
                if href != original_href:
                    old_contents = text_contents(a)
                    # When the link text was the URL itself, keep it in sync.
                    if old_contents == original_href:
                        if args.verbose:
                            print (" Updating href IN TEXT", file=sys.stderr)
                        set_text_contents(a, href)
            if original_href != href:
                if args.verbose:
                    print (" Changed href from {0} to {1}".format(original_href, href), file=sys.stderr)
                a.attrib['href'] = href

    # extract the style tag (with authorship colors)
    style = t.find(".//style")
    if style is not None:
        style = ET.tostring(style, method="html")
    else:
        style = ""
    # and extract the contents of the body
    html = contents(t.find(".//body"))

    try:
        os.makedirs(args.output)
    except OSError:
        # An existing directory is the normal case; anything else is real.
        if not os.path.isdir(args.output):
            raise
    with open(out, "w") as f:
        f.write(template.render(
            html = html,
            style = style,
            revision = total_revisions,
            padid = padid,
            timestamp = datetime.now()
        ).encode("utf-8"))

    count += 1
    if args.limit and count >= args.limit:
        break

517
etherdump
View File

@ -1,413 +1,164 @@
#!/usr/bin/env python #!/usr/bin/env python
from __future__ import print_function from __future__ import print_function
import sys, argparse, json, re, os, time from argparse import ArgumentParser
from urllib2 import urlopen, HTTPError, URLError import json, sys, os, re
import html5lib, urllib2, urllib
from xml.etree import ElementTree as ET
from urllib import urlencode
from urlparse import urljoin
from datetime import datetime from datetime import datetime
from padserver import PadServer import html5lib
from urllib import urlencode
from urllib2 import urlopen, HTTPError, URLError
PADINFO_DEFAULTS = { from xml.etree import cElementTree as ET
"hostname": "", from trim import trim_removed_spans, contents, set_text_contents, text_contents
"port": 9001, from linkify import linkify, urlify, filename_to_padid
"apiversion": "1.2.9", import jinja2
"apiurl": "/api/"
}
MODULE_PATH = (os.path.dirname(__file__))
TEMPLATES_PATH = os.path.join(MODULE_PATH, "templates")
verbose = False
def pad_split_group (n):
m = re.match(r"g\.(\w+)\$(.+)$", n)
if m:
return m.groups()
else:
return ('', n)
def content(tag):
if tag.text == None:
return u''.join(ET.tostring(e) for e in tag)
else:
return tag.text + u''.join(ET.tostring(e) for e in tag)
def get_template_env (tpath=None): def get_template_env (tpath=None):
import jinja2
paths = [] paths = []
if tpath and os.path.isdir(tpath): if tpath and os.path.isdir(tpath):
paths.append(tpath) paths.append(tpath)
paths.append(TEMPLATES_PATH) # paths.append(TEMPLATES_PATH)
loader = jinja2.FileSystemLoader(paths) loader = jinja2.FileSystemLoader(paths)
env = jinja2.Environment(loader=loader) env = jinja2.Environment(loader=loader)
return env return env
# template = env.get_template('pad.html')
# print template.render(the='variables', go='here').encode("utf-8")
def dumpPads (padserver, padids, outputpath, pub_path, group_path, sleeptime=0.01, force=False, templates=None, groupinfo=None):
template_env = get_template_env(templates)
pad_template = template_env.get_template("pad.html")
numpads = len(padids)
for i, padid in enumerate(padids):
group_id, pad_name = pad_split_group(padid)
if group_id:
try:
os.mkdir(os.path.join(outputpath, group_path))
except OSError:
pass
try:
os.mkdir(os.path.join(outputpath, group_path, group_id))
except OSError:
pass
fp = os.path.join(outputpath, group_path, group_id, pad_name)
else:
try:
os.mkdir(os.path.join(outputpath, pub_path))
except OSError:
pass
fp = os.path.join(outputpath, pub_path, pad_name)
if verbose:
print (u"Saving to {0}".format(fp).encode("utf-8"), file=sys.stderr)
else:
sys.stderr.write("\rDumping pads... [{0}/{1}]".format(i+1, numpads))
sys.stderr.flush()
textpath = fp + ".txt" p = ArgumentParser("")
htmlpath = fp+".html" p.add_argument("padid", help="the padid")
metapath = fp+".json" p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json")
p.add_argument("--output", default="output", help="path to save files, default: output")
p.add_argument("--verbose", default=False, action="store_true")
p.add_argument("--limit", type=int, default=None)
p.add_argument("--templates", default="templates")
p.add_argument("--template", default="pad_html.html")
last_edited = padserver.getPadLastEdited(padid) p.add_argument("--padlink", default=[], action="append", help="give a pad link pattern, example: 'http\:\/\/10\.1\.10\.1/p/(.*)'")
if last_edited: p.add_argument("--linksearch", default=[], action="append", help="specify a link pattern to search for")
last_edited = last_edited.isoformat() p.add_argument("--linkreplace", default=[], action="append", help="specify a replacement pattern to replace preceding linksearch")
else:
last_edited = ''
if os.path.exists(metapath): args = p.parse_args()
with open(metapath) as f: with open(args.padinfo) as f:
meta = json.load(f) info = json.load(f)
if not force and meta.get("last_edited") and meta.get("last_edited") == last_edited:
if verbose:
print("Up to date, skipping", file=sys.stderr)
continue
meta = { apiurl = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info)
'pad_id': padid,
'group_id': group_id,
'pad_name': pad_name
}
meta['last_edited'] = last_edited padlinkpats = []
if "padlink" in info:
if type(info['padlink']) == list:
padlinkpats.extend(info['padlink'])
else:
padlinkpats.append(info['padlink'])
padlinkpats.extend(args.padlink)
linkpats = [] # [(pat, "\\1.html") for pat in padlinkpats]
# Write Text linkpats.extend(zip(args.linksearch, args.linkreplace))
with open(textpath, "w") as f:
try:
text = padserver.getPadText(padid)
f.write(text.encode("utf-8"))
meta['text_path'] = os.path.relpath(textpath, outputpath)
meta['text_length'] = len(text)
meta['text_length_human'] = humanize_bytes(meta['text_length'])
except (TypeError, HTTPError, ValueError) as e: if args.verbose:
print(u"Warning: unable to load text for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr) print ("using padlinkpats", padlinkpats)
with open(htmlpath, "w") as f: todo = [args.padid]
html = padserver.getPadHTML(padid) done = set()
meta['html_path'] = os.path.relpath(htmlpath, outputpath) count = 0
meta['html_length'] = len(html)
if pad_template:
t = html5lib.parse(html, treebuilder="etree", namespaceHTMLElements=False)
body = t.find(".//body")
title = padid
editurl = padserver.getPadURL(padid, groupinfo)
meta['url'] = editurl
json_dump = json.dumps(meta)
f.write(pad_template.render(
body=content(body),
title=title,
editurl=editurl,
sourceurl=textpath,
metadata_json=json_dump).encode("utf-8")) # unicode error HERE!
else:
f.write(html.encode("utf-8"))
# except (TypeError, HTTPError, ValueError) as e: env = get_template_env(args.templates)
# print(u"Warning: unable to load HTML for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr) template = env.get_template(args.template)
with open(metapath, "w") as f: while len(todo) > 0:
f.write(json.dumps(meta)) padid = todo[0]
todo = todo[1:]
done.add(padid)
if sleeptime: data = {}
time.sleep(sleeptime) data['apikey'] = info['apikey']
data['padID'] = padid.encode("utf-8")
if not verbose: if args.verbose:
sys.stderr.write("\rDumping pads... [{0}] \n".format(numpads)) print ("PADID \"{0}\"".format(padid).encode("utf-8"), file=sys.stderr)
sys.stderr.flush() out = "{0}/{1}".format(args.output, urlify(padid))
print ("{0}".format(out), file=sys.stderr)
total_revisions = apiurl+'getRevisionsCount?'+urlencode(data)
total_revisions = json.load(urlopen(total_revisions))['data']['revisions']
if args.verbose:
print (u"{0} has {1} total revisions...".format(padid, total_revisions).encode("utf-8"), file=sys.stderr)
data['startRev'] = "0"
requesturl = apiurl+'createDiffHTML?'+urlencode(data)
html = json.load(urlopen(requesturl))['data']['html']
t = html5lib.parse(html, namespaceHTMLElements=False)
trim_removed_spans(t)
html = ET.tostring(t, method="html")
# Stage 1: Process as text
# Process [[wikilink]] style links
# and add linked page names to spider todo list
html, links = linkify(html)
for l in links:
if l not in todo and l not in done:
if args.verbose:
print (" link: {0}".format(l), file=sys.stderr)
todo.append(l)
# Stage 2: Process as ElementTree
#
t = html5lib.parse(html, namespaceHTMLElements=False)
# apply linkpats
for a in t.findall(".//a"):
href = a.attrib.get("href")
original_href = href
if href:
# if args.verbose:
# print ("searching for PADLINK: {0}".format(href))
for pat in padlinkpats:
if re.search(pat, href) != None:
# if args.verbose:
# print (" found PADLINK: {0}".format(href))
href = re.sub(pat, "\\1.html", href)
padid = filename_to_padid(href)
set_text_contents(a, "[[{0}]]".format(padid))
if padid not in todo and padid not in done:
if args.verbose:
print (" link: {0}".format(padid), file=sys.stderr)
todo.append(padid)
# apply linkpats
for s, r in linkpats:
href = re.sub(s, r, href)
if href != original_href:
old_contents = text_contents(a)
# print ("OLD_CONTENTS {0}".format(old_contents))
if old_contents == original_href:
if args.verbose:
print (" Updating href IN TEXT", file=sys.stderr)
set_text_contents(a, href)
if original_href != href:
if args.verbose:
print (" Changed href from {0} to {1}".format(original_href, href), file=sys.stderr)
a.attrib['href'] = href
# extract the style tag (with authorship colors)
style = t.find(".//style")
if style != None:
style = ET.tostring(style, method="html")
else:
style = ""
# and extract the contents of the body
html = contents(t.find(".//body"))
def humanize_bytes(bytes, precision=0): try:
"""Return a humanized string representation of a number of bytes. os.makedirs(args.output)
except OSError:
Assumes `from __future__ import division`. pass
with open(out, "w") as f:
>>> humanize_bytes(1) # f.write(html.encode("utf-8"))
'1 byte' f.write(template.render(
>>> humanize_bytes(1024) html = html,
'1.0 kB' style = style,
>>> humanize_bytes(1024*123) revision = total_revisions,
'123.0 kB' padid = padid,
>>> humanize_bytes(1024*12342)
'12.1 MB'
>>> humanize_bytes(1024*12342,2)
'12.05 MB'
>>> humanize_bytes(1024*1234,2)
'1.21 MB'
>>> humanize_bytes(1024*1234*1111,2)
'1.31 GB'
>>> humanize_bytes(1024*1234*1111,1)
'1.3 GB'
"""
abbrevs = (
(1<<50L, 'Petabyte'),
(1<<40L, 'Tb'),
(1<<30L, 'Gb'),
(1<<20L, 'Mb'),
(1<<10L, 'kb'),
(1, 'bytes')
)
if bytes == 1:
return '1 byte'
for factor, suffix in abbrevs:
if bytes >= factor:
break
return '%.*f %s' % (precision, bytes / factor, suffix)
def padids_from_path (path):
from glob import glob
inputs = glob(os.path.join(path, "*.json"))
inputs.sort()
pads = []
for fp in inputs:
with open(fp) as f:
info = json.load(f)
info['path'] = fp
pads.append(info)
return pads
if __name__ == "__main__":
parser = argparse.ArgumentParser()
# command
parser.add_argument('command', default="", help='command to perform: listpads, listgroups, dump, createindex')
# padinfo
parser.add_argument('--padinfo', default="padinfo.json", help='path to JSON file containing all pad login data, default padinfo.json, alternatively specify hostname, port, etc as separate options')
parser.add_argument('--hostname', default="", help='the hostname of the etherpad server')
parser.add_argument('--port', type=int, help='port of etherpad server')
parser.add_argument('--apikey', help='API key')
parser.add_argument('--apiversion', help='the version of the etherpad api')
parser.add_argument('--apiurl', help='URL path to the API')
parser.add_argument('--verbose', action="store_true", default=False, help='debug mode, verbose output')
parser.add_argument('--outputpath', default=os.getcwd(), help='path for output, default is .')
parser.add_argument('--pubpath', default="pub", help='path to dump public pads')
parser.add_argument('--grouppath', default="priv", help='path to dump group pads')
parser.add_argument('--templates', default=os.path.join(os.getcwd(), "templates"), help='(addition) templates path, default: ./templates')
# listpads/groups-specific
parser.add_argument('--lines', default=False, action="store_true", help='(listpads/groups) output one per line instead of JSON')
# dump-specific
parser.add_argument('--force', default=False, action="store_true", help='(dump) force dump even if up to date')
parser.add_argument('--skip', default=None, type=int, help='(dump) skip this many (start at index)')
parser.add_argument('--limit', default=None, type=int, help='(dump) stop after limit items')
# index-specific
parser.add_argument('--title', default="etherpad index &amp; archive", help='(index) title')
parser.add_argument('--exclude-groups', default=False, action="store_true", help='(index) ignore groups')
parser.add_argument('--groupinfo', default=None, help='(index) groupinfo json file')
parser.add_argument('--output', default=None, help='(index) path for output (default stdout)')
parser.add_argument('--pad', default="start", help='(history) pad id')
parser.add_argument('--rev', default="", help='(history) revision id')
args = parser.parse_args()
verbose = args.verbose
padinfo = PADINFO_DEFAULTS
if args.padinfo:
try:
with open(args.padinfo) as f:
for key, value in json.load(f).items():
padinfo[key] = value
except IOError, e:
print("WARNING: Tried to open {0}, but couldn't ({1})".format(args.padinfo, e), file=sys.stderr)
except ValueError, e:
print("WARNING: Error reading {0}, check the JSON syntax ({1})".format(args.padinfo, e))
# allow explicit opts to override
if args.hostname:
padinfo['hostname'] = args.hostname
if args.port:
padinfo['port'] = args.port
if args.apikey:
padinfo['apikey'] = args.apikey
if args.apiversion:
padinfo['apiversion'] = args.apiversion
if args.apiurl:
padinfo['apiurl'] = args.apiurl
padserver = PadServer(
hostname=padinfo.get("hostname"),
port=padinfo.get("port"),
apipath=padinfo.get("apiurl"),
apiversion=padinfo.get("apiversion"),
apikey=padinfo.get("apikey")
)
print ("Connecting to {0}".format(padserver.apiurl), file=sys.stderr)
###############################
# Command Dispatch
###############################
cmd = args.command.lower()
if cmd == "listpads":
padids = padserver.listAllPads()
if not args.lines:
json.dump(padids, sys.stdout)
else:
for padid in padids:
print(padid)
elif cmd == "listgroups":
groupids = padserver.listAllGroups()
if not args.lines:
json.dump(groupids, sys.stdout)
else:
for gid in groupids:
print(gid)
elif cmd == "dump":
groupinfo = None
if args.groupinfo:
with open(args.groupinfo) as gif:
groupinfo = json.load(gif)
if verbose:
print ("Using groupinfo", file=sys.stderr)
start = time.time()
padids = padserver.listAllPads()
if args.skip:
padids = padids[args.skip:]
if args.limit:
padids = padids[:args.limit]
dumpPads(
padserver,
padids,
args.outputpath,
args.pubpath,
args.grouppath,
force=args.force,
templates=args.templates,
groupinfo=groupinfo)
if verbose:
print ("Completed in {0:0.0f} seconds".format(time.time()-start), file=sys.stderr)
elif cmd == "index":
def augment_info(info, groupinfo):
if info.get("last_edited"):
dt = datetime.strptime( info.get("last_edited"), "%Y-%m-%dT%H:%M:%S" )
info['last_edited_parsed'] = dt
info['last_edited_str'] = str(dt)
if groupinfo:
gid = info.get("group_id")
if gid.startswith("g."):
gid = gid[2:]
if gid in groupinfo:
info[u"group_name"] = groupinfo[gid].get("name")
# print (info, file=sys.stderr)
return info
def get_pads(groupinfo=None):
pads = padids_from_path(os.path.join(args.outputpath, args.pubpath))
pads = [augment_info(x, groupinfo) for x in pads]
# print (("padids_from_path", args.pubpath, pads), file=sys.stderr)
gp = os.path.join(args.outputpath, args.grouppath)
if not args.exclude_groups and gp:
groups = [os.path.join(gp, x) for x in os.listdir(gp)]
groups = [x for x in groups if os.path.isdir(x)]
groups.sort()
for gp in groups:
if groupinfo:
b = os.path.basename(gp)
if b not in groupinfo:
continue
try:
pad_infos = padids_from_path(gp)
pad_infos = [augment_info(x, groupinfo) for x in pad_infos]
pads.extend(pad_infos)
except OSError:
pass
return pads
groupinfo = None
if args.groupinfo:
with open(args.groupinfo) as gif:
groupinfo = json.load(gif)
if verbose:
print ("Using groupinfo", file=sys.stderr)
pads = get_pads(groupinfo)
padids = [(x.get("pad_name").lower(), x) for x in pads]
padids.sort()
pads = [x[1] for x in padids]
out = sys.stdout
if args.output:
out = open(os.path.join(args.outputpath, args.output), "w")
import jinja2
env = get_template_env(args.templates)
index_template = env.get_template("index.html")
out.write(index_template.render(
pads = pads,
title = args.title,
timestamp = datetime.now() timestamp = datetime.now()
).encode("utf-8")) ).encode("utf-8"))
if args.output: count += 1
out.close() if args.limit and count >= args.limit:
break
elif cmd == "revisions":
print (padserver.getRevisionsCount(args.pad))
elif cmd == "authors":
print (padserver.listAuthorsOfPad(args.pad))
elif cmd == "changeset":
print (padserver.getRevisionChangeset(args.pad, args.rev))
elif cmd == "history":
revs = padserver.getRevisionsCount(args.pad)
data = padserver.createDiffHTML(args.pad, 1, revs)
print (data['html'])
else:
print ("Command '{0}' not understood, try: listpads, listgroups, dump".format(args.command), file=sys.stderr)

413
etherdump_original Executable file
View File

@ -0,0 +1,413 @@
#!/usr/bin/env python
from __future__ import print_function
import sys, argparse, json, re, os, time
from urllib2 import urlopen, HTTPError, URLError
import html5lib, urllib2, urllib
from xml.etree import ElementTree as ET
from urllib import urlencode
from urlparse import urljoin
from datetime import datetime
from padserver import PadServer
PADINFO_DEFAULTS = {
"hostname": "",
"port": 9001,
"apiversion": "1.2.9",
"apiurl": "/api/"
}
MODULE_PATH = (os.path.dirname(__file__))
TEMPLATES_PATH = os.path.join(MODULE_PATH, "templates")
verbose = False
def pad_split_group(n):
    """Split a pad id into a ``(group_id, pad_name)`` pair.

    Ids of the form ``g.<group>$<name>`` yield the text between ``g.`` and
    ``$`` and the text after ``$``. Any other id is returned unchanged as
    the name, with an empty group id.
    """
    matched = re.match(r"g\.(\w+)\$(.+)$", n)
    return matched.groups() if matched else ('', n)
def content(tag):
    """Return the inner markup of *tag*.

    The result is the element's leading text (when it has any) followed by
    every child element serialized with ``ET.tostring``.
    """
    serialized_children = u''.join(ET.tostring(child) for child in tag)
    if tag.text is None:
        return serialized_children
    return tag.text + serialized_children
def get_template_env(tpath=None):
    """Build the jinja2 environment used to render pages.

    Templates are looked up in *tpath* first (only when it is given and is
    an existing directory) and then in the module's own TEMPLATES_PATH.
    """
    import jinja2
    search_paths = [TEMPLATES_PATH]
    if tpath and os.path.isdir(tpath):
        # A caller-supplied directory takes precedence over the built-in one.
        search_paths.insert(0, tpath)
    return jinja2.Environment(loader=jinja2.FileSystemLoader(search_paths))
# template = env.get_template('pad.html')
# print template.render(the='variables', go='here').encode("utf-8")
def _ensure_dir(path):
    """Create directory *path*; an already existing directory is not an error."""
    try:
        os.mkdir(path)
    except OSError:
        # Only "it is already there" is tolerated; permission problems and
        # missing parents are reported instead of being silently ignored.
        if not os.path.isdir(path):
            raise


def dumpPads (padserver, padids, outputpath, pub_path, group_path, sleeptime=0.01, force=False, templates=None, groupinfo=None):
    """Dump each pad in *padids* as .txt, .html and .json files.

    Group pads (ids recognised by pad_split_group) are written below
    ``outputpath/group_path/<group_id>/``, all others below
    ``outputpath/pub_path/``. A pad is skipped when its existing .json
    metadata records the same ``last_edited`` value the server reports,
    unless *force* is set.

    padserver -- object providing getPadLastEdited, getPadText, getPadHTML
                 and getPadURL
    padids    -- sequence of pad ids to dump
    sleeptime -- seconds to sleep after each dumped pad (falsy: no sleep)
    templates -- extra template directory handed to get_template_env
    groupinfo -- passed through to padserver.getPadURL

    NOTE(review): block nesting was reconstructed from the control flow
    because the indentation of this listing was lost.
    """
    template_env = get_template_env(templates)
    pad_template = template_env.get_template("pad.html")
    numpads = len(padids)
    for i, padid in enumerate(padids):
        group_id, pad_name = pad_split_group(padid)
        if group_id:
            _ensure_dir(os.path.join(outputpath, group_path))
            _ensure_dir(os.path.join(outputpath, group_path, group_id))
            fp = os.path.join(outputpath, group_path, group_id, pad_name)
        else:
            _ensure_dir(os.path.join(outputpath, pub_path))
            fp = os.path.join(outputpath, pub_path, pad_name)

        if verbose:
            print (u"Saving to {0}".format(fp).encode("utf-8"), file=sys.stderr)
        else:
            sys.stderr.write("\rDumping pads... [{0}/{1}]".format(i+1, numpads))
            sys.stderr.flush()

        textpath = fp + ".txt"
        htmlpath = fp+".html"
        metapath = fp+".json"

        last_edited = padserver.getPadLastEdited(padid)
        if last_edited:
            last_edited = last_edited.isoformat()
        else:
            last_edited = ''

        # Skip pads whose stored metadata already matches the server's stamp.
        if os.path.exists(metapath):
            with open(metapath) as f:
                meta = json.load(f)
            if not force and meta.get("last_edited") and meta.get("last_edited") == last_edited:
                if verbose:
                    print("Up to date, skipping", file=sys.stderr)
                continue

        meta = {
            'pad_id': padid,
            'group_id': group_id,
            'pad_name': pad_name
        }
        meta['last_edited'] = last_edited

        # Write Text
        with open(textpath, "w") as f:
            try:
                text = padserver.getPadText(padid)
                f.write(text.encode("utf-8"))
                meta['text_path'] = os.path.relpath(textpath, outputpath)
                meta['text_length'] = len(text)
                meta['text_length_human'] = humanize_bytes(meta['text_length'])
            except (TypeError, HTTPError, ValueError) as e:
                print(u"Warning: unable to load text for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr)

        # Write HTML. NOTE: unlike the text dump above, errors raised while
        # fetching or rendering the HTML are not caught here.
        with open(htmlpath, "w") as f:
            html = padserver.getPadHTML(padid)
            meta['html_path'] = os.path.relpath(htmlpath, outputpath)
            meta['html_length'] = len(html)
            if pad_template:
                t = html5lib.parse(html, treebuilder="etree", namespaceHTMLElements=False)
                body = t.find(".//body")
                title = padid
                editurl = padserver.getPadURL(padid, groupinfo)
                meta['url'] = editurl
                json_dump = json.dumps(meta)
                f.write(pad_template.render(
                    body=content(body),
                    title=title,
                    editurl=editurl,
                    sourceurl=textpath,
                    metadata_json=json_dump).encode("utf-8")) # unicode error HERE!
            else:
                f.write(html.encode("utf-8"))

        with open(metapath, "w") as f:
            f.write(json.dumps(meta))

        if sleeptime:
            time.sleep(sleeptime)

    if not verbose:
        # Finish the in-place progress line written with "\r" above.
        sys.stderr.write("\rDumping pads... [{0}] \n".format(numpads))
        sys.stderr.flush()
def humanize_bytes(bytes, precision=0):
    """Return a humanized string representation of a number of bytes.

    The value is divided by the largest unit it reaches and formatted with
    *precision* decimal places. The division is always a true (float)
    division, so the result does not depend on
    ``from __future__ import division`` being in effect. Values below 1
    (including 0) are reported in the ``bytes`` unit.

    >>> humanize_bytes(1)
    '1 byte'
    >>> humanize_bytes(1024)
    '1 kb'
    >>> humanize_bytes(1024*123, 1)
    '123.0 kb'
    >>> humanize_bytes(1024*12342, 1)
    '12.1 Mb'
    >>> humanize_bytes(1024*12342, 2)
    '12.05 Mb'
    >>> humanize_bytes(1024*1234, 2)
    '1.21 Mb'
    >>> humanize_bytes(1024*1234*1111, 2)
    '1.31 Gb'
    >>> humanize_bytes(1024*1234*1111, 1)
    '1.3 Gb'
    """
    # Largest unit first, so the first threshold reached is the one used.
    abbrevs = (
        (1 << 50, 'Petabyte'),
        (1 << 40, 'Tb'),
        (1 << 30, 'Gb'),
        (1 << 20, 'Mb'),
        (1 << 10, 'kb'),
        (1, 'bytes')
    )
    if bytes == 1:
        return '1 byte'
    for factor, suffix in abbrevs:
        if bytes >= factor:
            break
    # When no threshold matched, factor/suffix keep the last entry (1, 'bytes').
    return '%.*f %s' % (precision, float(bytes) / factor, suffix)
def padids_from_path(path):
    """Load every ``*.json`` file found directly inside *path*.

    Files are read in sorted filename order. Each decoded record gets a
    ``'path'`` key holding the file it was loaded from, and the records are
    returned as a list.
    """
    from glob import glob

    def load(json_path):
        with open(json_path) as handle:
            record = json.load(handle)
        record['path'] = json_path
        return record

    return [load(fp) for fp in sorted(glob(os.path.join(path, "*.json")))]
if __name__ == "__main__":
    # Command-line entry point: parse options, build the PadServer connection
    # and dispatch on the requested command.
    # NOTE(review): block nesting was reconstructed from the control flow
    # because the indentation of this listing was lost.
    parser = argparse.ArgumentParser()
    # command
    parser.add_argument('command', default="", help='command to perform: listpads, listgroups, dump, index, revisions, authors, changeset, history')
    # padinfo
    parser.add_argument('--padinfo', default="padinfo.json", help='path to JSON file containing all pad login data, default padinfo.json, alternatively specify hostname, port, etc as separate options')
    parser.add_argument('--hostname', default="", help='the hostname of the etherpad server')
    parser.add_argument('--port', type=int, help='port of etherpad server')
    parser.add_argument('--apikey', help='API key')
    parser.add_argument('--apiversion', help='the version of the etherpad api')
    parser.add_argument('--apiurl', help='URL path to the API')
    parser.add_argument('--verbose', action="store_true", default=False, help='debug mode, verbose output')
    parser.add_argument('--outputpath', default=os.getcwd(), help='path for output, default is .')
    parser.add_argument('--pubpath', default="pub", help='path to dump public pads')
    parser.add_argument('--grouppath', default="priv", help='path to dump group pads')
    parser.add_argument('--templates', default=os.path.join(os.getcwd(), "templates"), help='(addition) templates path, default: ./templates')
    # listpads/groups-specific
    parser.add_argument('--lines', default=False, action="store_true", help='(listpads/groups) output one per line instead of JSON')
    # dump-specific
    parser.add_argument('--force', default=False, action="store_true", help='(dump) force dump even if up to date')
    parser.add_argument('--skip', default=None, type=int, help='(dump) skip this many (start at index)')
    parser.add_argument('--limit', default=None, type=int, help='(dump) stop after limit items')
    # index-specific
    parser.add_argument('--title', default="etherpad index &amp; archive", help='(index) title')
    parser.add_argument('--exclude-groups', default=False, action="store_true", help='(index) ignore groups')
    parser.add_argument('--groupinfo', default=None, help='(index) groupinfo json file')
    parser.add_argument('--output', default=None, help='(index) path for output (default stdout)')
    parser.add_argument('--pad', default="start", help='(history) pad id')
    parser.add_argument('--rev', default="", help='(history) revision id')
    args = parser.parse_args()

    verbose = args.verbose

    # Work on a copy so the module-level defaults are never mutated.
    padinfo = dict(PADINFO_DEFAULTS)
    if args.padinfo:
        try:
            with open(args.padinfo) as f:
                padinfo.update(json.load(f))
        except IOError as e:
            print("WARNING: Tried to open {0}, but couldn't ({1})".format(args.padinfo, e), file=sys.stderr)
        except ValueError as e:
            # Warnings belong on stderr so they never mix with command output.
            print("WARNING: Error reading {0}, check the JSON syntax ({1})".format(args.padinfo, e), file=sys.stderr)

    # allow explicit opts to override
    if args.hostname:
        padinfo['hostname'] = args.hostname
    if args.port:
        padinfo['port'] = args.port
    if args.apikey:
        padinfo['apikey'] = args.apikey
    if args.apiversion:
        padinfo['apiversion'] = args.apiversion
    if args.apiurl:
        padinfo['apiurl'] = args.apiurl

    padserver = PadServer(
        hostname=padinfo.get("hostname"),
        port=padinfo.get("port"),
        apipath=padinfo.get("apiurl"),
        apiversion=padinfo.get("apiversion"),
        apikey=padinfo.get("apikey")
    )
    print ("Connecting to {0}".format(padserver.apiurl), file=sys.stderr)

    ###############################
    # Command Dispatch
    ###############################
    cmd = args.command.lower()
    if cmd == "listpads":
        padids = padserver.listAllPads()
        if not args.lines:
            json.dump(padids, sys.stdout)
        else:
            for padid in padids:
                print(padid)

    elif cmd == "listgroups":
        groupids = padserver.listAllGroups()
        if not args.lines:
            json.dump(groupids, sys.stdout)
        else:
            for gid in groupids:
                print(gid)

    elif cmd == "dump":
        groupinfo = None
        if args.groupinfo:
            with open(args.groupinfo) as gif:
                groupinfo = json.load(gif)
            if verbose:
                print ("Using groupinfo", file=sys.stderr)
        start = time.time()
        padids = padserver.listAllPads()
        if args.skip:
            padids = padids[args.skip:]
        if args.limit:
            padids = padids[:args.limit]
        dumpPads(
            padserver,
            padids,
            args.outputpath,
            args.pubpath,
            args.grouppath,
            force=args.force,
            templates=args.templates,
            groupinfo=groupinfo)
        if verbose:
            print ("Completed in {0:0.0f} seconds".format(time.time()-start), file=sys.stderr)

    elif cmd == "index":

        def augment_info(info, groupinfo):
            """Add parsed timestamp and group name fields to a pad record."""
            if info.get("last_edited"):
                dt = datetime.strptime( info.get("last_edited"), "%Y-%m-%dT%H:%M:%S" )
                info['last_edited_parsed'] = dt
                info['last_edited_str'] = str(dt)
            if groupinfo:
                # Records without a group id are treated as ungrouped.
                gid = info.get("group_id") or ""
                if gid.startswith("g."):
                    gid = gid[2:]
                if gid in groupinfo:
                    info[u"group_name"] = groupinfo[gid].get("name")
            return info

        def get_pads(groupinfo=None):
            """Collect the metadata records of all dumped public and group pads."""
            pads = padids_from_path(os.path.join(args.outputpath, args.pubpath))
            pads = [augment_info(x, groupinfo) for x in pads]
            group_root = os.path.join(args.outputpath, args.grouppath)
            # The group directory only exists once a group pad has been dumped.
            if not args.exclude_groups and os.path.isdir(group_root):
                groups = [os.path.join(group_root, x) for x in os.listdir(group_root)]
                groups = [x for x in groups if os.path.isdir(x)]
                groups.sort()
                for gp in groups:
                    if groupinfo:
                        # With a groupinfo file, only the groups it lists are indexed.
                        b = os.path.basename(gp)
                        if b not in groupinfo:
                            continue
                    try:
                        pad_infos = padids_from_path(gp)
                        pad_infos = [augment_info(x, groupinfo) for x in pad_infos]
                        pads.extend(pad_infos)
                    except OSError:
                        pass
            return pads

        groupinfo = None
        if args.groupinfo:
            with open(args.groupinfo) as gif:
                groupinfo = json.load(gif)
            if verbose:
                print ("Using groupinfo", file=sys.stderr)

        pads = get_pads(groupinfo)
        # Case-insensitive order by pad name; a key function avoids comparing
        # the record dicts themselves when two names are equal.
        pads.sort(key=lambda x: x.get("pad_name").lower())

        env = get_template_env(args.templates)
        index_template = env.get_template("index.html")
        rendered = index_template.render(
            pads = pads,
            title = args.title,
            timestamp = datetime.now()
        ).encode("utf-8")
        # Open the target only after rendering succeeded, and always close it.
        if args.output:
            with open(os.path.join(args.outputpath, args.output), "w") as out:
                out.write(rendered)
        else:
            sys.stdout.write(rendered)

    elif cmd == "revisions":
        print (padserver.getRevisionsCount(args.pad))

    elif cmd == "authors":
        print (padserver.listAuthorsOfPad(args.pad))

    elif cmd == "changeset":
        print (padserver.getRevisionChangeset(args.pad, args.rev))

    elif cmd == "history":
        revs = padserver.getRevisionsCount(args.pad)
        data = padserver.createDiffHTML(args.pad, 1, revs)
        print (data['html'])

    else:
        print ("Command '{0}' not understood, try: listpads, listgroups, dump, index, revisions, authors, changeset, history".format(args.command), file=sys.stderr)

View File

@ -21,10 +21,12 @@ def linkify (src, urlify=urlify):
contents = strip_tags(m.group(1)) contents = strip_tags(m.group(1))
collect.append(contents) collect.append(contents)
link = urlify(contents) link = urlify(contents)
# link = link.split("?", 1)[0]
return "[[<a class=\"wikilink\" href=\"{0}\">{1}</a>]]".format(link, contents) return "[[<a class=\"wikilink\" href=\"{0}\">{1}</a>]]".format(link, contents)
# src = re.sub(r"\[\[([\w_\- ,]+?)\]\]", s, src) # src = re.sub(r"\[\[([\w_\- ,]+?)\]\]", s, src)
src = re.sub(r"\[\[(.+?)\]\]", s, src) ## question marks are ignored by etherpad, so split/strip it
src = re.sub(r"\[\[(.+?)(\?.*)?\]\]", s, src)
return (src, collect) return (src, collect)