From 24a3f4ac12d59981b1c99d66c409d459601e28a5 Mon Sep 17 00:00:00 2001 From: Michael Murtaugh Date: Mon, 24 Aug 2015 16:06:50 +0200 Subject: [PATCH] moved html / history dump to main etherdump --- dump_html.py | 164 -------------- etherdump | 535 ++++++++++++--------------------------------- etherdump_original | 413 ++++++++++++++++++++++++++++++++++ linkify.py | 4 +- 4 files changed, 559 insertions(+), 557 deletions(-) delete mode 100755 dump_html.py create mode 100755 etherdump_original diff --git a/dump_html.py b/dump_html.py deleted file mode 100755 index 7432968..0000000 --- a/dump_html.py +++ /dev/null @@ -1,164 +0,0 @@ -#!/usr/bin/env python -from __future__ import print_function -from argparse import ArgumentParser -import json, sys, os, re -from datetime import datetime -import html5lib -from urllib import urlencode -from urllib2 import urlopen, HTTPError, URLError -from xml.etree import cElementTree as ET -from trim import trim_removed_spans, contents, set_text_contents, text_contents -from linkify import linkify, urlify, filename_to_padid -import jinja2 - - -def get_template_env (tpath=None): - paths = [] - if tpath and os.path.isdir(tpath): - paths.append(tpath) - # paths.append(TEMPLATES_PATH) - loader = jinja2.FileSystemLoader(paths) - env = jinja2.Environment(loader=loader) - return env - - -p = ArgumentParser("") -p.add_argument("padid", help="the padid") -p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json") -p.add_argument("--output", default="output", help="path to save files, default: output") -p.add_argument("--verbose", default=False, action="store_true") -p.add_argument("--limit", type=int, default=None) -p.add_argument("--templates", default="templates") -p.add_argument("--template", default="pad_html.html") - -p.add_argument("--padlink", default=[], action="append", help="give a pad link pattern, example: 'http\:\/\/10\.1\.10\.1/p/(.*)'") -p.add_argument("--linksearch", default=[], action="append", help="specify a link pattern to search for") -p.add_argument("--linkreplace", default=[], action="append", help="specify a replacement pattern to replace preceding linksearch") - -args = p.parse_args() -with open(args.padinfo) as f: - info = json.load(f) - -apiurl = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info) - -padlinkpats = [] -if "padlink" in info: - if type(info['padlink']) == list: - padlinkpats.extend(info['padlink']) - else: - padlinkpats.append(info['padlink']) -padlinkpats.extend(args.padlink) - -linkpats = [] # [(pat, "\\1.html") for pat in padlinkpats] -linkpats.extend(zip(args.linksearch, args.linkreplace)) - -if args.verbose: - print ("using padlinkpats", padlinkpats) - -todo = [args.padid] -done = set() -count = 0 - -env = get_template_env(args.templates) -template = env.get_template(args.template) - -while len(todo) > 0: - padid = todo[0] - todo = todo[1:] - done.add(padid) - - data = {} - data['apikey'] = info['apikey'] - data['padID'] = padid.encode("utf-8") - - if args.verbose: - print ("PADID \"{0}\"".format(padid).encode("utf-8"), file=sys.stderr) - out = "{0}/{1}".format(args.output, urlify(padid)) - print ("{0}".format(out), file=sys.stderr) - - total_revisions = apiurl+'getRevisionsCount?'+urlencode(data) - total_revisions = json.load(urlopen(total_revisions))['data']['revisions'] - if args.verbose: - print (u"{0} has {1} total revisions...".format(padid, total_revisions).encode("utf-8"), file=sys.stderr) - - data['startRev'] = "0" - requesturl = apiurl+'createDiffHTML?'+urlencode(data) - html = json.load(urlopen(requesturl))['data']['html'] - t = html5lib.parse(html, namespaceHTMLElements=False) - trim_removed_spans(t) - html = ET.tostring(t, method="html") - - # Stage 1: Process as text - # Process [[wikilink]] style links - # and add linked page names to spider todo list - html, links = linkify(html) - for l in links: - if l not in todo and l not in done: - if args.verbose: - print (" link: {0}".format(l), file=sys.stderr) - todo.append(l) - - # Stage 2: Process as ElementTree - # - t = html5lib.parse(html, namespaceHTMLElements=False) - # apply linkpats - for a in t.findall(".//a"): - href = a.attrib.get("href") - original_href = href - if href: - # if args.verbose: - # print ("searching for PADLINK: {0}".format(href)) - for pat in padlinkpats: - if re.search(pat, href) != None: - # if args.verbose: - # print (" found PADLINK: {0}".format(href)) - href = re.sub(pat, "\\1.html", href) - padid = filename_to_padid(href) - set_text_contents(a, "[[{0}]]".format(padid)) - if padid not in todo and padid not in done: - if args.verbose: - print (" link: {0}".format(padid), file=sys.stderr) - todo.append(padid) - # apply linkpats - for s, r in linkpats: - href = re.sub(s, r, href) - if href != original_href: - old_contents = text_contents(a) - # print ("OLD_CONTENTS {0}".format(old_contents)) - if old_contents == original_href: - if args.verbose: - print (" Updating href IN TEXT", file=sys.stderr) - set_text_contents(a, href) - - if original_href != href: - if args.verbose: - print (" Changed href from {0} to {1}".format(original_href, href), file=sys.stderr) - a.attrib['href'] = href - - # extract the style tag (with authorship colors) - style = t.find(".//style") - if style != None: - style = ET.tostring(style, method="html") - else: - style = "" - # and extract the contents of the body - html = contents(t.find(".//body")) - - - try: - os.makedirs(args.output) - except OSError: - pass - with open(out, "w") as f: - # f.write(html.encode("utf-8")) - f.write(template.render( - html = html, - style = style, - revision = total_revisions, - padid = padid, - timestamp = datetime.now() - ).encode("utf-8")) - - count += 1 - if args.limit and count >= args.limit: - break diff --git a/etherdump b/etherdump index c8625b5..7432968 100755 --- a/etherdump +++ b/etherdump @@ -1,413 +1,164 @@ #!/usr/bin/env python - from __future__ import print_function -import sys, argparse, json, re, os, time -from urllib2 import urlopen, HTTPError, URLError -import html5lib, urllib2, urllib -from xml.etree import ElementTree as ET -from urllib import urlencode -from urlparse import urljoin +from argparse import ArgumentParser +import json, sys, os, re from datetime import datetime -from padserver import PadServer - - -PADINFO_DEFAULTS = { - "hostname": "", - "port": 9001, - "apiversion": "1.2.9", - "apiurl": "/api/" -} - -MODULE_PATH = (os.path.dirname(__file__)) -TEMPLATES_PATH = os.path.join(MODULE_PATH, "templates") - -verbose = False - -def pad_split_group (n): - m = re.match(r"g\.(\w+)\$(.+)$", n) - if m: - return m.groups() - else: - return ('', n) - -def content(tag): - if tag.text == None: - return u''.join(ET.tostring(e) for e in tag) - else: - return tag.text + u''.join(ET.tostring(e) for e in tag) +import html5lib +from urllib import urlencode +from urllib2 import urlopen, HTTPError, URLError +from xml.etree import cElementTree as ET +from trim import trim_removed_spans, contents, set_text_contents, text_contents +from linkify import linkify, urlify, filename_to_padid +import jinja2 def get_template_env (tpath=None): - import jinja2 paths = [] if tpath and os.path.isdir(tpath): paths.append(tpath) - paths.append(TEMPLATES_PATH) + # paths.append(TEMPLATES_PATH) loader = jinja2.FileSystemLoader(paths) env = jinja2.Environment(loader=loader) return env - # template = env.get_template('pad.html') - # print template.render(the='variables', go='here').encode("utf-8") - -def dumpPads (padserver, padids, outputpath, pub_path, group_path, sleeptime=0.01, force=False, templates=None, groupinfo=None): - template_env = get_template_env(templates) - pad_template = template_env.get_template("pad.html") - numpads = len(padids) - for i, padid in enumerate(padids): - group_id, pad_name = pad_split_group(padid) - if group_id: - try: - os.mkdir(os.path.join(outputpath, group_path)) - except OSError: - pass - try: - os.mkdir(os.path.join(outputpath, group_path, group_id)) - except OSError: - pass - fp = os.path.join(outputpath, group_path, group_id, pad_name) - else: - try: - os.mkdir(os.path.join(outputpath, pub_path)) - except OSError: - pass - fp = os.path.join(outputpath, pub_path, pad_name) - - if verbose: - print (u"Saving to {0}".format(fp).encode("utf-8"), file=sys.stderr) - else: - sys.stderr.write("\rDumping pads... [{0}/{1}]".format(i+1, numpads)) - sys.stderr.flush() - - - textpath = fp + ".txt" - htmlpath = fp+".html" - metapath = fp+".json" - - last_edited = padserver.getPadLastEdited(padid) - if last_edited: - last_edited = last_edited.isoformat() - else: - last_edited = '' - - if os.path.exists(metapath): - with open(metapath) as f: - meta = json.load(f) - if not force and meta.get("last_edited") and meta.get("last_edited") == last_edited: - if verbose: - print("Up to date, skipping", file=sys.stderr) - continue - - meta = { - 'pad_id': padid, - 'group_id': group_id, - 'pad_name': pad_name - } - - meta['last_edited'] = last_edited - - - # Write Text - with open(textpath, "w") as f: - try: - text = padserver.getPadText(padid) - f.write(text.encode("utf-8")) - meta['text_path'] = os.path.relpath(textpath, outputpath) - meta['text_length'] = len(text) - meta['text_length_human'] = humanize_bytes(meta['text_length']) - - except (TypeError, HTTPError, ValueError) as e: - print(u"Warning: unable to load text for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr) - - with open(htmlpath, "w") as f: - html = padserver.getPadHTML(padid) - meta['html_path'] = os.path.relpath(htmlpath, outputpath) - meta['html_length'] = len(html) - if pad_template: - t = html5lib.parse(html, treebuilder="etree", namespaceHTMLElements=False) - body = t.find(".//body") - title = padid - editurl = padserver.getPadURL(padid, groupinfo) - meta['url'] = editurl - json_dump = json.dumps(meta) - f.write(pad_template.render( - body=content(body), - title=title, - editurl=editurl, - sourceurl=textpath, - metadata_json=json_dump).encode("utf-8")) # unicode error HERE! - else: - f.write(html.encode("utf-8")) - - # except (TypeError, HTTPError, ValueError) as e: - # print(u"Warning: unable to load HTML for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr) - - with open(metapath, "w") as f: - f.write(json.dumps(meta)) - - if sleeptime: - time.sleep(sleeptime) - - if not verbose: - sys.stderr.write("\rDumping pads... [{0}] \n".format(numpads)) - sys.stderr.flush() - - -def humanize_bytes(bytes, precision=0): - """Return a humanized string representation of a number of bytes. - - Assumes `from __future__ import division`. - - >>> humanize_bytes(1) - '1 byte' - >>> humanize_bytes(1024) - '1.0 kB' - >>> humanize_bytes(1024*123) - '123.0 kB' - >>> humanize_bytes(1024*12342) - '12.1 MB' - >>> humanize_bytes(1024*12342,2) - '12.05 MB' - >>> humanize_bytes(1024*1234,2) - '1.21 MB' - >>> humanize_bytes(1024*1234*1111,2) - '1.31 GB' - >>> humanize_bytes(1024*1234*1111,1) - '1.3 GB' - """ - abbrevs = ( - (1<<50L, 'Petabyte'), - (1<<40L, 'Tb'), - (1<<30L, 'Gb'), - (1<<20L, 'Mb'), - (1<<10L, 'kb'), - (1, 'bytes') - ) - if bytes == 1: - return '1 byte' - for factor, suffix in abbrevs: - if bytes >= factor: - break - return '%.*f %s' % (precision, bytes / factor, suffix) - -def padids_from_path (path): - from glob import glob - inputs = glob(os.path.join(path, "*.json")) - inputs.sort() - pads = [] - for fp in inputs: - with open(fp) as f: - info = json.load(f) - info['path'] = fp - pads.append(info) - return pads - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # command - parser.add_argument('command', default="", help='command to perform: listpads, listgroups, dump, createindex') +p = ArgumentParser("") +p.add_argument("padid", help="the padid") +p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json") +p.add_argument("--output", default="output", help="path to save files, default: output") +p.add_argument("--verbose", default=False, action="store_true") +p.add_argument("--limit", type=int, default=None) +p.add_argument("--templates", default="templates") +p.add_argument("--template", default="pad_html.html") - # padinfo - parser.add_argument('--padinfo', default="padinfo.json", help='path to JSON file containing all pad login data, default padinfo.json, alternatively specify hostname, port, etc as separate options') - parser.add_argument('--hostname', default="", help='the hostname of the etherpad server') - parser.add_argument('--port', type=int, help='port of etherpad server') - parser.add_argument('--apikey', help='API key') - parser.add_argument('--apiversion', help='the version of the etherpad api') - parser.add_argument('--apiurl', help='URL path to the API') +p.add_argument("--padlink", default=[], action="append", help="give a pad link pattern, example: 'http\:\/\/10\.1\.10\.1/p/(.*)'") +p.add_argument("--linksearch", default=[], action="append", help="specify a link pattern to search for") +p.add_argument("--linkreplace", default=[], action="append", help="specify a replacement pattern to replace preceding linksearch") - parser.add_argument('--verbose', action="store_true", default=False, help='debug mode, verbose output') - parser.add_argument('--outputpath', default=os.getcwd(), help='path for output, default is .') - parser.add_argument('--pubpath', default="pub", help='path to dump public pads') - parser.add_argument('--grouppath', default="priv", help='path to dump group pads') - parser.add_argument('--templates', default=os.path.join(os.getcwd(), "templates"), help='(addition) templates path, default: ./templates') +args = p.parse_args() +with open(args.padinfo) as f: + info = json.load(f) - # listpads/groups-specific - parser.add_argument('--lines', default=False, action="store_true", help='(listpads/groups) output one per line instead of JSON') +apiurl = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info) - # dump-specific - parser.add_argument('--force', default=False, action="store_true", help='(dump) force dump even if up to date') - parser.add_argument('--skip', default=None, type=int, help='(dump) skip this many (start at index)') - parser.add_argument('--limit', default=None, type=int, help='(dump) stop after limit items') - - # index-specific - parser.add_argument('--title', default="etherpad index & archive", help='(index) title') - parser.add_argument('--exclude-groups', default=False, action="store_true", help='(index) ignore groups') - parser.add_argument('--groupinfo', default=None, help='(index) groupinfo json file') - parser.add_argument('--output', default=None, help='(index) path for output (default stdout)') - - parser.add_argument('--pad', default="start", help='(history) pad id') - parser.add_argument('--rev', default="", help='(history) revision id') - - args = parser.parse_args() - - verbose = args.verbose - padinfo = PADINFO_DEFAULTS - if args.padinfo: - try: - with open(args.padinfo) as f: - for key, value in json.load(f).items(): - padinfo[key] = value - except IOError, e: - print("WARNING: Tried to open {0}, but couldn't ({1})".format(args.padinfo, e), file=sys.stderr) - except ValueError, e: - print("WARNING: Error reading {0}, check the JSON syntax ({1})".format(args.padinfo, e)) - - # allow explicit opts to override - - if args.hostname: - padinfo['hostname'] = args.hostname - if args.port: - padinfo['port'] = args.port - if args.apikey: - padinfo['apikey'] = args.apikey - if args.apiversion: - padinfo['apiversion'] = args.apiversion - if args.apiurl: - padinfo['apiurl'] = args.apiurl - - padserver = PadServer( - hostname=padinfo.get("hostname"), - port=padinfo.get("port"), - apipath=padinfo.get("apiurl"), - apiversion=padinfo.get("apiversion"), - apikey=padinfo.get("apikey") - ) - - print ("Connecting to {0}".format(padserver.apiurl), file=sys.stderr) - - ############################### - # Command Dispatch - ############################### - - cmd = args.command.lower() - if cmd == "listpads": - padids = padserver.listAllPads() - if not args.lines: - json.dump(padids, sys.stdout) - else: - for padid in padids: - print(padid) - - elif cmd == "listgroups": - groupids = padserver.listAllGroups() - if not args.lines: - json.dump(groupids, sys.stdout) - else: - for gid in groupids: - print(gid) - - elif cmd == "dump": - groupinfo = None - if args.groupinfo: - with open(args.groupinfo) as gif: - groupinfo = json.load(gif) - - if verbose: - print ("Using groupinfo", file=sys.stderr) - - start = time.time() - padids = padserver.listAllPads() - if args.skip: - padids = padids[args.skip:] - if args.limit: - padids = padids[:args.limit] - - dumpPads( - padserver, - padids, - args.outputpath, - args.pubpath, - args.grouppath, - force=args.force, - templates=args.templates, - groupinfo=groupinfo) - - if verbose: - print ("Completed in {0:0.0f} seconds".format(time.time()-start), file=sys.stderr) - - elif cmd == "index": - - def augment_info(info, groupinfo): - if info.get("last_edited"): - dt = datetime.strptime( info.get("last_edited"), "%Y-%m-%dT%H:%M:%S" ) - info['last_edited_parsed'] = dt - info['last_edited_str'] = str(dt) - - if groupinfo: - gid = info.get("group_id") - if gid.startswith("g."): - gid = gid[2:] - if gid in groupinfo: - info[u"group_name"] = groupinfo[gid].get("name") - # print (info, file=sys.stderr) - return info - - def get_pads(groupinfo=None): - pads = padids_from_path(os.path.join(args.outputpath, args.pubpath)) - pads = [augment_info(x, groupinfo) for x in pads] - # print (("padids_from_path", args.pubpath, pads), file=sys.stderr) - gp = os.path.join(args.outputpath, args.grouppath) - if not args.exclude_groups and gp: - groups = [os.path.join(gp, x) for x in os.listdir(gp)] - groups = [x for x in groups if os.path.isdir(x)] - groups.sort() - for gp in groups: - if groupinfo: - b = os.path.basename(gp) - if b not in groupinfo: - continue - try: - pad_infos = padids_from_path(gp) - pad_infos = [augment_info(x, groupinfo) for x in pad_infos] - pads.extend(pad_infos) - except OSError: - pass - return pads - - groupinfo = None - if args.groupinfo: - with open(args.groupinfo) as gif: - groupinfo = json.load(gif) - - if verbose: - print ("Using groupinfo", file=sys.stderr) - - pads = get_pads(groupinfo) - padids = [(x.get("pad_name").lower(), x) for x in pads] - padids.sort() - pads = [x[1] for x in padids] - - out = sys.stdout - if args.output: - out = open(os.path.join(args.outputpath, args.output), "w") - - import jinja2 - env = get_template_env(args.templates) - index_template = env.get_template("index.html") - - out.write(index_template.render( - pads = pads, - title = args.title, +padlinkpats = [] +if "padlink" in info: + if type(info['padlink']) == list: + padlinkpats.extend(info['padlink']) + else: + padlinkpats.append(info['padlink']) +padlinkpats.extend(args.padlink) + +linkpats = [] # [(pat, "\\1.html") for pat in padlinkpats] +linkpats.extend(zip(args.linksearch, args.linkreplace)) + +if args.verbose: + print ("using padlinkpats", padlinkpats) + +todo = [args.padid] +done = set() +count = 0 + +env = get_template_env(args.templates) +template = env.get_template(args.template) + +while len(todo) > 0: + padid = todo[0] + todo = todo[1:] + done.add(padid) + + data = {} + data['apikey'] = info['apikey'] + data['padID'] = padid.encode("utf-8") + + if args.verbose: + print ("PADID \"{0}\"".format(padid).encode("utf-8"), file=sys.stderr) + out = "{0}/{1}".format(args.output, urlify(padid)) + print ("{0}".format(out), file=sys.stderr) + + total_revisions = apiurl+'getRevisionsCount?'+urlencode(data) + total_revisions = json.load(urlopen(total_revisions))['data']['revisions'] + if args.verbose: + print (u"{0} has {1} total revisions...".format(padid, total_revisions).encode("utf-8"), file=sys.stderr) + + data['startRev'] = "0" + requesturl = apiurl+'createDiffHTML?'+urlencode(data) + html = json.load(urlopen(requesturl))['data']['html'] + t = html5lib.parse(html, namespaceHTMLElements=False) + trim_removed_spans(t) + html = ET.tostring(t, method="html") + + # Stage 1: Process as text + # Process [[wikilink]] style links + # and add linked page names to spider todo list + html, links = linkify(html) + for l in links: + if l not in todo and l not in done: + if args.verbose: + print (" link: {0}".format(l), file=sys.stderr) + todo.append(l) + + # Stage 2: Process as ElementTree + # + t = html5lib.parse(html, namespaceHTMLElements=False) + # apply linkpats + for a in t.findall(".//a"): + href = a.attrib.get("href") + original_href = href + if href: + # if args.verbose: + # print ("searching for PADLINK: {0}".format(href)) + for pat in padlinkpats: + if re.search(pat, href) != None: + # if args.verbose: + # print (" found PADLINK: {0}".format(href)) + href = re.sub(pat, "\\1.html", href) + padid = filename_to_padid(href) + set_text_contents(a, "[[{0}]]".format(padid)) + if padid not in todo and padid not in done: + if args.verbose: + print (" link: {0}".format(padid), file=sys.stderr) + todo.append(padid) + # apply linkpats + for s, r in linkpats: + href = re.sub(s, r, href) + if href != original_href: + old_contents = text_contents(a) + # print ("OLD_CONTENTS {0}".format(old_contents)) + if old_contents == original_href: + if args.verbose: + print (" Updating href IN TEXT", file=sys.stderr) + set_text_contents(a, href) + + if original_href != href: + if args.verbose: + print (" Changed href from {0} to {1}".format(original_href, href), file=sys.stderr) + a.attrib['href'] = href + + # extract the style tag (with authorship colors) + style = t.find(".//style") + if style != None: + style = ET.tostring(style, method="html") + else: + style = "" + # and extract the contents of the body + html = contents(t.find(".//body")) + + + try: + os.makedirs(args.output) + except OSError: + pass + with open(out, "w") as f: + # f.write(html.encode("utf-8")) + f.write(template.render( + html = html, + style = style, + revision = total_revisions, + padid = padid, timestamp = datetime.now() ).encode("utf-8")) - if args.output: - out.close() - - elif cmd == "revisions": - print (padserver.getRevisionsCount(args.pad)) - - elif cmd == "authors": - print (padserver.listAuthorsOfPad(args.pad)) - - elif cmd == "changeset": - print (padserver.getRevisionChangeset(args.pad, args.rev)) - - elif cmd == "history": - revs = padserver.getRevisionsCount(args.pad) - data = padserver.createDiffHTML(args.pad, 1, revs) - print (data['html']) - - else: - print ("Command '{0}' not understood, try: listpads, listgroups, dump".format(args.command), file=sys.stderr) + count += 1 + if args.limit and count >= args.limit: + break diff --git a/etherdump_original b/etherdump_original new file mode 100755 index 0000000..c8625b5 --- /dev/null +++ b/etherdump_original @@ -0,0 +1,413 @@ +#!/usr/bin/env python + +from __future__ import print_function +import sys, argparse, json, re, os, time +from urllib2 import urlopen, HTTPError, URLError +import html5lib, urllib2, urllib +from xml.etree import ElementTree as ET +from urllib import urlencode +from urlparse import urljoin +from datetime import datetime +from padserver import PadServer + + +PADINFO_DEFAULTS = { + "hostname": "", + "port": 9001, + "apiversion": "1.2.9", + "apiurl": "/api/" +} + +MODULE_PATH = (os.path.dirname(__file__)) +TEMPLATES_PATH = os.path.join(MODULE_PATH, "templates") + +verbose = False + +def pad_split_group (n): + m = re.match(r"g\.(\w+)\$(.+)$", n) + if m: + return m.groups() + else: + return ('', n) + +def content(tag): + if tag.text == None: + return u''.join(ET.tostring(e) for e in tag) + else: + return tag.text + u''.join(ET.tostring(e) for e in tag) + + +def get_template_env (tpath=None): + import jinja2 + paths = [] + if tpath and os.path.isdir(tpath): + paths.append(tpath) + paths.append(TEMPLATES_PATH) + loader = jinja2.FileSystemLoader(paths) + env = jinja2.Environment(loader=loader) + return env + # template = env.get_template('pad.html') + # print template.render(the='variables', go='here').encode("utf-8") + +def dumpPads (padserver, padids, outputpath, pub_path, group_path, sleeptime=0.01, force=False, templates=None, groupinfo=None): + template_env = get_template_env(templates) + pad_template = template_env.get_template("pad.html") + numpads = len(padids) + for i, padid in enumerate(padids): + group_id, pad_name = pad_split_group(padid) + if group_id: + try: + os.mkdir(os.path.join(outputpath, group_path)) + except OSError: + pass + try: + os.mkdir(os.path.join(outputpath, group_path, group_id)) + except OSError: + pass + fp = os.path.join(outputpath, group_path, group_id, pad_name) + else: + try: + os.mkdir(os.path.join(outputpath, pub_path)) + except OSError: + pass + fp = os.path.join(outputpath, pub_path, pad_name) + + if verbose: + print (u"Saving to {0}".format(fp).encode("utf-8"), file=sys.stderr) + else: + sys.stderr.write("\rDumping pads... [{0}/{1}]".format(i+1, numpads)) + sys.stderr.flush() + + + textpath = fp + ".txt" + htmlpath = fp+".html" + metapath = fp+".json" + + last_edited = padserver.getPadLastEdited(padid) + if last_edited: + last_edited = last_edited.isoformat() + else: + last_edited = '' + + if os.path.exists(metapath): + with open(metapath) as f: + meta = json.load(f) + if not force and meta.get("last_edited") and meta.get("last_edited") == last_edited: + if verbose: + print("Up to date, skipping", file=sys.stderr) + continue + + meta = { + 'pad_id': padid, + 'group_id': group_id, + 'pad_name': pad_name + } + + meta['last_edited'] = last_edited + + + # Write Text + with open(textpath, "w") as f: + try: + text = padserver.getPadText(padid) + f.write(text.encode("utf-8")) + meta['text_path'] = os.path.relpath(textpath, outputpath) + meta['text_length'] = len(text) + meta['text_length_human'] = humanize_bytes(meta['text_length']) + + except (TypeError, HTTPError, ValueError) as e: + print(u"Warning: unable to load text for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr) + + with open(htmlpath, "w") as f: + html = padserver.getPadHTML(padid) + meta['html_path'] = os.path.relpath(htmlpath, outputpath) + meta['html_length'] = len(html) + if pad_template: + t = html5lib.parse(html, treebuilder="etree", namespaceHTMLElements=False) + body = t.find(".//body") + title = padid + editurl = padserver.getPadURL(padid, groupinfo) + meta['url'] = editurl + json_dump = json.dumps(meta) + f.write(pad_template.render( + body=content(body), + title=title, + editurl=editurl, + sourceurl=textpath, + metadata_json=json_dump).encode("utf-8")) # unicode error HERE! + else: + f.write(html.encode("utf-8")) + + # except (TypeError, HTTPError, ValueError) as e: + # print(u"Warning: unable to load HTML for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr) + + with open(metapath, "w") as f: + f.write(json.dumps(meta)) + + if sleeptime: + time.sleep(sleeptime) + + if not verbose: + sys.stderr.write("\rDumping pads... [{0}] \n".format(numpads)) + sys.stderr.flush() + + +def humanize_bytes(bytes, precision=0): + """Return a humanized string representation of a number of bytes. + + Assumes `from __future__ import division`. + + >>> humanize_bytes(1) + '1 byte' + >>> humanize_bytes(1024) + '1.0 kB' + >>> humanize_bytes(1024*123) + '123.0 kB' + >>> humanize_bytes(1024*12342) + '12.1 MB' + >>> humanize_bytes(1024*12342,2) + '12.05 MB' + >>> humanize_bytes(1024*1234,2) + '1.21 MB' + >>> humanize_bytes(1024*1234*1111,2) + '1.31 GB' + >>> humanize_bytes(1024*1234*1111,1) + '1.3 GB' + """ + abbrevs = ( + (1<<50L, 'Petabyte'), + (1<<40L, 'Tb'), + (1<<30L, 'Gb'), + (1<<20L, 'Mb'), + (1<<10L, 'kb'), + (1, 'bytes') + ) + if bytes == 1: + return '1 byte' + for factor, suffix in abbrevs: + if bytes >= factor: + break + return '%.*f %s' % (precision, bytes / factor, suffix) + +def padids_from_path (path): + from glob import glob + inputs = glob(os.path.join(path, "*.json")) + inputs.sort() + pads = [] + for fp in inputs: + with open(fp) as f: + info = json.load(f) + info['path'] = fp + pads.append(info) + return pads + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + # command + parser.add_argument('command', default="", help='command to perform: listpads, listgroups, dump, createindex') + + # padinfo + parser.add_argument('--padinfo', default="padinfo.json", help='path to JSON file containing all pad login data, default padinfo.json, alternatively specify hostname, port, etc as separate options') + parser.add_argument('--hostname', default="", help='the hostname of the etherpad server') + parser.add_argument('--port', type=int, help='port of etherpad server') + parser.add_argument('--apikey', help='API key') + parser.add_argument('--apiversion', help='the version of the etherpad api') + parser.add_argument('--apiurl', help='URL path to the API') + + parser.add_argument('--verbose', action="store_true", default=False, help='debug mode, verbose output') + parser.add_argument('--outputpath', default=os.getcwd(), help='path for output, default is .') + parser.add_argument('--pubpath', default="pub", help='path to dump public pads') + parser.add_argument('--grouppath', default="priv", help='path to dump group pads') + parser.add_argument('--templates', default=os.path.join(os.getcwd(), "templates"), help='(addition) templates path, default: ./templates') + + # listpads/groups-specific + parser.add_argument('--lines', default=False, action="store_true", help='(listpads/groups) output one per line instead of JSON') + + # dump-specific + parser.add_argument('--force', default=False, action="store_true", help='(dump) force dump even if up to date') + parser.add_argument('--skip', default=None, type=int, help='(dump) skip this many (start at index)') + parser.add_argument('--limit', default=None, type=int, help='(dump) stop after limit items') + + # index-specific + parser.add_argument('--title', default="etherpad index & archive", help='(index) title') + parser.add_argument('--exclude-groups', default=False, action="store_true", help='(index) ignore groups') + parser.add_argument('--groupinfo', default=None, help='(index) groupinfo json file') + parser.add_argument('--output', default=None, help='(index) path for output (default stdout)') + + parser.add_argument('--pad', default="start", help='(history) pad id') + parser.add_argument('--rev', default="", help='(history) revision id') + + args = parser.parse_args() + + verbose = args.verbose + padinfo = PADINFO_DEFAULTS + if args.padinfo: + try: + with open(args.padinfo) as f: + for key, value in json.load(f).items(): + padinfo[key] = value + except IOError, e: + print("WARNING: Tried to open {0}, but couldn't ({1})".format(args.padinfo, e), file=sys.stderr) + except ValueError, e: + print("WARNING: Error reading {0}, check the JSON syntax ({1})".format(args.padinfo, e)) + + # allow explicit opts to override + + if args.hostname: + padinfo['hostname'] = args.hostname + if args.port: + padinfo['port'] = args.port + if args.apikey: + padinfo['apikey'] = args.apikey + if args.apiversion: + padinfo['apiversion'] = args.apiversion + if args.apiurl: + padinfo['apiurl'] = args.apiurl + + padserver = PadServer( + hostname=padinfo.get("hostname"), + port=padinfo.get("port"), + apipath=padinfo.get("apiurl"), + apiversion=padinfo.get("apiversion"), + apikey=padinfo.get("apikey") + ) + + print ("Connecting to {0}".format(padserver.apiurl), file=sys.stderr) + + ############################### + # Command Dispatch + ############################### + + cmd = args.command.lower() + if cmd == "listpads": + padids = padserver.listAllPads() + if not args.lines: + json.dump(padids, sys.stdout) + else: + for padid in padids: + print(padid) + + elif cmd == "listgroups": + groupids = padserver.listAllGroups() + if not args.lines: + json.dump(groupids, sys.stdout) + else: + for gid in groupids: + print(gid) + + elif cmd == "dump": + groupinfo = None + if args.groupinfo: + with open(args.groupinfo) as gif: + groupinfo = json.load(gif) + + if verbose: + print ("Using groupinfo", file=sys.stderr) + + start = time.time() + padids = padserver.listAllPads() + if args.skip: + padids = padids[args.skip:] + if args.limit: + padids = padids[:args.limit] + + dumpPads( + padserver, + padids, + args.outputpath, + args.pubpath, + args.grouppath, + force=args.force, + templates=args.templates, + groupinfo=groupinfo) + + if verbose: + print ("Completed in {0:0.0f} seconds".format(time.time()-start), file=sys.stderr) + + elif cmd == "index": + + def augment_info(info, groupinfo): + if info.get("last_edited"): + dt = datetime.strptime( info.get("last_edited"), "%Y-%m-%dT%H:%M:%S" ) + info['last_edited_parsed'] = dt + info['last_edited_str'] = str(dt) + + if groupinfo: + gid = info.get("group_id") + if gid.startswith("g."): + gid = gid[2:] + if gid in groupinfo: + info[u"group_name"] = groupinfo[gid].get("name") + # print (info, file=sys.stderr) + return info + + def get_pads(groupinfo=None): + pads = padids_from_path(os.path.join(args.outputpath, args.pubpath)) + pads = [augment_info(x, groupinfo) for x in pads] + # print (("padids_from_path", args.pubpath, pads), file=sys.stderr) + gp = os.path.join(args.outputpath, args.grouppath) + if not args.exclude_groups and gp: + groups = [os.path.join(gp, x) for x in os.listdir(gp)] + groups = [x for x in groups if os.path.isdir(x)] + groups.sort() + for gp in groups: + if groupinfo: + b = os.path.basename(gp) + if b not in groupinfo: + continue + try: + pad_infos = padids_from_path(gp) + pad_infos = [augment_info(x, groupinfo) for x in pad_infos] + pads.extend(pad_infos) + except OSError: + pass + return pads + + groupinfo = None + if args.groupinfo: + with open(args.groupinfo) as gif: + groupinfo = json.load(gif) + + if verbose: + print ("Using groupinfo", file=sys.stderr) + + pads = get_pads(groupinfo) + padids = [(x.get("pad_name").lower(), x) for x in pads] + padids.sort() + pads = [x[1] for x in padids] + + out = sys.stdout + if args.output: + out = open(os.path.join(args.outputpath, args.output), "w") + + import jinja2 + env = get_template_env(args.templates) + index_template = env.get_template("index.html") + + out.write(index_template.render( + pads = pads, + title = args.title, + timestamp = datetime.now() + ).encode("utf-8")) + + if args.output: + out.close() + + elif cmd == "revisions": + print (padserver.getRevisionsCount(args.pad)) + + elif cmd == "authors": + print (padserver.listAuthorsOfPad(args.pad)) + + elif cmd == "changeset": + print (padserver.getRevisionChangeset(args.pad, args.rev)) + + elif cmd == "history": + revs = padserver.getRevisionsCount(args.pad) + data = padserver.createDiffHTML(args.pad, 1, revs) + print (data['html']) + + else: + print ("Command '{0}' not understood, try: listpads, listgroups, dump".format(args.command), file=sys.stderr) diff --git a/linkify.py b/linkify.py index 852435d..b0ca66d 100644 --- a/linkify.py +++ b/linkify.py @@ -21,10 +21,12 @@ def linkify (src, urlify=urlify): contents = strip_tags(m.group(1)) collect.append(contents) link = urlify(contents) + # link = link.split("?", 1)[0] return "[[{1}]]".format(link, contents) # src = re.sub(r"\[\[([\w_\- ,]+?)\]\]", s, src) - src = re.sub(r"\[\[(.+?)\]\]", s, src) + ## question marks are ignored by etherpad, so split/strip it + src = re.sub(r"\[\[(.+?)(\?.*)?\]\]", s, src) return (src, collect)