diff --git a/etherdump b/etherdump index c1b0246..295cd49 100755 --- a/etherdump +++ b/etherdump @@ -58,15 +58,16 @@ class PadServer (object): return json.load(urlopen(url))['data']['groupIDs'] def getPadText (self, padID): - data = {'apikey': self.apikey, 'padID': padID} + data = {'apikey': self.apikey, 'padID': padID.encode("utf-8")} return json.load(urlopen(self.apiurl+'getText?'+urlencode(data)))['data']['text'] def getPadHTML (self, padID): - data = {'apikey': self.apikey, 'padID': padID} + data = {'apikey': self.apikey, 'padID': padID.encode("utf-8")} return json.load(urlopen(self.apiurl+'getHTML?'+urlencode(data)))['data']['html'] def getPadLastEdited (self, padID): - raw = json.load(urlopen(self.apiurl+'getLastEdited?'+urlencode({'apikey': self.apikey, 'padID': padID})))['data']['lastEdited'] + data = {'apikey': self.apikey, 'padID': padID.encode("utf-8")} + raw = json.load(urlopen(self.apiurl+'getLastEdited?'+urlencode(data)))['data']['lastEdited'] return datetime.fromtimestamp(int(raw)/1000) def getPadURL (self, padID): @@ -88,7 +89,7 @@ def get_template_env (tpath=None): # template = env.get_template('pad.html') # print template.render(the='variables', go='here').encode("utf-8") -def dumpPads (padserver, padids, pub_path, group_path, sleeptime=0.01, skip_existing=False, templates=None): +def dumpPads (padserver, padids, pub_path, group_path, sleeptime=0.01, force=False, templates=None): template_env = get_template_env(templates) pad_template = template_env.get_template("pad.html") numpads = len(padids) @@ -117,21 +118,31 @@ def dumpPads (padserver, padids, pub_path, group_path, sleeptime=0.01, skip_exis sys.stderr.write("\rDumping pads... [{0}/{1}]".format(i+1, numpads)) sys.stderr.flush() - if skip_existing: - if os.path.exists(fp+".json"): - continue - # Write Metadata + textpath = fp + ".txt" + htmlpath = fp+".html" + metapath = fp+".json" + + last_edited = padserver.getPadLastEdited(padid).isoformat() + + if os.path.exists(metapath): + with open(metapath) as f: + meta = json.load(f) + if not force and meta.get("last_edited") == last_edited: + if verbose: + print("Up to date, skipping", file=sys.stderr) + continue + meta = { 'pad_id': padid, 'group_id': group_id, 'pad_name': pad_name } - meta['last_edited'] = padserver.getPadLastEdited(padid).isoformat() + meta['last_edited'] = last_edited + # Write Text - textpath = fp + ".txt" with open(textpath, "w") as f: try: text = padserver.getPadText(padid) @@ -143,7 +154,6 @@ def dumpPads (padserver, padids, pub_path, group_path, sleeptime=0.01, skip_exis except (TypeError, HTTPError, ValueError) as e: print(u"Warning: unable to load text for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr) - htmlpath = fp+".html" with open(htmlpath, "w") as f: html = padserver.getPadHTML(padid) meta['html_path'] = htmlpath @@ -167,7 +177,7 @@ def dumpPads (padserver, padids, pub_path, group_path, sleeptime=0.01, skip_exis # except (TypeError, HTTPError, ValueError) as e: # print(u"Warning: unable to load HTML for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr) - with open(fp+".json", "w") as f: + with open(metapath, "w") as f: f.write(json.dumps(meta)) if sleeptime: @@ -231,8 +241,10 @@ def padids_from_path (path): if __name__ == "__main__": parser = argparse.ArgumentParser() + # command parser.add_argument('command', default="", help='command to perform: listpads, listgroups, dump, createindex') + # padinfo parser.add_argument('--padinfo', default="padinfo.json", help='path to JSON file containing all pad login data, default padinfo.json, alternatively specify hostname, port, etc as separate options') parser.add_argument('--hostname', default="", help='the hostname of the etherpad server') parser.add_argument('--port', type=int, help='port of etherpad server') @@ -243,16 +255,20 @@ if __name__ == "__main__": parser.add_argument('--verbose', action="store_true", default=False, help='debug mode, verbose output') parser.add_argument('--pubpath', default="pub", help='path to dump public pads') parser.add_argument('--grouppath', default="priv", help='path to dump group pads') - parser.add_argument('--lines', default=False, action="store_true", help='output one per line instead of JSON') - parser.add_argument('--skip-existing', default=False, action="store_true", help='skip existing files on dump') - parser.add_argument('--limit', default=None, type=int, help='(dump) stop after limit items') - parser.add_argument('--templates', default=os.path.join(os.getcwd(), "templates"), help='(addition) templates path, default: ./templates') - # INDEX-specific opts + # listpads/groups-specific + parser.add_argument('--lines', default=False, action="store_true", help='(listpads/groups) output one per line instead of JSON') + + # dump-specific + parser.add_argument('--force', default=False, action="store_true", help='(dump) force dump even if up to date') + parser.add_argument('--skip', default=None, type=int, help='(dump) skip this many (start at index)') + parser.add_argument('--limit', default=None, type=int, help='(dump) stop after limit items') + + # index-specific parser.add_argument('--title', default="etherpad index & archive", help='(index) title') parser.add_argument('--exclude-groups', default=False, action="store_true", help='(index) ignore groups') - parser.add_argument('--groupinfo', default=None, help='(createindex) groupinfo json file') + parser.add_argument('--groupinfo', default=None, help='(index) groupinfo json file') parser.add_argument('--output', default=None, help='(index) path for output (default stdout)') @@ -318,6 +334,8 @@ if __name__ == "__main__": elif cmd == "dump": start = time.time() padids = padserver.listAllPads() + if args.skip: + padids = padids[args.skip:] if args.limit: padids = padids[:args.limit] @@ -326,7 +344,7 @@ if __name__ == "__main__": padids, args.pubpath, args.grouppath, - args.skip_existing, + force=args.force, templates=args.templates) if verbose: @@ -334,8 +352,24 @@ if __name__ == "__main__": elif cmd == "index": + def augment_info(info, groupinfo): + if info.get("last_edited") != None: + dt = datetime.strptime( info.get("last_edited"), "%Y-%m-%dT%H:%M:%S" ) + info['last_edited_parsed'] = dt + info['last_edited_str'] = str(dt) + + if groupinfo: + gid = info.get("group_id") + if gid.startswith("g."): + gid = gid[2:] + if gid in groupinfo: + info[u"group_name"] = groupinfo[gid].get("name") + # print (info, file=sys.stderr) + return info + def get_pads(groupinfo=None): pads = padids_from_path(args.pubpath) + pads = [augment_info(x, groupinfo) for x in pads] # print (("padids_from_path", args.pubpath, pads), file=sys.stderr) if not args.exclude_groups and os.path.exists(args.grouppath): groups = [os.path.join(args.grouppath, x) for x in os.listdir(args.grouppath)] @@ -347,7 +381,9 @@ if __name__ == "__main__": if b not in groupinfo: continue try: - pads.extend(padids_from_path(gp)) + pad_infos = padids_from_path(gp) + pad_infos = [augment_info(x, groupinfo) for x in pad_infos] + pads.extend(pad_infos) except OSError: pass return pads @@ -357,6 +393,9 @@ if __name__ == "__main__": with open(args.groupinfo) as gif: groupinfo = json.load(gif) + if verbose: + print ("Using groupinfo", file=sys.stderr) + pads = get_pads(groupinfo) padids = [(x.get("pad_name").lower(), x) for x in pads] padids.sort() @@ -372,7 +411,8 @@ if __name__ == "__main__": out.write(index_template.render( pads = pads, - title = args.title + title = args.title, + timestamp = datetime.now() ).encode("utf-8")) if args.output: diff --git a/templates/index.html b/templates/index.html index 77a5b78..85a90bc 100644 --- a/templates/index.html +++ b/templates/index.html @@ -5,16 +5,17 @@ {% block title %}{{title}}{% endblock %} {% block scripts %} - - + + {% endblock scripts %} {% block header %}

{{title}}

{% endblock %} +{% block info %}

Last updated {{timestamp}}

{% endblock %} {% block namefilter %}
- +
{% endblock %} - +
- + @@ -66,8 +73,8 @@ $("#namefilterinput").on("keyup", function (e) { namefilter($(this).val()); }) - - + + {% endfor %}
linkpad namepad name (click to view archived page) group last edited size
edit {{ pad.pad_name }}{{ pad.group_id }}{{ pad.last_edited }}{{ pad.group_name|default(pad.group_id) }}{{ pad.last_edited_str }} {{ pad.text_length_human }}