new pull, new meta style from live constant etherdump

This commit is contained in:
Michael Murtaugh 2016-01-08 12:09:05 +01:00
parent 8d5ebd6f01
commit 2f1c5603e2
5 changed files with 151 additions and 91 deletions

View File

@ -56,4 +56,5 @@ etherdump sync
why why
------- -------
Etherdump is useful as a means of dumping the contents of etherpad to files, as a way of opening up the contents of the service to other services / methods / uses / tools / situations. (Files also of course allow for archival tools / methods) Etherdump is useful as a means of dumping the contents of etherpad to files, as a way of opening up the contents of the service to other services / methods / uses / tools / situations. (Files also of course allow for archival tools / methods)

View File

@ -1,7 +1,10 @@
from __future__ import print_function
import re, os, json, sys import re, os, json, sys
from urllib import quote_plus, unquote_plus from urllib import quote_plus, unquote_plus
from math import ceil, floor from math import ceil, floor
from urllib2 import urlopen from urllib2 import urlopen, HTTPError
from time import sleep
groupnamepat = re.compile(r"^g\.(\w+)\$") groupnamepat = re.compile(r"^g\.(\w+)\$")
def splitpadname (padid): def splitpadname (padid):
@ -39,11 +42,27 @@ def padpath2id (path):
else: else:
return p.decode("utf-8") return p.decode("utf-8")
def getjson (url, max_retry=3, retry_sleep_time=0.5):
    """Fetch *url* and decode the response body as JSON, retrying on HTTP errors.

    Returns the decoded JSON object (assumed to be a dict) augmented with
    bookkeeping keys:
      "_retries": number of failed attempts before success (0 on first try),
      "_code":    HTTP status code of the last response (success or error),
      "_url":     final URL, present only when the request was redirected.
    After max_retry consecutive HTTPErrors the partial dict (bookkeeping
    keys only, no payload) is returned instead of raising.
    NOTE(review): a non-HTTP failure (URLError, invalid JSON) still
    propagates — only HTTPError is retried.
    """
    ret = {}
    ret["_retries"] = 0
    while ret["_retries"] <= max_retry:
        try:
            f = urlopen(url)
            data = f.read()
            rurl = f.geturl()
            f.close()
            ret.update(json.loads(data))
            ret["_code"] = f.getcode()
            # record the final URL only when a redirect actually happened
            if rurl != url:
                ret["_url"] = rurl
            return ret
        except HTTPError as e:
            print ("HTTPError {0}".format(e), file=sys.stderr)
            ret["_code"] = e.code
            ret["_retries"] += 1
            # back off briefly between attempts unless disabled (0/None)
            if retry_sleep_time:
                sleep(retry_sleep_time)
    return ret
def loadpadinfo(p): def loadpadinfo(p):
with open(p) as f: with open(p) as f:

View File

@ -6,6 +6,8 @@ import json, os, re
from urllib import urlencode from urllib import urlencode
from urllib2 import urlopen, HTTPError, URLError from urllib2 import urlopen, HTTPError, URLError
from jinja2 import FileSystemLoader, Environment from jinja2 import FileSystemLoader, Environment
from datetime import datetime
def group (items, key=lambda x: x): def group (items, key=lambda x: x):
ret = [] ret = []
@ -33,28 +35,38 @@ def main(args):
tmpath = os.path.join(tmpath, "data", "templates") tmpath = os.path.join(tmpath, "data", "templates")
env = Environment(loader=FileSystemLoader(tmpath)) env = Environment(loader=FileSystemLoader(tmpath))
template = env.get_template("pad_index.html") template = env.get_template("index.html")
def base (x):
return re.sub(r"(\.raw\.html)|(\.diff\.html)|(\.meta\.json)|(\.raw\.txt)$", "", x)
inputs = args.input inputs = args.input
inputs.sort() inputs.sort()
inputs = [x for x in inputs if os.path.isdir(x)] inputs = group(inputs, base)
def base (x): def loadmeta(paths):
return re.sub(r"(\.html)|(\.diff\.html)|(\.meta\.json)|(\.txt)$", "", x) for p in paths:
if p.endswith(".meta.json"):
with open(p) as f:
return json.load(f)
inputs = map(loadmeta, inputs)
# sort by last edited (reverse)
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print (template.render({"timestamp": timestamp, "pads": inputs}).encode("utf-8"))
# TODO: MODIFY THIS TO MAKE THE OUTPUT JOINABLE with the collected META DATA # TODO: MODIFY THIS TO MAKE THE OUTPUT JOINABLE with the collected META DATA
# evt: how can the metadata become a GRAPH structure!!! with each output DOCUMENT # evt: how can the metadata become a GRAPH structure!!! with each output DOCUMENT
# #
print ("<ol>") # print ("<ol>")
for x in inputs: # for x in inputs:
padid = x # padid = x
metapath = os.path.join(x, "{0}.meta.json".format(padid)) # metapath = os.path.join(x, "{0}.meta.json".format(padid))
if os.path.exists(metapath): # if os.path.exists(metapath):
print ("""<li><a href="{0}">{0}</a></li>""".format(x)) # print ("""<li><a href="{0}">{0}</a></li>""".format(x))
with open(metapath) as f: # with open(metapath) as f:
meta = json.load(f) # meta = json.load(f)
indexpath = os.path.join(x, "index.html") # indexpath = os.path.join(x, "index.html")
with open(indexpath, "w") as f: # with open(indexpath, "w") as f:
print (template.render(**meta).encode("utf-8"), file=f)
print ("</ol>") # print ("</ol>")

View File

@ -3,7 +3,7 @@ from __future__ import print_function
from argparse import ArgumentParser from argparse import ArgumentParser
import sys, json, re, os import sys, json, re, os
from datetime import datetime from datetime import datetime
from urllib import urlencode from urllib import urlencode, quote
from urllib2 import HTTPError from urllib2 import HTTPError
from common import * from common import *
from time import sleep from time import sleep
@ -26,7 +26,7 @@ def main (args):
p.add_argument("padid", nargs="*", default=[]) p.add_argument("padid", nargs="*", default=[])
p.add_argument("--padinfo", default=".etherdump/settings.json", help="settings, default: .etherdump/settings.json") p.add_argument("--padinfo", default=".etherdump/settings.json", help="settings, default: .etherdump/settings.json")
p.add_argument("--zerorevs", default=False, action="store_true", help="include pads with zero revisions, default: False (i.e. pads with no revisions are skipped)") p.add_argument("--zerorevs", default=False, action="store_true", help="include pads with zero revisions, default: False (i.e. pads with no revisions are skipped)")
p.add_argument("--pub", default=".", help="folder to store files for public pads, default: pub") p.add_argument("--pub", default="p", help="folder to store files for public pads, default: pub")
p.add_argument("--group", default="g", help="folder to store files for group pads, default: g") p.add_argument("--group", default="g", help="folder to store files for group pads, default: g")
p.add_argument("--skip", default=None, type=int, help="skip this many items, default: None") p.add_argument("--skip", default=None, type=int, help="skip this many items, default: None")
p.add_argument("--meta", default=False, action="store_true", help="download meta to PADID.meta.json, default: False") p.add_argument("--meta", default=False, action="store_true", help="download meta to PADID.meta.json, default: False")
@ -34,7 +34,7 @@ def main (args):
p.add_argument("--html", default=False, action="store_true", help="download html to PADID.html, default: False") p.add_argument("--html", default=False, action="store_true", help="download html to PADID.html, default: False")
p.add_argument("--dhtml", default=False, action="store_true", help="download dhtml to PADID.dhtml, default: False") p.add_argument("--dhtml", default=False, action="store_true", help="download dhtml to PADID.dhtml, default: False")
p.add_argument("--all", default=False, action="store_true", help="download all files (meta, text, html, dhtml), default: False") p.add_argument("--all", default=False, action="store_true", help="download all files (meta, text, html, dhtml), default: False")
p.add_argument("--folder", default=False, action="store_true", help="dump files to folder named PADID (meta, text, html, dhtml), default: False") p.add_argument("--folder", default=False, action="store_true", help="dump files in a folder named PADID (meta, text, html, dhtml), default: False")
p.add_argument("--output", default=False, action="store_true", help="output changed padids on stdout") p.add_argument("--output", default=False, action="store_true", help="output changed padids on stdout")
p.add_argument("--force", default=False, action="store_true", help="reload, even if revisions count matches previous") p.add_argument("--force", default=False, action="store_true", help="reload, even if revisions count matches previous")
args = p.parse_args(args) args = p.parse_args(args)
@ -66,10 +66,6 @@ storing enough information to reconstruct (or understand an error occurred)
data['padID'] = padid.encode("utf-8") data['padID'] = padid.encode("utf-8")
p = padpath(padid, args.pub, args.group) p = padpath(padid, args.pub, args.group)
if args.folder: if args.folder:
try:
os.makedirs(p)
except OSError:
pass
p = os.path.join(p, padid.encode("utf-8")) p = os.path.join(p, padid.encode("utf-8"))
metapath = p + ".meta.json" metapath = p + ".meta.json"
@ -77,13 +73,14 @@ storing enough information to reconstruct (or understand an error occurred)
tries = 1 tries = 1
skip = False skip = False
padurlbase = re.sub(r"api/1.2.9/$", "p/", info["apiurl"]) padurlbase = re.sub(r"api/1.2.9/$", "p/", info["apiurl"])
meta = {}
if type(padurlbase) == unicode: if type(padurlbase) == unicode:
padurlbase = padurlbase.encode("utf-8") padurlbase = padurlbase.encode("utf-8")
while True: while True:
try: try:
if os.path.exists(metapath): if os.path.exists(metapath):
with open(metapath) as f: with open(metapath) as f:
meta = json.load(f) meta.update(json.load(f))
revisions = getjson(info['apiurl']+'getRevisionsCount?'+urlencode(data))['data']['revisions'] revisions = getjson(info['apiurl']+'getRevisionsCount?'+urlencode(data))['data']['revisions']
if meta['revisions'] == revisions and not args.force: if meta['revisions'] == revisions and not args.force:
skip=True skip=True
@ -91,11 +88,13 @@ storing enough information to reconstruct (or understand an error occurred)
## TODO: OUTPUT TO DIRECTORIES with DATA EMBEDDED IN DOCUMENTS ## TODO: OUTPUT TO DIRECTORIES with DATA EMBEDDED IN DOCUMENTS
## (or else in surrounding meta data!!) ## (or else in surrounding meta data!!)
meta = {'padid': padid.encode("utf-8")} meta['padid'] = padid.encode("utf-8")
# this should be less of a hack versions = meta["versions"] = []
# TODO TEST!!! versions.append({
"url": padurlbase + padid.encode("utf-8"),
meta["padurl"] = padurlbase + padid.encode("utf-8") "type": "pad",
"code": 200
})
if revisions == None: if revisions == None:
meta['revisions'] = getjson(info['apiurl']+'getRevisionsCount?'+urlencode(data))['data']['revisions'] meta['revisions'] = getjson(info['apiurl']+'getRevisionsCount?'+urlencode(data))['data']['revisions']
@ -137,48 +136,53 @@ storing enough information to reconstruct (or understand an error occurred)
except OSError: except OSError:
pass pass
# Process text, html, dhtml, all options
if args.all or args.html:
html = getjson(info['apiurl']+'getHTML?'+urlencode(data))
ver = {"type": "html"}
versions.append(ver)
ver["code"] = html["_code"]
if html["_code"] == 200:
html = html['data']['html']
ver["path"] = p+".raw.html"
ver["url"] = quote(ver["path"])
with open(ver["path"], "w") as f:
f.write(html.encode("utf-8"))
if args.all or args.text:
text = getjson(info['apiurl']+'getText?'+urlencode(data))
ver = {"type": "text"}
versions.append(ver)
ver["code"] = text["_code"]
if text["_code"] == 200:
text = text['data']['text']
ver["path"] = p+".raw.txt"
ver["url"] = quote(ver["path"])
with open(ver["path"], "w") as f:
f.write(text.encode("utf-8"))
# once the content is settled, compute a hash
# and link it in the metadata!
if args.all or args.dhtml:
data['startRev'] = "0"
html = getjson(info['apiurl']+'createDiffHTML?'+urlencode(data))
ver = {"type": "diffhtml"}
versions.append(ver)
ver["code"] = html["_code"]
if html["_code"] == 200:
html = html['data']['html']
ver["path"] = p+".diff.html"
ver["url"] = quote(ver["path"])
with open(ver["path"], "w") as f:
f.write(html.encode("utf-8"))
# output meta
if args.all or args.meta: if args.all or args.meta:
ver = {"type": "meta"}
versions.append(ver)
ver["path"] = metapath
ver["url"] = quote(metapath)
with open(metapath, "w") as f: with open(metapath, "w") as f:
json.dump(meta, f, indent=2) json.dump(meta, f, indent=2)
# Process text, html, dhtml, all options
if args.all or args.text:
text = getjson(info['apiurl']+'getText?'+urlencode(data))
text = text['data']['text']
with open(p+".txt", "w") as f:
f.write(text.encode("utf-8"))
# once the content is settled, compute a hash
# and link it in the metadata!
if args.all or args.html:
html = getjson(info['apiurl']+'getHTML?'+urlencode(data))
html = html['data']['html']
with open(p+".html", "w") as f:
f.write(html.encode("utf-8"))
if args.all or args.dhtml:
tries = 0
skip = False
while not skip:
try:
data['startRev'] = "0"
html = getjson(info['apiurl']+'createDiffHTML?'+urlencode(data))
html = html['data']['html']
with open(p+".diff.html", "w") as f:
f.write(html.encode("utf-8"))
break
except HTTPError as e:
print ("HTTPERROR {0}".format(e), file=sys.stderr)
tries += 1
if tries >= 5:
print (" Too many errors, deleting .diff.html and skipping", file=sys.stderr)
try:
os.remove(p+".diff.html")
except OSError:
pass
skip=True
else:
sleep(0.1)
print("\n{0} pad(s) loaded".format(count), file=sys.stderr) print("\n{0} pad(s) loaded".format(count), file=sys.stderr)

View File

@ -2,23 +2,39 @@
<html> <html>
<head> <head>
<meta charset="utf-8" /> <meta charset="utf-8" />
<title>{% block title %}{{title}}{% endblock %}</title> <title>etherdump</title>
<link rel="stylesheet" type="text/css" href="{%block css %}styles.css{%endblock%}"> <link rel="stylesheet" type="text/css" href="{%block css %}styles.css{%endblock%}">
{% block scripts %} {% block scripts %}
<script src="jquery-latest.js"></script> <script src="jquery-latest.js"></script>
<script src="jquery.tablesorter.min.js"></script> <script src="jquery.tablesorter.min.js"></script>
<script src="script.js"></script>
{% endblock scripts %} {% endblock scripts %}
</head> </head>
<body> <body>
{% block header %}<h1>{{title}}</h1>{% endblock %} {% block info %}<p class="info">This listing is updated automatically once daily. Last update {{timestamp}}.</p>{% endblock %}
{% block info %}<p class="info">Last updated {{timestamp}}</p>{% endblock %}
{% block namefilter %} {% block namefilter %}
<div id="namefilter"> <div id="namefilter">
<input type="text" id="namefilterinput" value="" placeholder="name filter" autofocus > <input type="text" id="namefilterinput" value="" placeholder="name filter" autofocus >
<button id="newpadbutton">go/start pad</button>
</div> </div>
<script> <script>
var newpadbutton = document.getElementById("newpadbutton");
newpadbutton.addEventListener("click", function (e) {
var elt = document.getElementById("namefilterinput"),
newpadname = elt.value,
padbase = document.querySelector("td.versions a").href;
newpadname = newpadname.replace(/^\s*/g, "");
newpadname = newpadname.replace(/\s*$/g, "");
elt.value = newpadname;
padbase = padbase.replace(/\/[^/]+$/, "");
if (!newpadname) { alert("type the pad name, then click 'go'")}
else {
var newpadhref = padbase + "/" + encodeURIComponent(newpadname);
// console.log("goto", newpadhref);
window.location = newpadhref;
};
e.preventDefault();
})
var namefilter = (function (opts) { var namefilter = (function (opts) {
var timeout_id = null, var timeout_id = null,
filter_value = '', filter_value = '',
@ -27,7 +43,7 @@ var namefilter = (function (opts) {
// console.log("update", filter_value); // console.log("update", filter_value);
var pat = new RegExp(filter_value, "i"); var pat = new RegExp(filter_value, "i");
$("tbody tr").each(function () { $("tbody tr").each(function () {
var n = $(".pad_name", this).text(); var n = $(".name", this).text();
// console.log("n", n); // console.log("n", n);
if (filter_value == "" || n.match(pat) !== null) { if (filter_value == "" || n.match(pat) !== null) {
$(this).show(); $(this).show();
@ -51,7 +67,7 @@ $("#namefilterinput").bind("keyup", function (e) { namefilter($(this).val()); })
$(document).ready(function() $(document).ready(function()
{ {
$("table.listing").tablesorter(); $("table.listing").tablesorter({sortList: [[2,1]]});
} }
); );
@ -61,24 +77,32 @@ $(document).ready(function()
<table class="listing tablesorter"> <table class="listing tablesorter">
<thead> <thead>
<tr> <tr>
<th>link</th> <th>name</th>
<th>pad name (click to view archived page)</th> <th>versions</th>
<th>group</th>
<th>last edited</th> <th>last edited</th>
<th>size</th> <th>revisions</th>
<th>authors</th>
</tr> </tr>
</thead> </thead>
<tbody> <tbody>
{% for pad in pads %} {% for pad in pads %}
<tr> <tr>
<td class="pad_url"><a class="edit" href="{{ pad.url }}">edit</a></td> <td class="name">
<td class="pad_name"><a href="{{ pad.html_path }}">{{ pad.pad_name }}</a></td> {{ pad.padid }}
<td class="pad_group">{{ pad.group_name|default(pad.group_id) }}</td> </td>
<td class="pad_last_edited">{{ pad.last_edited_str }}</td> <td class="versions">
<td class="pad_size">{{ pad.text_length_human }}</td> {% for v in pad.versions %}<a href="{{v.url}}">{{v.type}}</a> {% endfor %}
</td>
<td class="lastedited">{{ pad.lastedited_iso|replace("T", " ") }}</td>
<td class="revisions">{{ pad.revisions }}</td>
<td class="authors">{{ pad.author_ids|length }}</td>
</tr> </tr>
{% endfor %} {% endfor %}
</tbody> </tbody>
</table> </table>
<div id="footer">
<a href="index">index of all pads (password required)</a>
</div>
</body> </body>
</html> </html>