
new pull, new meta style from live constant etherdump

Branch: add-quote-import
Author: Michael Murtaugh, 8 years ago
Commit: 2f1c5603e2
Changed files (lines changed):

  1. README.md (3)
  2. etherdump/commands/common.py (31)
  3. etherdump/commands/index.py (46)
  4. etherdump/commands/pull.py (106)
  5. etherdump/data/templates/index.html (54)

README.md  (3 changed lines)

@@ -56,4 +56,5 @@ etherdump sync
 why
 -------
 Etherdump is useful as a means of dumping the contents of etherpad to files, as a way of opening up the contents of the service to other services / methods / uses / tools / situations. (Files also of course allow for archival tools / methods)

etherdump/commands/common.py  (31 changed lines)

@@ -1,7 +1,10 @@
 from __future__ import print_function
 import re, os, json, sys
 from urllib import quote_plus, unquote_plus
 from math import ceil, floor
-from urllib2 import urlopen
+from urllib2 import urlopen, HTTPError
+from time import sleep

 groupnamepat = re.compile(r"^g\.(\w+)\$")

 def splitpadname (padid):
@@ -39,11 +42,27 @@ def padpath2id (path):
     else:
         return p.decode("utf-8")

-def getjson (url):
-    f = urlopen(url)
-    data = f.read()
-    f.close()
-    return json.loads(data)
+def getjson (url, max_retry=3, retry_sleep_time=0.5):
+    ret = {}
+    ret["_retries"] = 0
+    while ret["_retries"] <= max_retry:
+        try:
+            f = urlopen(url)
+            data = f.read()
+            rurl = f.geturl()
+            f.close()
+            ret.update(json.loads(data))
+            ret["_code"] = f.getcode()
+            if rurl != url:
+                ret["_url"] = rurl
+            return ret
+        except HTTPError as e:
+            print ("HTTPError {0}".format(e), file=sys.stderr)
+            ret["_code"] = e.code
+            ret["_retries"] += 1
+            if retry_sleep_time:
+                sleep(retry_sleep_time)
+    return ret

 def loadpadinfo(p):
     with open(p) as f:
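The reworked getjson() no longer raises on HTTP errors: it retries up to max_retry times, merges the decoded API response into the dict it returns, and records _code, _retries and (on redirects) _url alongside the payload. A minimal usage sketch of that contract; the instance URL, API key and pad name below are invented placeholders, not values from the commit:

    # Sketch only: exercises the new getjson() return shape from common.py.
    from __future__ import print_function
    from urllib import urlencode
    from etherdump.commands.common import getjson

    apiurl = "https://pad.example.org/api/1.2.9/"               # placeholder instance
    params = urlencode({"apikey": "SECRET", "padID": "start"})  # placeholder values

    resp = getjson(apiurl + "getText?" + params, max_retry=3, retry_sleep_time=0.5)
    if resp.get("_code") == 200:
        text = resp["data"]["text"]   # normal API payload, merged into the same dict
    else:
        # after max_retry failed attempts only the bookkeeping keys are present
        print("gave up after", resp["_retries"], "retries, last code:", resp.get("_code"))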

etherdump/commands/index.py  (46 changed lines)

@@ -6,6 +6,8 @@ import json, os, re
 from urllib import urlencode
 from urllib2 import urlopen, HTTPError, URLError
 from jinja2 import FileSystemLoader, Environment
+from datetime import datetime

 def group (items, key=lambda x: x):
     ret = []
@@ -33,28 +35,38 @@ def main(args):
     tmpath = os.path.join(tmpath, "data", "templates")
     env = Environment(loader=FileSystemLoader(tmpath))
-    template = env.get_template("pad_index.html")
+    template = env.get_template("index.html")
+
+    def base (x):
+        return re.sub(r"(\.raw\.html)|(\.diff\.html)|(\.meta\.json)|(\.raw\.txt)$", "", x)

     inputs = args.input
     inputs.sort()
-    inputs = [x for x in inputs if os.path.isdir(x)]
-
-    def base (x):
-        return re.sub(r"(\.html)|(\.diff\.html)|(\.meta\.json)|(\.txt)$", "", x)
+    inputs = group(inputs, base)
+
+    def loadmeta(paths):
+        for p in paths:
+            if p.endswith(".meta.json"):
+                with open(p) as f:
+                    return json.load(f)
+
+    inputs = map(loadmeta, inputs)
+    # sort by last edited (reverse)
+
+    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    print (template.render({"timestamp": timestamp, "pads": inputs}).encode("utf-8"))

     # TODO: MODIFY THIS TO MAKE THE OUTPUT JOINABLE with the collected META DATA
     # evt: how can the metadata become a GRAPH structure!!! with each output DOCUMENT
     #
-    print ("<ol>")
-    for x in inputs:
-        padid = x
-        metapath = os.path.join(x, "{0}.meta.json".format(padid))
-        if os.path.exists(metapath):
-            print ("""<li><a href="{0}">{0}</a></li>""".format(x))
-            with open(metapath) as f:
-                meta = json.load(f)
-            indexpath = os.path.join(x, "index.html")
-            with open(indexpath, "w") as f:
-                print (template.render(**meta).encode("utf-8"), file=f)
-    print ("</ol>")
+    # print ("<ol>")
+    # for x in inputs:
+    #     padid = x
+    #     metapath = os.path.join(x, "{0}.meta.json".format(padid))
+    #     if os.path.exists(metapath):
+    #         print ("""<li><a href="{0}">{0}</a></li>""".format(x))
+    #         with open(metapath) as f:
+    #             meta = json.load(f)
+    #         indexpath = os.path.join(x, "index.html")
+    #         with open(indexpath, "w") as f:
+    # print ("</ol>")

etherdump/commands/pull.py  (106 changed lines)

@@ -3,7 +3,7 @@ from __future__ import print_function
 from argparse import ArgumentParser
 import sys, json, re, os
 from datetime import datetime
-from urllib import urlencode
+from urllib import urlencode, quote
 from urllib2 import HTTPError
 from common import *
 from time import sleep
@@ -26,7 +26,7 @@ def main (args):
     p.add_argument("padid", nargs="*", default=[])
     p.add_argument("--padinfo", default=".etherdump/settings.json", help="settings, default: .etherdump/settings.json")
     p.add_argument("--zerorevs", default=False, action="store_true", help="include pads with zero revisions, default: False (i.e. pads with no revisions are skipped)")
-    p.add_argument("--pub", default=".", help="folder to store files for public pads, default: pub")
+    p.add_argument("--pub", default="p", help="folder to store files for public pads, default: pub")
     p.add_argument("--group", default="g", help="folder to store files for group pads, default: g")
     p.add_argument("--skip", default=None, type=int, help="skip this many items, default: None")
     p.add_argument("--meta", default=False, action="store_true", help="download meta to PADID.meta.json, default: False")
@@ -34,7 +34,7 @@ def main (args):
     p.add_argument("--html", default=False, action="store_true", help="download html to PADID.html, default: False")
     p.add_argument("--dhtml", default=False, action="store_true", help="download dhtml to PADID.dhtml, default: False")
     p.add_argument("--all", default=False, action="store_true", help="download all files (meta, text, html, dhtml), default: False")
-    p.add_argument("--folder", default=False, action="store_true", help="dump files to folder named PADID (meta, text, html, dhtml), default: False")
+    p.add_argument("--folder", default=False, action="store_true", help="dump files in a folder named PADID (meta, text, html, dhtml), default: False")
     p.add_argument("--output", default=False, action="store_true", help="output changed padids on stdout")
     p.add_argument("--force", default=False, action="store_true", help="reload, even if revisions count matches previous")
     args = p.parse_args(args)
@@ -66,10 +66,6 @@ storing enough information to reconstruct (or understand an error occurred)
         data['padID'] = padid.encode("utf-8")
         p = padpath(padid, args.pub, args.group)
         if args.folder:
-            try:
-                os.makedirs(p)
-            except OSError:
-                pass
             p = os.path.join(p, padid.encode("utf-8"))

         metapath = p + ".meta.json"
@@ -77,13 +73,14 @@ storing enough information to reconstruct (or understand an error occurred)
         tries = 1
         skip = False
         padurlbase = re.sub(r"api/1.2.9/$", "p/", info["apiurl"])
+        meta = {}
         if type(padurlbase) == unicode:
             padurlbase = padurlbase.encode("utf-8")
         while True:
             try:
                 if os.path.exists(metapath):
                     with open(metapath) as f:
-                        meta = json.load(f)
+                        meta.update(json.load(f))
                     revisions = getjson(info['apiurl']+'getRevisionsCount?'+urlencode(data))['data']['revisions']
                     if meta['revisions'] == revisions and not args.force:
                         skip=True
@@ -91,11 +88,13 @@ storing enough information to reconstruct (or understand an error occurred)
                 ## TODO: OUTPUT TO DIRECTORIES with DATA EMBEDDED IN DOCUMENTS
                 ## (or else in surrounding meta data!!)

-                meta = {'padid': padid.encode("utf-8")}
-                # this should be less of a hack
-                # TODO TEST!!!
-
-                meta["padurl"] = padurlbase + padid.encode("utf-8")
+                meta['padid'] = padid.encode("utf-8")
+                versions = meta["versions"] = []
+                versions.append({
+                    "url": padurlbase + padid.encode("utf-8"),
+                    "type": "pad",
+                    "code": 200
+                })

                 if revisions == None:
                     meta['revisions'] = getjson(info['apiurl']+'getRevisionsCount?'+urlencode(data))['data']['revisions']
@@ -137,48 +136,53 @@ storing enough information to reconstruct (or understand an error occurred)
             except OSError:
                 pass

-            if args.all or args.meta:
-                with open(metapath, "w") as f:
-                    json.dump(meta, f, indent=2)
-
             # Process text, html, dhtml, all options
-            if args.all or args.text:
-                text = getjson(info['apiurl']+'getText?'+urlencode(data))
-                text = text['data']['text']
-                with open(p+".txt", "w") as f:
-                    f.write(text.encode("utf-8"))
-                # once the content is settled, compute a hash
-                # and link it in the metadata!
-
             if args.all or args.html:
                 html = getjson(info['apiurl']+'getHTML?'+urlencode(data))
-                html = html['data']['html']
-                with open(p+".html", "w") as f:
-                    f.write(html.encode("utf-8"))
+                ver = {"type": "html"}
+                versions.append(ver)
+                ver["code"] = html["_code"]
+                if html["_code"] == 200:
+                    html = html['data']['html']
+                    ver["path"] = p+".raw.html"
+                    ver["url"] = quote(ver["path"])
+                    with open(ver["path"], "w") as f:
+                        f.write(html.encode("utf-8"))
+
+            if args.all or args.text:
+                text = getjson(info['apiurl']+'getText?'+urlencode(data))
+                ver = {"type": "text"}
+                versions.append(ver)
+                ver["code"] = text["_code"]
+                if text["_code"] == 200:
+                    text = text['data']['text']
+                    ver["path"] = p+".raw.txt"
+                    ver["url"] = quote(ver["path"])
+                    with open(ver["path"], "w") as f:
+                        f.write(text.encode("utf-8"))
+                # once the content is settled, compute a hash
+                # and link it in the metadata!

             if args.all or args.dhtml:
-                tries = 0
-                skip = False
-                while not skip:
-                    try:
-                        data['startRev'] = "0"
-                        html = getjson(info['apiurl']+'createDiffHTML?'+urlencode(data))
-                        html = html['data']['html']
-                        with open(p+".diff.html", "w") as f:
-                            f.write(html.encode("utf-8"))
-                        break
-                    except HTTPError as e:
-                        print ("HTTPERROR {0}".format(e), file=sys.stderr)
-                        tries += 1
-                        if tries >= 5:
-                            print (" Too many errors, deleting .diff.html and skipping", file=sys.stderr)
-                            try:
-                                os.remove(p+".diff.html")
-                            except OSError:
-                                pass
-                            skip=True
-                        else:
-                            sleep(0.1)
+                data['startRev'] = "0"
+                html = getjson(info['apiurl']+'createDiffHTML?'+urlencode(data))
+                ver = {"type": "diffhtml"}
+                versions.append(ver)
+                ver["code"] = html["_code"]
+                if html["_code"] == 200:
+                    html = html['data']['html']
+                    ver["path"] = p+".diff.html"
+                    ver["url"] = quote(ver["path"])
+                    with open(ver["path"], "w") as f:
+                        f.write(html.encode("utf-8"))
+
+            # output meta
+            if args.all or args.meta:
+                ver = {"type": "meta"}
+                versions.append(ver)
+                ver["path"] = metapath
+                ver["url"] = quote(metapath)
+                with open(metapath, "w") as f:
+                    json.dump(meta, f, indent=2)

     print("\n{0} pad(s) loaded".format(count), file=sys.stderr)

etherdump/data/templates/index.html  (54 changed lines)

@@ -2,23 +2,39 @@
 <html>
 <head>
 <meta charset="utf-8" />
-<title>{% block title %}{{title}}{% endblock %}</title>
+<title>etherdump</title>
 <link rel="stylesheet" type="text/css" href="{%block css %}styles.css{%endblock%}">
 {% block scripts %}
 <script src="jquery-latest.js"></script>
 <script src="jquery.tablesorter.min.js"></script>
-<script src="script.js"></script>
 {% endblock scripts %}
 </head>
 <body>
-{% block header %}<h1>{{title}}</h1>{% endblock %}
-{% block info %}<p class="info">Last updated {{timestamp}}</p>{% endblock %}
+{% block info %}<p class="info">This listing is updated automatically once daily. Last update {{timestamp}}.</p>{% endblock %}
 {% block namefilter %}
 <div id="namefilter">
 <input type="text" id="namefilterinput" value="" placeholder="name filter" autofocus >
+<button id="newpadbutton">go/start pad</button>
 </div>
 <script>
+var newpadbutton = document.getElementById("newpadbutton");
+newpadbutton.addEventListener("click", function (e) {
+    var elt = document.getElementById("namefilterinput"),
+        newpadname = elt.value,
+        padbase = document.querySelector("td.versions a").href;
+    newpadname = newpadname.replace(/^\s*/g, "");
+    newpadname = newpadname.replace(/\s*$/g, "");
+    elt.value = newpadname;
+    padbase = padbase.replace(/\/[^/]+$/, "");
+    if (!newpadname) { alert("type the pad name, then click 'go'")}
+    else {
+        var newpadhref = padbase + "/" + encodeURIComponent(newpadname);
+        // console.log("goto", newpadhref);
+        window.location = newpadhref;
+    };
+    e.preventDefault();
+})
 var namefilter = (function (opts) {
     var timeout_id = null,
         filter_value = '',
@@ -27,7 +43,7 @@ var namefilter = (function (opts) {
         // console.log("update", filter_value);
         var pat = new RegExp(filter_value, "i");
         $("tbody tr").each(function () {
-            var n = $(".pad_name", this).text();
+            var n = $(".name", this).text();
             // console.log("n", n);
             if (filter_value == "" || n.match(pat) !== null) {
                 $(this).show();
@@ -51,7 +67,7 @@ $("#namefilterinput").bind("keyup", function (e) { namefilter($(this).val()); })
 $(document).ready(function()
     {
-        $("table.listing").tablesorter();
+        $("table.listing").tablesorter({sortList: [[2,1]]});
     }
 );
@@ -61,24 +77,32 @@ $(document).ready(function()
 <table class="listing tablesorter">
 <thead>
 <tr>
-    <th>link</th>
-    <th>pad name (click to view archived page)</th>
-    <th>group</th>
+    <th>name</th>
+    <th>versions</th>
     <th>last edited</th>
-    <th>size</th>
+    <th>revisions</th>
+    <th>authors</th>
 </tr>
 </thead>
 <tbody>
 {% for pad in pads %}
 <tr>
-    <td class="pad_url"><a class="edit" href="{{ pad.url }}">edit</a></td>
-    <td class="pad_name"><a href="{{ pad.html_path }}">{{ pad.pad_name }}</a></td>
-    <td class="pad_group">{{ pad.group_name|default(pad.group_id) }}</td>
-    <td class="pad_last_edited">{{ pad.last_edited_str }}</td>
-    <td class="pad_size">{{ pad.text_length_human }}</td>
+    <td class="name">
+        {{ pad.padid }}
+    </td>
+    <td class="versions">
+        {% for v in pad.versions %}<a href="{{v.url}}">{{v.type}}</a> {% endfor %}
+    </td>
+    <td class="lastedited">{{ pad.lastedited_iso|replace("T", " ") }}</td>
+    <td class="revisions">{{ pad.revisions }}</td>
+    <td class="authors">{{ pad.author_ids|length }}</td>
 </tr>
 {% endfor %}
 </tbody>
 </table>
+<div id="footer">
+    <a href="index">index of all pads (password required)</a>
+</div>
 </body>
 </html>
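The revised template expects the context that index.py now passes: a timestamp string plus a list of pad dicts exposing padid, versions, lastedited_iso, revisions and author_ids. A minimal render sketch with invented pad data; the template path assumes the repository layout listed above:

    # Sketch only: renders the new index.html with made-up pad data.
    from jinja2 import Environment, FileSystemLoader

    env = Environment(loader=FileSystemLoader("etherdump/data/templates"))
    template = env.get_template("index.html")

    pads = [{
        "padid": "start",
        "revisions": 23,
        "lastedited_iso": "2015-11-01T12:00:00",
        "author_ids": ["a.x1", "a.x2"],
        "versions": [{"type": "text", "url": "p/start.raw.txt"}],
    }]

    print(template.render({"timestamp": "2015-11-01 12:00", "pads": pads}).encode("utf-8"))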
