From 2f1c5603e223cd5ae6f339f08c0faea6d754289b Mon Sep 17 00:00:00 2001
From: Michael Murtaugh
Date: Fri, 8 Jan 2016 12:09:05 +0100
Subject: [PATCH] new pull, new meta style from live constant etherdump
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                           |   3 +-
 etherdump/commands/common.py        |  31 ++++++--
 etherdump/commands/index.py         |  46 +++++++-----
 etherdump/commands/pull.py          | 106 +++++++++++++++-------------
 etherdump/data/templates/index.html |  54 ++++++++++----
 5 files changed, 150 insertions(+), 90 deletions(-)

diff --git a/README.md b/README.md
index f3c73dd..8f04e1a 100644
--- a/README.md
+++ b/README.md
@@ -56,4 +56,5 @@ etherdump sync
 
 why
 -------
-Etherdump is useful as a means of dumping the contents of etherpad to files, as a way of opening up the contents of the service to other services / methods / uses / tools / situations. (Files also of course allow for archival tools / methods)
\ No newline at end of file
+Etherdump is useful as a means of dumping the contents of etherpad to files, as a way of opening up the contents of the service to other services / methods / uses / tools / situations. (Files also of course allow for archival tools / methods)
+
diff --git a/etherdump/commands/common.py b/etherdump/commands/common.py
index 819730d..7e08b8f 100644
--- a/etherdump/commands/common.py
+++ b/etherdump/commands/common.py
@@ -1,7 +1,10 @@
+from __future__ import print_function
 import re, os, json, sys
 from urllib import quote_plus, unquote_plus
 from math import ceil, floor
-from urllib2 import urlopen
+from urllib2 import urlopen, HTTPError
+from time import sleep
+
 
 groupnamepat = re.compile(r"^g\.(\w+)\$")
 def splitpadname (padid):
@@ -39,11 +42,27 @@ def padpath2id (path):
     else:
         return p.decode("utf-8")
 
-def getjson (url):
-    f = urlopen(url)
-    data = f.read()
-    f.close()
-    return json.loads(data)
+def getjson (url, max_retry=3, retry_sleep_time=0.5):
+    ret = {}
+    ret["_retries"] = 0
+    while ret["_retries"] <= max_retry:
+        try:
+            f = urlopen(url)
+            data = f.read()
+            rurl = f.geturl()
+            f.close()
+            ret.update(json.loads(data))
+            ret["_code"] = f.getcode()
+            if rurl != url:
+                ret["_url"] = rurl
+            return ret
+        except HTTPError as e:
+            print ("HTTPError {0}".format(e), file=sys.stderr)
+            ret["_code"] = e.code
+            ret["_retries"] += 1
+            if retry_sleep_time:
+                sleep(retry_sleep_time)
+    return ret
 
 def loadpadinfo(p):
     with open(p) as f:
diff --git a/etherdump/commands/index.py b/etherdump/commands/index.py
index 4d5af65..f2490e4 100644
--- a/etherdump/commands/index.py
+++ b/etherdump/commands/index.py
@@ -6,6 +6,8 @@ import json, os, re
 from urllib import urlencode
 from urllib2 import urlopen, HTTPError, URLError
 from jinja2 import FileSystemLoader, Environment
+from datetime import datetime
+
 
 def group (items, key=lambda x: x):
     ret = []
@@ -33,28 +35,38 @@ def main(args):
         tmpath = os.path.join(tmpath, "data", "templates")
 
     env = Environment(loader=FileSystemLoader(tmpath))
-    template = env.get_template("pad_index.html")
+    template = env.get_template("index.html")
+
+    def base (x):
+        return re.sub(r"(\.raw\.html)|(\.diff\.html)|(\.meta\.json)|(\.raw\.txt)$", "", x)
 
     inputs = args.input
     inputs.sort()
-    inputs = [x for x in inputs if os.path.isdir(x)]
+    inputs = group(inputs, base)
 
-    def base (x):
-        return re.sub(r"(\.html)|(\.diff\.html)|(\.meta\.json)|(\.txt)$", "", x)
+    def loadmeta(paths):
+        for p in paths:
+            if p.endswith(".meta.json"):
+                with open(p) as f:
+                    return json.load(f)
+
+    inputs = map(loadmeta, inputs)
+    # sort by last edited (reverse)
+    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    print (template.render({"timestamp": timestamp, "pads": inputs}).encode("utf-8"))
 
 # TODO: MODIFY THIS TO MAKE THE OUTPUT JOINABLE with the collected META DATA
 # evt: how can the metadata become a GRAPH structure!!! with each output DOCUMENT
 #
-    print ("<ol>")
-    for x in inputs:
-        padid = x
-        metapath = os.path.join(x, "{0}.meta.json".format(padid))
-        if os.path.exists(metapath):
-            print ("""<li><a href="{0}">{0}</a></li>""".format(x))
-            with open(metapath) as f:
-                meta = json.load(f)
-            indexpath = os.path.join(x, "index.html")
-            with open(indexpath, "w") as f:
-                print (template.render(**meta).encode("utf-8"), file=f)
-
-    print ("</ol>")
+    # print ("<ol>")
+    # for x in inputs:
+    #     padid = x
+    #     metapath = os.path.join(x, "{0}.meta.json".format(padid))
+    #     if os.path.exists(metapath):
+    #         print ("""<li><a href="{0}">{0}</a></li>""".format(x))
+    #         with open(metapath) as f:
+    #             meta = json.load(f)
+    #         indexpath = os.path.join(x, "index.html")
+    #         with open(indexpath, "w") as f:
+
+    # print ("</ol>")
") diff --git a/etherdump/commands/pull.py b/etherdump/commands/pull.py index b78df0f..2f9b7fb 100644 --- a/etherdump/commands/pull.py +++ b/etherdump/commands/pull.py @@ -3,7 +3,7 @@ from __future__ import print_function from argparse import ArgumentParser import sys, json, re, os from datetime import datetime -from urllib import urlencode +from urllib import urlencode, quote from urllib2 import HTTPError from common import * from time import sleep @@ -26,7 +26,7 @@ def main (args): p.add_argument("padid", nargs="*", default=[]) p.add_argument("--padinfo", default=".etherdump/settings.json", help="settings, default: .etherdump/settings.json") p.add_argument("--zerorevs", default=False, action="store_true", help="include pads with zero revisions, default: False (i.e. pads with no revisions are skipped)") - p.add_argument("--pub", default=".", help="folder to store files for public pads, default: pub") + p.add_argument("--pub", default="p", help="folder to store files for public pads, default: pub") p.add_argument("--group", default="g", help="folder to store files for group pads, default: g") p.add_argument("--skip", default=None, type=int, help="skip this many items, default: None") p.add_argument("--meta", default=False, action="store_true", help="download meta to PADID.meta.json, default: False") @@ -34,7 +34,7 @@ def main (args): p.add_argument("--html", default=False, action="store_true", help="download html to PADID.html, default: False") p.add_argument("--dhtml", default=False, action="store_true", help="download dhtml to PADID.dhtml, default: False") p.add_argument("--all", default=False, action="store_true", help="download all files (meta, text, html, dhtml), default: False") - p.add_argument("--folder", default=False, action="store_true", help="dump files to folder named PADID (meta, text, html, dhtml), default: False") + p.add_argument("--folder", default=False, action="store_true", help="dump files in a folder named PADID (meta, text, html, dhtml), default: False") p.add_argument("--output", default=False, action="store_true", help="output changed padids on stdout") p.add_argument("--force", default=False, action="store_true", help="reload, even if revisions count matches previous") args = p.parse_args(args) @@ -66,10 +66,6 @@ storing enough information to reconstruct (or understand an error occurred) data['padID'] = padid.encode("utf-8") p = padpath(padid, args.pub, args.group) if args.folder: - try: - os.makedirs(p) - except OSError: - pass p = os.path.join(p, padid.encode("utf-8")) metapath = p + ".meta.json" @@ -77,13 +73,14 @@ storing enough information to reconstruct (or understand an error occurred) tries = 1 skip = False padurlbase = re.sub(r"api/1.2.9/$", "p/", info["apiurl"]) + meta = {} if type(padurlbase) == unicode: padurlbase = padurlbase.encode("utf-8") while True: try: if os.path.exists(metapath): with open(metapath) as f: - meta = json.load(f) + meta.update(json.load(f)) revisions = getjson(info['apiurl']+'getRevisionsCount?'+urlencode(data))['data']['revisions'] if meta['revisions'] == revisions and not args.force: skip=True @@ -91,11 +88,13 @@ storing enough information to reconstruct (or understand an error occurred) ## TODO: OUTPUT TO DIRECTORIES with DATA EMBEDDED IN DOCUMENTS ## (or else in surrounding meta data!!) - meta = {'padid': padid.encode("utf-8")} - # this should be less of a hack - # TODO TEST!!! 
-
-            meta["padurl"] = padurlbase + padid.encode("utf-8")
+                meta['padid'] = padid.encode("utf-8")
+                versions = meta["versions"] = []
+                versions.append({
+                    "url": padurlbase + padid.encode("utf-8"),
+                    "type": "pad",
+                    "code": 200
+                })
 
                 if revisions == None:
                     meta['revisions'] = getjson(info['apiurl']+'getRevisionsCount?'+urlencode(data))['data']['revisions']
 
@@ -137,48 +136,53 @@
             except OSError:
                 pass
 
-        if args.all or args.meta:
-            with open(metapath, "w") as f:
-                json.dump(meta, f, indent=2)
-
         # Process text, html, dhtml, all options
-        if args.all or args.text:
-            text = getjson(info['apiurl']+'getText?'+urlencode(data))
-            text = text['data']['text']
-            with open(p+".txt", "w") as f:
-                f.write(text.encode("utf-8"))
-            # once the content is settled, compute a hash
-            # and link it in the metadata!
-
         if args.all or args.html:
             html = getjson(info['apiurl']+'getHTML?'+urlencode(data))
-            html = html['data']['html']
-            with open(p+".html", "w") as f:
-                f.write(html.encode("utf-8"))
+            ver = {"type": "html"}
+            versions.append(ver)
+            ver["code"] = html["_code"]
+            if html["_code"] == 200:
+                html = html['data']['html']
+                ver["path"] = p+".raw.html"
+                ver["url"] = quote(ver["path"])
+                with open(ver["path"], "w") as f:
+                    f.write(html.encode("utf-8"))
+
+        if args.all or args.text:
+            text = getjson(info['apiurl']+'getText?'+urlencode(data))
+            ver = {"type": "text"}
+            versions.append(ver)
+            ver["code"] = text["_code"]
+            if text["_code"] == 200:
+                text = text['data']['text']
+                ver["path"] = p+".raw.txt"
+                ver["url"] = quote(ver["path"])
+                with open(ver["path"], "w") as f:
+                    f.write(text.encode("utf-8"))
+            # once the content is settled, compute a hash
+            # and link it in the metadata!
 
         if args.all or args.dhtml:
-            tries = 0
-            skip = False
-            while not skip:
-                try:
-                    data['startRev'] = "0"
-                    html = getjson(info['apiurl']+'createDiffHTML?'+urlencode(data))
-                    html = html['data']['html']
-                    with open(p+".diff.html", "w") as f:
-                        f.write(html.encode("utf-8"))
-                    break
-                except HTTPError as e:
-                    print ("HTTPERROR {0}".format(e), file=sys.stderr)
-                    tries += 1
-                    if tries >= 5:
-                        print ("  Too many errors, deleting .diff.html and skipping", file=sys.stderr)
-                        try:
-                            os.remove(p+".diff.html")
-                        except OSError:
-                            pass
-                        skip=True
-                    else:
-                        sleep(0.1)
+            data['startRev'] = "0"
+            html = getjson(info['apiurl']+'createDiffHTML?'+urlencode(data))
+            ver = {"type": "diffhtml"}
+            versions.append(ver)
+            ver["code"] = html["_code"]
+            if html["_code"] == 200:
+                html = html['data']['html']
+                ver["path"] = p+".diff.html"
+                ver["url"] = quote(ver["path"])
+                with open(ver["path"], "w") as f:
+                    f.write(html.encode("utf-8"))
+
+        # output meta
+        if args.all or args.meta:
+            ver = {"type": "meta"}
+            versions.append(ver)
+            ver["path"] = metapath
+            ver["url"] = quote(metapath)
+            with open(metapath, "w") as f:
+                json.dump(meta, f, indent=2)
 
     print("\n{0} pad(s) loaded".format(count), file=sys.stderr)
 
diff --git a/etherdump/data/templates/index.html b/etherdump/data/templates/index.html
index 85a90bc..f73109f 100644
--- a/etherdump/data/templates/index.html
+++ b/etherdump/data/templates/index.html
@@ -2,23 +2,39 @@
 <html>
 <head>
     <meta charset="utf-8">
-    <title>{% block title %}{{title}}{% endblock %}</title>
+    <title>etherdump</title>
     {% block scripts %}
-    
     {% endblock scripts %}
-{% block header %}
-
-<h1>{{title}}</h1>
-
-{% endblock %}
-{% block info %}
-
-<p>Last updated {{timestamp}}</p>
-
-{% endblock %}
+{% block info %}
+
+<p>This listing is updated automatically once daily. Last update {{timestamp}}.</p>
+
+{% endblock %}
 {% block namefilter %}
+
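-- 
For reference, a sketch of the PADID.meta.json that the new pull writes
under the versions scheme above: pulling a pad named "foo" with --all would
produce roughly the file below. The pad name, host, revision count, and
paths are hypothetical; quote() percent-encodes the url fields when the pad
name requires it, and keys from an earlier meta file survive via
meta.update().

    {
      "padid": "foo",
      "revisions": 23,
      "versions": [
        {"type": "pad", "code": 200, "url": "https://pad.example.org/p/foo"},
        {"type": "html", "code": 200, "path": "p/foo.raw.html", "url": "p/foo.raw.html"},
        {"type": "text", "code": 200, "path": "p/foo.raw.txt", "url": "p/foo.raw.txt"},
        {"type": "diffhtml", "code": 200, "path": "p/foo.diff.html", "url": "p/foo.diff.html"},
        {"type": "meta", "path": "p/foo.meta.json", "url": "p/foo.meta.json"}
      ]
    }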