From 2f1c5603e223cd5ae6f339f08c0faea6d754289b Mon Sep 17 00:00:00 2001
From: Michael Murtaugh
Date: Fri, 8 Jan 2016 12:09:05 +0100
Subject: [PATCH] new pull, new meta style from live constant etherdump
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                           |   3 +-
 etherdump/commands/common.py        |  31 ++++++--
 etherdump/commands/index.py         |  46 +++++++-----
 etherdump/commands/pull.py          | 106 +++++++++++++++-------------
 etherdump/data/templates/index.html |  54 ++++++++++----
 5 files changed, 150 insertions(+), 90 deletions(-)

diff --git a/README.md b/README.md
index f3c73dd..8f04e1a 100644
--- a/README.md
+++ b/README.md
@@ -56,4 +56,5 @@ etherdump sync
 
 why
 -------
-Etherdump is useful as a means of dumping the contents of etherpad to files, as a way of opening up the contents of the service to other services / methods / uses / tools / situations. (Files also of course allow for archival tools / methods)
\ No newline at end of file
+Etherdump is useful as a means of dumping the contents of etherpad to files, as a way of opening up the contents of the service to other services / methods / uses / tools / situations. (Files also of course allow for archival tools / methods)
+
diff --git a/etherdump/commands/common.py b/etherdump/commands/common.py
index 819730d..7e08b8f 100644
--- a/etherdump/commands/common.py
+++ b/etherdump/commands/common.py
@@ -1,7 +1,10 @@
+from __future__ import print_function
 import re, os, json, sys
 from urllib import quote_plus, unquote_plus
 from math import ceil, floor
-from urllib2 import urlopen
+from urllib2 import urlopen, HTTPError
+from time import sleep
+
 
 groupnamepat = re.compile(r"^g\.(\w+)\$")
 def splitpadname (padid):
@@ -39,11 +42,27 @@ def padpath2id (path):
     else:
         return p.decode("utf-8")
 
-def getjson (url):
-    f = urlopen(url)
-    data = f.read()
-    f.close()
-    return json.loads(data)
+def getjson (url, max_retry=3, retry_sleep_time=0.5):
+    ret = {}
+    ret["_retries"] = 0
+    while ret["_retries"] <= max_retry:
+        try:
+            f = urlopen(url)
+            data = f.read()
+            rurl = f.geturl()
+            f.close()
+            ret.update(json.loads(data))
+            ret["_code"] = f.getcode()
+            if rurl != url:
+                ret["_url"] = rurl
+            return ret
+        except HTTPError as e:
+            print ("HTTPError {0}".format(e), file=sys.stderr)
+            ret["_code"] = e.code
+            ret["_retries"] += 1
+            if retry_sleep_time:
+                sleep(retry_sleep_time)
+    return ret
 
 def loadpadinfo(p):
     with open(p) as f:
diff --git a/etherdump/commands/index.py b/etherdump/commands/index.py
index 4d5af65..f2490e4 100644
--- a/etherdump/commands/index.py
+++ b/etherdump/commands/index.py
@@ -6,6 +6,8 @@ import json, os, re
 from urllib import urlencode
 from urllib2 import urlopen, HTTPError, URLError
 from jinja2 import FileSystemLoader, Environment
+from datetime import datetime
+
 
 def group (items, key=lambda x: x):
     ret = []
@@ -33,28 +35,38 @@ def main(args):
         tmpath = os.path.join(tmpath, "data", "templates")
 
     env = Environment(loader=FileSystemLoader(tmpath))
-    template = env.get_template("pad_index.html")
+    template = env.get_template("index.html")
+
+    def base (x):
+        return re.sub(r"(\.raw\.html)|(\.diff\.html)|(\.meta\.json)|(\.raw\.txt)$", "", x)
 
     inputs = args.input
     inputs.sort()
-    inputs = [x for x in inputs if os.path.isdir(x)]
+    inputs = group(inputs, base)
 
-    def base (x):
-        return re.sub(r"(\.html)|(\.diff\.html)|(\.meta\.json)|(\.txt)$", "", x)
+    def loadmeta(paths):
+        for p in paths:
+            if p.endswith(".meta.json"):
+                with open(p) as f:
+                    return json.load(f)
+
+    inputs = map(loadmeta, inputs)
+    # sort by last edited (reverse)
+    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    print (template.render({"timestamp": timestamp, "pads": inputs}).encode("utf-8"))
 
 # TODO: MODIFY THIS TO MAKE THE OUTPUT JOINABLE with the collected META DATA
 # evt: how can the metadata become a GRAPH structure!!! with each output DOCUMENT
 #
-    print ("<ol>")
-    for x in inputs:
-        padid = x
-        metapath = os.path.join(x, "{0}.meta.json".format(padid))
-        if os.path.exists(metapath):
-            print ("""<li><a href="{0}">{0}</a></li>""".format(x))
-            with open(metapath) as f:
-                meta = json.load(f)
-            indexpath = os.path.join(x, "index.html")
-            with open(indexpath, "w") as f:
-                print (template.render(**meta).encode("utf-8"), file=f)
-
-    print ("</ol>")
+    # print ("<ol>")
+    # for x in inputs:
+    #     padid = x
+    #     metapath = os.path.join(x, "{0}.meta.json".format(padid))
+    #     if os.path.exists(metapath):
+    #         print ("""<li><a href="{0}">{0}</a></li>""".format(x))
+    #         with open(metapath) as f:
+    #             meta = json.load(f)
+    #         indexpath = os.path.join(x, "index.html")
+    #         with open(indexpath, "w") as f:
+
+    # print ("</ol>")
") diff --git a/etherdump/commands/pull.py b/etherdump/commands/pull.py index b78df0f..2f9b7fb 100644 --- a/etherdump/commands/pull.py +++ b/etherdump/commands/pull.py @@ -3,7 +3,7 @@ from __future__ import print_function from argparse import ArgumentParser import sys, json, re, os from datetime import datetime -from urllib import urlencode +from urllib import urlencode, quote from urllib2 import HTTPError from common import * from time import sleep @@ -26,7 +26,7 @@ def main (args): p.add_argument("padid", nargs="*", default=[]) p.add_argument("--padinfo", default=".etherdump/settings.json", help="settings, default: .etherdump/settings.json") p.add_argument("--zerorevs", default=False, action="store_true", help="include pads with zero revisions, default: False (i.e. pads with no revisions are skipped)") - p.add_argument("--pub", default=".", help="folder to store files for public pads, default: pub") + p.add_argument("--pub", default="p", help="folder to store files for public pads, default: pub") p.add_argument("--group", default="g", help="folder to store files for group pads, default: g") p.add_argument("--skip", default=None, type=int, help="skip this many items, default: None") p.add_argument("--meta", default=False, action="store_true", help="download meta to PADID.meta.json, default: False") @@ -34,7 +34,7 @@ def main (args): p.add_argument("--html", default=False, action="store_true", help="download html to PADID.html, default: False") p.add_argument("--dhtml", default=False, action="store_true", help="download dhtml to PADID.dhtml, default: False") p.add_argument("--all", default=False, action="store_true", help="download all files (meta, text, html, dhtml), default: False") - p.add_argument("--folder", default=False, action="store_true", help="dump files to folder named PADID (meta, text, html, dhtml), default: False") + p.add_argument("--folder", default=False, action="store_true", help="dump files in a folder named PADID (meta, text, html, dhtml), default: False") p.add_argument("--output", default=False, action="store_true", help="output changed padids on stdout") p.add_argument("--force", default=False, action="store_true", help="reload, even if revisions count matches previous") args = p.parse_args(args) @@ -66,10 +66,6 @@ storing enough information to reconstruct (or understand an error occurred) data['padID'] = padid.encode("utf-8") p = padpath(padid, args.pub, args.group) if args.folder: - try: - os.makedirs(p) - except OSError: - pass p = os.path.join(p, padid.encode("utf-8")) metapath = p + ".meta.json" @@ -77,13 +73,14 @@ storing enough information to reconstruct (or understand an error occurred) tries = 1 skip = False padurlbase = re.sub(r"api/1.2.9/$", "p/", info["apiurl"]) + meta = {} if type(padurlbase) == unicode: padurlbase = padurlbase.encode("utf-8") while True: try: if os.path.exists(metapath): with open(metapath) as f: - meta = json.load(f) + meta.update(json.load(f)) revisions = getjson(info['apiurl']+'getRevisionsCount?'+urlencode(data))['data']['revisions'] if meta['revisions'] == revisions and not args.force: skip=True @@ -91,11 +88,13 @@ storing enough information to reconstruct (or understand an error occurred) ## TODO: OUTPUT TO DIRECTORIES with DATA EMBEDDED IN DOCUMENTS ## (or else in surrounding meta data!!) - meta = {'padid': padid.encode("utf-8")} - # this should be less of a hack - # TODO TEST!!! 
-
-            meta["padurl"] = padurlbase + padid.encode("utf-8")
+                meta['padid'] = padid.encode("utf-8")
+                versions = meta["versions"] = []
+                versions.append({
+                    "url": padurlbase + padid.encode("utf-8"),
+                    "type": "pad",
+                    "code": 200
+                })
 
                 if revisions == None:
                     meta['revisions'] = getjson(info['apiurl']+'getRevisionsCount?'+urlencode(data))['data']['revisions']
 
@@ -137,48 +136,53 @@
             except OSError:
                 pass
 
-        if args.all or args.meta:
-            with open(metapath, "w") as f:
-                json.dump(meta, f, indent=2)
-
         # Process text, html, dhtml, all options
-        if args.all or args.text:
-            text = getjson(info['apiurl']+'getText?'+urlencode(data))
-            text = text['data']['text']
-            with open(p+".txt", "w") as f:
-                f.write(text.encode("utf-8"))
-            # once the content is settled, compute a hash
-            # and link it in the metadata!
-
         if args.all or args.html:
             html = getjson(info['apiurl']+'getHTML?'+urlencode(data))
-            html = html['data']['html']
-            with open(p+".html", "w") as f:
-                f.write(html.encode("utf-8"))
+            ver = {"type": "html"}
+            versions.append(ver)
+            ver["code"] = html["_code"]
+            if html["_code"] == 200:
+                html = html['data']['html']
+                ver["path"] = p+".raw.html"
+                ver["url"] = quote(ver["path"])
+                with open(ver["path"], "w") as f:
+                    f.write(html.encode("utf-8"))
+
+        if args.all or args.text:
+            text = getjson(info['apiurl']+'getText?'+urlencode(data))
+            ver = {"type": "text"}
+            versions.append(ver)
+            ver["code"] = text["_code"]
+            if text["_code"] == 200:
+                text = text['data']['text']
+                ver["path"] = p+".raw.txt"
+                ver["url"] = quote(ver["path"])
+                with open(ver["path"], "w") as f:
+                    f.write(text.encode("utf-8"))
+            # once the content is settled, compute a hash
+            # and link it in the metadata!
 
         if args.all or args.dhtml:
-            tries = 0
-            skip = False
-            while not skip:
-                try:
-                    data['startRev'] = "0"
-                    html = getjson(info['apiurl']+'createDiffHTML?'+urlencode(data))
-                    html = html['data']['html']
-                    with open(p+".diff.html", "w") as f:
-                        f.write(html.encode("utf-8"))
-                    break
-                except HTTPError as e:
-                    print ("HTTPERROR {0}".format(e), file=sys.stderr)
-                    tries += 1
-                    if tries >= 5:
-                        print ("  Too many errors, deleting .diff.html and skipping", file=sys.stderr)
-                        try:
-                            os.remove(p+".diff.html")
-                        except OSError:
-                            pass
-                        skip=True
-                    else:
-                        sleep(0.1)
+            data['startRev'] = "0"
+            html = getjson(info['apiurl']+'createDiffHTML?'+urlencode(data))
+            ver = {"type": "diffhtml"}
+            versions.append(ver)
+            ver["code"] = html["_code"]
+            if html["_code"] == 200:
+                html = html['data']['html']
+                ver["path"] = p+".diff.html"
+                ver["url"] = quote(ver["path"])
+                with open(ver["path"], "w") as f:
+                    f.write(html.encode("utf-8"))
+
+        # output meta
+        if args.all or args.meta:
+            ver = {"type": "meta"}
+            versions.append(ver)
+            ver["path"] = metapath
+            ver["url"] = quote(metapath)
+            with open(metapath, "w") as f:
+                json.dump(meta, f, indent=2)
 
     print("\n{0} pad(s) loaded".format(count), file=sys.stderr)
 
diff --git a/etherdump/data/templates/index.html b/etherdump/data/templates/index.html
index 85a90bc..f73109f 100644
--- a/etherdump/data/templates/index.html
+++ b/etherdump/data/templates/index.html
@@ -2,23 +2,39 @@
 <html>
 <head>
     <meta charset="utf-8">
-    <title>{% block title %}{{title}}{% endblock %}</title>
+    <title>etherdump</title>
     {% block scripts %}
-    
     {% endblock scripts %}
-{% block header %}
-
-<h1>{{title}}</h1>
-
-{% endblock %}
-{% block info %}
-
-<p>Last updated {{timestamp}}</p>
-
-{% endblock %}
+{% block info %}
+
+<p>This listing is updated automatically once daily. Last update {{timestamp}}.</p>
+
+{% endblock %}
 {% block namefilter %}
+
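-- 
For reference, a sketch of the PADID.meta.json that the new pull writes
under the versions scheme above: pulling a pad named "foo" with --all would
produce roughly the file below. The pad name, host, revision count, and
paths are hypothetical; quote() percent-encodes the url fields when the pad
name requires it, and keys from an earlier meta file survive via
meta.update().

    {
      "padid": "foo",
      "revisions": 23,
      "versions": [
        {"type": "pad", "code": 200, "url": "https://pad.example.org/p/foo"},
        {"type": "html", "code": 200, "path": "p/foo.raw.html", "url": "p/foo.raw.html"},
        {"type": "text", "code": 200, "path": "p/foo.raw.txt", "url": "p/foo.raw.txt"},
        {"type": "diffhtml", "code": 200, "path": "p/foo.diff.html", "url": "p/foo.diff.html"},
        {"type": "meta", "path": "p/foo.meta.json", "url": "p/foo.meta.json"}
      ]
    }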