From 55fbdea4108e0b6c42d122f58f1f01d55057d98c Mon Sep 17 00:00:00 2001
From: Michael Murtaugh
Date: Fri, 15 Jan 2016 14:04:03 +0100
Subject: [PATCH] index now generalized template publishing

---
 etherdump/commands/index.py | 157 +++++++++++++++++++++++++++++-------
 etherdump/commands/rss.py   | 157 ------------------------------------
 2 files changed, 126 insertions(+), 188 deletions(-)
 delete mode 100644 etherdump/commands/rss.py

diff --git a/etherdump/commands/index.py b/etherdump/commands/index.py
index f2490e4..78e7ad3 100644
--- a/etherdump/commands/index.py
+++ b/etherdump/commands/index.py
@@ -1,13 +1,22 @@
 #!/usr/bin/env python
-
 from __future__ import print_function
 from argparse import ArgumentParser
-import json, os, re
+import sys, json, re, os, urlparse
+from datetime import datetime
 from urllib import urlencode
-from urllib2 import urlopen, HTTPError, URLError
+from urllib2 import HTTPError
 from jinja2 import FileSystemLoader, Environment
-from datetime import datetime
+from common import *
+from time import sleep
+import dateutil.parser

+"""
+index:
+    Generate pages from etherdumps using a template.
+
+    Built-in templates: rss.xml, index.html
+
+"""

 def group (items, key=lambda x: x):
     ret = []
@@ -22,23 +31,68 @@ def group (items, key=lambda x: x):
         ret.append(keys[k])
     return ret

-def main(args):
-    p = ArgumentParser("")
+def base (x):
+    return re.sub(r"(\.raw\.html)|(\.diff\.html)|(\.meta\.json)|(\.raw\.txt)$", "", x)
+
+def excerpt (t, chars=25):
+    if len(t) > chars:
+        t = t[:chars] + "..."
+    return t
+
+def absurl (url, base=None):
+    if not url.startswith("http"):
+        return base + url
+    return url
+
+def url_base (url):
+    (scheme, netloc, path, params, query, fragment) = urlparse.urlparse(url)
+    path, _ = os.path.split(path.lstrip("/"))
+    ret = urlparse.urlunparse((scheme, netloc, path, None, None, None))
+    if ret:
+        ret += "/"
+    return ret
+
+def main (args):
+    p = ArgumentParser("Convert dumped files to a document via a template.")
+
     p.add_argument("input", nargs="+", help="filenames")
-    p.add_argument("--templates", default=None, help="templates path")
+    p.add_argument("--templatepath", default=None, help="path to find templates, default: built-in")
+    p.add_argument("--template", default="index.html", help="template name, built-ins include index.html, rss.xml; default: index.html")
+    p.add_argument("--padinfo", default=".etherdump/settings.json", help="settings, default: .etherdump/settings.json")
+    # p.add_argument("--zerorevs", default=False, action="store_true", help="include pads with zero revisions, default: False (i.e. pads with no revisions are skipped)")
+
+    p.add_argument("--skip", default=None, type=int, help="skip this many items, default: None")
+    p.add_argument("--order", default="lastedited", help="order, possible values: padid, pad (no group name), lastedited (number of) authors, revisions, default: lastedited")
+    p.add_argument("--reverse", default=False, action="store_true", help="reverse order, default: False (reverse chrono)")
+    p.add_argument("--limit", type=int, default=0, help="limit to number of items, default: 0 (no limit)")
+
+    p.add_argument("--title", default="etherdump", help="title for document or rss feed channel title, default: etherdump")
+    p.add_argument("--description", default="", help="channel description, default: empty")
+    p.add_argument("--language", default="en-US", help="rss: feed language, default: en-US")
+    p.add_argument("--updatePeriod", default="daily", help="rss: updatePeriod, possible values: hourly, daily, weekly, monthly, yearly; default: daily")
+    p.add_argument("--updateFrequency", default=1, type=int, help="rss: update frequency within the update period (where 2 would mean twice per period); default: 1")
+    p.add_argument("--siteurl", default=None, help="rss: to use as channel's site link, default: the etherpad url")
+    p.add_argument("--feedurl", default="feed.xml", help="rss: to use as feeds own (self) link, default: feed.xml")
+    p.add_argument("--generator", default="https://gitlab.com/activearchives/etherdump", help="generator, default: https://gitlab.com/activearchives/etherdump")
+
+    p.add_argument("--content", default=False, action="store_true", help="rss: include content, default: False")
+    p.add_argument("--link", default="diffhtml,html,text", help="version to use as link, can be comma-delim list, use first avail, default: diffhtml,html,text")
+    p.add_argument("--linkbase", default=None, help="base url to use for links, default: try to use the feedurl")
+
     args = p.parse_args(args)

-    tmpath = args.templates
+    tmpath = args.templatepath
+    # Default path for template is the built-in data/templates
     if tmpath == None:
         tmpath = os.path.split(os.path.abspath(__file__))[0]
         tmpath = os.path.split(tmpath)[0]
         tmpath = os.path.join(tmpath, "data", "templates")

     env = Environment(loader=FileSystemLoader(tmpath))
-    template = env.get_template("index.html")
+    env.filters["excerpt"] = excerpt
+    template = env.get_template(args.template)

-    def base (x):
-        return re.sub(r"(\.raw\.html)|(\.diff\.html)|(\.meta\.json)|(\.raw\.txt)$", "", x)
+    info = loadpadinfo(args.padinfo)

     inputs = args.input
     inputs.sort()
@@ -50,23 +104,64 @@ def main(args):
             with open(p) as f:
                 return json.load(f)

-    inputs = map(loadmeta, inputs)
-    # sort by last edited (reverse)
-    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    print (template.render({"timestamp": timestamp, "pads": inputs}).encode("utf-8"))
-
-    # TODO: MODIFY THIS TO MAKE THE OUTPUT JOINABLE with the collected META DATA
-    # evt: how can the metadata become a GRAPH structure!!! with each output DOCUMENT
-    #
-    # print ("<ol>")
-    # for x in inputs:
-    #     padid = x
-    #     metapath = os.path.join(x, "{0}.meta.json".format(padid))
-    #     if os.path.exists(metapath):
-    #         print ("""<li><a href="{0}">{0}</a></li>""".format(x))
-    #             with open(metapath) as f:
-    #                 meta = json.load(f)
-    #             indexpath = os.path.join(x, "index.html")
-    #             with open(indexpath, "w") as f:
-
-    # print ("</ol>")
+    def fixdates (padmeta):
+        d = dateutil.parser.parse(padmeta["lastedited_iso"])
+        padmeta["lastedited"] = d
+        padmeta["lastedited_822"] = d.strftime("%a, %d %b %Y %H:%M:%S +0000")
+        return padmeta
+
+    pads = map(loadmeta, inputs)
+    pads = map(fixdates, pads)
+    args.pads = pads
+
+    # args.timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+    padurlbase = re.sub(r"api/1.2.9/$", "p/", info["apiurl"])
+    if type(padurlbase) == unicode:
+        padurlbase = padurlbase.encode("utf-8")
+    args.siteurl = args.siteurl or padurlbase
+    args.utcnow = datetime.utcnow().strftime("%a, %d %b %Y %H:%M:%S +0000")
+
+    # order items & apply limit
+    if args.order == "lastedited":
+        args.pads.sort(key=lambda x: x.get("lastedited_iso"), reverse=args.reverse)
+    elif args.order == "pad":
+        args.pads.sort(key=lambda x: x.get("pad"), reverse=args.reverse)
+    elif args.order == "padid":
+        args.pads.sort(key=lambda x: x.get("padid"), reverse=args.reverse)
+    elif args.order == "revisions":
+        args.pads.sort(key=lambda x: x.get("revisions"), reverse=args.reverse)
+    elif args.order == "authors":
+        args.pads.sort(key=lambda x: len(x.get("authors")), reverse=args.reverse)
+    else:
+        raise Exception("That ordering is not implemented!")
+
+    if args.limit:
+        args.pads = args.pads[:args.limit]
+
+    # add versions_by_type, add in full text
+    # add link (based on args.link)
+    linkversions = args.link.split(",")
+    linkbase = args.linkbase or url_base(args.feedurl)
+    # print ("linkbase", linkbase, args.linkbase, args.feedurl)
+
+    for p in pads:
+        versions_by_type = {}
+        p["versions_by_type"] = versions_by_type
+        for v in p["versions"]:
+            t = v["type"]
+            versions_by_type[t] = v
+        with open (versions_by_type["text"]["path"]) as f:
+            p["text"] = f.read().decode("utf-8")
+
+        # ADD IN LINK
+        for v in linkversions:
+            vdata = versions_by_type[v]
+            try:
+                if v == "pad" or os.path.exists(vdata["path"]):
+                    p["link"] = absurl(vdata["url"], linkbase)
+                    break
+            except KeyError as e:
+                pass
+
+    print (template.render(vars(args)).encode("utf-8"))
diff --git a/etherdump/commands/rss.py b/etherdump/commands/rss.py
deleted file mode 100644
index 7ce86bf..0000000
--- a/etherdump/commands/rss.py
+++ /dev/null
@@ -1,157 +0,0 @@
-#!/usr/bin/env python
-from __future__ import print_function
-from argparse import ArgumentParser
-import sys, json, re, os, urlparse
-from datetime import datetime
-from urllib import urlencode
-from urllib2 import HTTPError
-from jinja2 import FileSystemLoader, Environment
-from common import *
-from time import sleep
-import dateutil.parser
-
-"""
-rss:
-    Generate an RSS feed from an etherdump.
-
-
-TODO NEXT
-add back limit and ordering parameters to create filters to make a latest changes feed!
-
-"""
-
-def group (items, key=lambda x: x):
-    ret = []
-    keys = {}
-    for item in items:
-        k = key(item)
-        if k not in keys:
-            keys[k] = []
-        keys[k].append(item)
-    for k in sorted(keys):
-        keys[k].sort()
-        ret.append(keys[k])
-    return ret
-
-def base (x):
-    return re.sub(r"(\.raw\.html)|(\.diff\.html)|(\.meta\.json)|(\.raw\.txt)$", "", x)
-
-def excerpt (t, chars=25):
-    if len(t) > chars:
-        t = t[:chars] + "..."
-    return t
-
-def absurl (url, base=None):
-    if not url.startswith("http"):
-        return base + url
-    return url
-
-def url_base (url):
-    (scheme, netloc, path, params, query, fragment) = urlparse.urlparse(url)
-    path, _ = os.path.split(path.lstrip("/"))
-    ret = urlparse.urlunparse((scheme, netloc, path, None, None, None))
-    if ret:
-        ret += "/"
-    return ret
-
-def main (args):
-    p = ArgumentParser("Check for pads that have changed since last sync (according to .meta.json)")
-
-    p.add_argument("input", nargs="+", help="filenames")
-    p.add_argument("--templates", default=None, help="templates path")
-
-    p.add_argument("--padinfo", default=".etherdump/settings.json", help="settings, default: .etherdump/settings.json")
-    p.add_argument("--zerorevs", default=False, action="store_true", help="include pads with zero revisions, default: False (i.e. pads with no revisions are skipped)")
-
-    p.add_argument("--skip", default=None, type=int, help="skip this many items, default: None")
-    p.add_argument("--type", default="recentchanges", help="type of feed, default: recentchanges")
-
-    p.add_argument("--limit", type=int, default=10, help="number of items, default: 10")
-    p.add_argument("--chronological", default=False, action="store_true", help="order chronologically, default: False (reverse chrono)")
-
-    p.add_argument("--title", default="etherpad", help="rss feed channel title, default: etherpad")
-    p.add_argument("--description", default="", help="channel description, default: empty")
-    p.add_argument("--language", default="en-US", help="feed language, default: en-US")
-    p.add_argument("--updatePeriod", default="daily", help="updatePeriod, possible values: hourly, daily, weekly, monthly, yearly; default: daily")
-    p.add_argument("--updateFrequency", default=1, type=int, help="update frequency within the update period (where 2 would mean twice per period); default: 1")
-    p.add_argument("--siteurl", default=None, help="to use as channel's site link, default: the etherpad url")
-    p.add_argument("--feedurl", default="feed.xml", help="to use as feeds own (self) link, default: feed.xml")
-    p.add_argument("--generator", default="https://gitlab.com/activearchives/etherdump", help="generator, default: https://gitlab.com/activearchives/etherdump")
-
-    p.add_argument("--content", default=False, action="store_true", help="include content, default: False")
-    p.add_argument("--link", default="diffhtml,html,text", help="version to use as link, can be comma-delim list, use first avail, default: diffhtml,html,text")
-    p.add_argument("--linkbase", default=None, help="base url to use for links, default: try to use the feedurl")
-
-    args = p.parse_args(args)
-
-    tmpath = args.templates
-    if tmpath == None:
-        tmpath = os.path.split(os.path.abspath(__file__))[0]
-        tmpath = os.path.split(tmpath)[0]
-        tmpath = os.path.join(tmpath, "data", "templates")
-
-    env = Environment(loader=FileSystemLoader(tmpath))
-    env.filters["excerpt"] = excerpt
-    template = env.get_template("rss.xml")
-
-    info = loadpadinfo(args.padinfo)
-
-    inputs = args.input
-    inputs.sort()
-    inputs = group(inputs, base)
-
-    def loadmeta(paths):
-        for p in paths:
-            if p.endswith(".meta.json"):
-                with open(p) as f:
-                    return json.load(f)
-
-    def fixdates (padmeta):
-        d = dateutil.parser.parse(padmeta["lastedited_iso"])
-        padmeta["lastedited"] = d
-        padmeta["lastedited_822"] = d.strftime("%a, %d %b %Y %H:%M:%S +0000")
-        return padmeta
-
-    pads = map(loadmeta, inputs)
-    pads = map(fixdates, pads)
-    args.pads = pads
-
-    # args.timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-
-    padurlbase = re.sub(r"api/1.2.9/$", "p/", info["apiurl"])
-    if type(padurlbase) == unicode:
-        padurlbase = padurlbase.encode("utf-8")
-    args.siteurl = args.siteurl or padurlbase
-    args.utcnow = datetime.utcnow().strftime("%a, %d %b %Y %H:%M:%S +0000")
-
-    # order items & apply limit
-    args.pads.sort(key=lambda x: x.get("lastedited_iso"), reverse=not args.chronological)
-    if args.limit:
-        args.pads = args.pads[:args.limit]
-
-    # add versions_by_type, add in full text
-    # add link (based on args.link)
-    linkversions = args.link.split(",")
-    linkbase = args.linkbase or url_base(args.feedurl)
-    # print ("linkbase", linkbase, args.linkbase, args.feedurl)
-
-    for p in pads:
-        versions_by_type = {}
-        p["versions_by_type"] = versions_by_type
-        for v in p["versions"]:
-            t = v["type"]
-            versions_by_type[t] = v
-        with open (versions_by_type["text"]["path"]) as f:
-            p["text"] = f.read().decode("utf-8")
-
-        # ADD IN LINK
-        for v in linkversions:
-            vdata = versions_by_type[v]
-            try:
-                if v == "pad" or os.path.exists(vdata["path"]):
-                    p["link"] = absurl(vdata["url"], linkbase)
-                    break
-            except KeyError as e:
-                pass
-
-    print (template.render(vars(args)).encode("utf-8"))
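
Usage sketch (not part of the patch): with this change, the output of the removed
rss subcommand would presumably be produced by the generalized index command by
selecting the built-in rss.xml template. A minimal sketch, assuming a working
directory that contains .etherdump/settings.json plus dumped *.meta.json files and
the text versions they reference; the pad filenames, feed URL, and title below are
hypothetical:

    # Render the built-in rss.xml template over two dumped pads (hypothetical paths).
    # The rendered document is printed to stdout, so a caller would redirect it,
    # e.g. to feed.xml.
    from etherdump.commands.index import main

    main([
        "dumps/pad-one.meta.json",                    # hypothetical dumped pads
        "dumps/pad-two.meta.json",
        "--template", "rss.xml",                      # built-in RSS template instead of index.html
        "--order", "lastedited",                      # sort by last-edit time (the default)
        "--limit", "10",
        "--title", "recent pad changes",
        "--feedurl", "https://example.org/feed.xml",  # assumed feed location
    ])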