#!/usr/bin/env python from __future__ import print_function from argparse import ArgumentParser import sys, json, re, os, urlparse from datetime import datetime from urllib import urlencode from urllib2 import HTTPError from jinja2 import FileSystemLoader, Environment from common import * from time import sleep import dateutil.parser """ index: Generate pages from etherdumps using a template. Built-in templates: rss.xml, index.html """ def group (items, key=lambda x: x): ret = [] keys = {} for item in items: k = key(item) if k not in keys: keys[k] = [] keys[k].append(item) for k in sorted(keys): keys[k].sort() ret.append(keys[k]) return ret def base (x): return re.sub(r"(\.raw\.html)|(\.diff\.html)|(\.meta\.json)|(\.raw\.txt)$", "", x) def excerpt (t, chars=25): if len(t) > chars: t = t[:chars] + "..." return t def absurl (url, base=None): if not url.startswith("http"): return base + url return url def url_base (url): (scheme, netloc, path, params, query, fragment) = urlparse.urlparse(url) path, _ = os.path.split(path.lstrip("/")) ret = urlparse.urlunparse((scheme, netloc, path, None, None, None)) if ret: ret += "/" return ret def main (args): p = ArgumentParser("Convert dumped files to a document via a template.") p.add_argument("input", nargs="+", help="filenames (uses .meta.json files)") p.add_argument("--templatepath", default=None, help="path to find templates, default: built-in") p.add_argument("--template", default="index.html", help="template name, built-ins include index.html, rss.xml; default: index.html") p.add_argument("--padinfo", default=".etherdump/settings.json", help="settings, default: ./.etherdump/settings.json") # p.add_argument("--zerorevs", default=False, action="store_true", help="include pads with zero revisions, default: False (i.e. pads with no revisions are skipped)") p.add_argument("--order", default="padid", help="order, possible values: padid, pad (no group name), lastedited, (number of) authors, revisions, default: padid") p.add_argument("--reverse", default=False, action="store_true", help="reverse order, default: False (reverse chrono)") p.add_argument("--limit", type=int, default=0, help="limit to number of items, default: 0 (no limit)") p.add_argument("--skip", default=None, type=int, help="skip this many items, default: None") p.add_argument("--content", default=False, action="store_true", help="rss: include (full) content tag, default: False") p.add_argument("--link", default="diffhtml,html,text", help="link variable will be to this version, can be comma-delim list, use first avail, default: diffhtml,html,text") p.add_argument("--linkbase", default=None, help="base url to use for links, default: try to use the feedurl") p.add_argument("--output", default=None, help="output, default: stdout") pg = p.add_argument_group('template variables') pg.add_argument("--feedurl", default="feed.xml", help="rss: to use as feeds own (self) link, default: feed.xml") pg.add_argument("--siteurl", default=None, help="rss: to use as channel's site link, default: the etherpad url") pg.add_argument("--title", default="etherdump", help="title for document or rss feed channel title, default: etherdump") pg.add_argument("--description", default="", help="rss: channel description, default: empty") pg.add_argument("--language", default="en-US", help="rss: feed language, default: en-US") pg.add_argument("--updatePeriod", default="daily", help="rss: updatePeriod, possible values: hourly, daily, weekly, monthly, yearly; default: daily") pg.add_argument("--updateFrequency", default=1, type=int, help="rss: update frequency within the update period (where 2 would mean twice per period); default: 1") pg.add_argument("--generator", default="https://gitlab.com/activearchives/etherdump", help="generator, default: https://gitlab.com/activearchives/etherdump") pg.add_argument("--timestamp", default=None, help="timestamp, default: now (e.g. 2015-12-01 12:30:00)") pg.add_argument("--next", default=None, help="next link, default: None)") pg.add_argument("--prev", default=None, help="prev link, default: None") args = p.parse_args(args) tmpath = args.templatepath # Default path for template is the built-in data/templates if tmpath == None: tmpath = os.path.split(os.path.abspath(__file__))[0] tmpath = os.path.split(tmpath)[0] tmpath = os.path.join(tmpath, "data", "templates") env = Environment(loader=FileSystemLoader(tmpath)) env.filters["excerpt"] = excerpt template = env.get_template(args.template) info = loadpadinfo(args.padinfo) inputs = args.input inputs.sort() inputs = group(inputs, base) def loadmeta(paths): for p in paths: if p.endswith(".meta.json"): with open(p) as f: return json.load(f) def fixdates (padmeta): d = dateutil.parser.parse(padmeta["lastedited_iso"]) padmeta["lastedited"] = d padmeta["lastedited_822"] = d.strftime("%a, %d %b %Y %H:%M:%S +0000") return padmeta pads = map(loadmeta, inputs) pads = map(fixdates, pads) args.pads = pads if args.timestamp == None: args.timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") padurlbase = re.sub(r"api/1.2.9/$", "p/", info["apiurl"]) if type(padurlbase) == unicode: padurlbase = padurlbase.encode("utf-8") args.siteurl = args.siteurl or padurlbase args.utcnow = datetime.utcnow().strftime("%a, %d %b %Y %H:%M:%S +0000") # order items & apply limit if args.order == "lastedited": args.pads.sort(key=lambda x: x.get("lastedited_iso"), reverse=args.reverse) elif args.order == "pad": args.pads.sort(key=lambda x: x.get("pad"), reverse=args.reverse) elif args.order == "padid": args.pads.sort(key=lambda x: x.get("padid"), reverse=args.reverse) elif args.order == "revisions": args.pads.sort(key=lambda x: x.get("revisions"), reverse=args.reverse) elif args.order == "authors": args.pads.sort(key=lambda x: len(x.get("authors")), reverse=args.reverse) else: raise Exception("That ordering is not implemented!") if args.limit: args.pads = args.pads[:args.limit] # add versions_by_type, add in full text # add link (based on args.link) linkversions = args.link.split(",") linkbase = args.linkbase or url_base(args.feedurl) # print ("linkbase", linkbase, args.linkbase, args.feedurl) for p in pads: versions_by_type = {} p["versions_by_type"] = versions_by_type for v in p["versions"]: t = v["type"] versions_by_type[t] = v with open (versions_by_type["text"]["path"]) as f: p["text"] = f.read().decode("utf-8") # ADD IN LINK for v in linkversions: vdata = versions_by_type[v] try: if v == "pad" or os.path.exists(vdata["path"]): p["link"] = absurl(vdata["url"], linkbase) break except KeyError as e: pass if args.output: with open(args.output, "w") as f: print (template.render(vars(args)).encode("utf-8"), file=f) else: print (template.render(vars(args)).encode("utf-8"))