index now generalized template publishing

This commit is contained in:
Michael Murtaugh 2016-01-15 14:04:03 +01:00
parent d4f5aae657
commit 55fbdea410
2 changed files with 125 additions and 187 deletions

View File

@ -1,13 +1,22 @@
#!/usr/bin/env python #!/usr/bin/env python
from __future__ import print_function from __future__ import print_function
from argparse import ArgumentParser from argparse import ArgumentParser
import json, os, re import sys, json, re, os, urlparse
from urllib import urlencode
from urllib2 import urlopen, HTTPError, URLError
from jinja2 import FileSystemLoader, Environment
from datetime import datetime from datetime import datetime
from urllib import urlencode
from urllib2 import HTTPError
from jinja2 import FileSystemLoader, Environment
from common import *
from time import sleep
import dateutil.parser
"""
index:
Generate pages from etherdumps using a template.
Built-in templates: rss.xml, index.html
"""
def group (items, key=lambda x: x): def group (items, key=lambda x: x):
ret = [] ret = []
@ -22,23 +31,68 @@ def group (items, key=lambda x: x):
ret.append(keys[k]) ret.append(keys[k])
return ret return ret
def base (x):
    """Return filename *x* with a trailing etherdump version suffix removed.

    Recognized suffixes are ``.raw.html``, ``.diff.html``, ``.meta.json``
    and ``.raw.txt``. Names without one of these suffixes are returned
    unchanged.
    """
    # The alternatives are grouped so that "$" anchors every one of them;
    # ungrouped, only the last alternative was tied to the end of the name
    # and the others were stripped from anywhere inside it.
    return re.sub(r"(\.raw\.html|\.diff\.html|\.meta\.json|\.raw\.txt)$", "", x)
def excerpt (t, chars=25):
    """Shorten *t* to at most *chars* characters followed by "...".

    Text that is already *chars* long or shorter is returned untouched.
    """
    if len(t) <= chars:
        return t
    return t[:chars] + "..."
def absurl (url, base=None):
    """Return *url* made absolute by prefixing *base* when it is relative.

    A url is treated as already absolute only when it starts with an
    explicit ``http://`` or ``https://`` scheme (case-insensitive), so a
    relative file name that merely begins with "http" still gets the base.
    When *base* is None or empty the url is returned as-is instead of
    raising a TypeError on ``None + url``.
    """
    if url.lower().startswith(("http://", "https://")):
        return url
    return (base or "") + url
def url_base (url):
    """Return the "directory" part of *url*, ending in "/".

    The last path segment is dropped and params, query and fragment are
    discarded. Leading slashes of the path are stripped before splitting.
    An empty string is returned when nothing remains (e.g. for a bare
    file name such as "feed.xml").
    """
    parts = urlparse.urlparse(url)
    scheme, netloc, fullpath = parts[0], parts[1], parts[2]
    # Keep only the directory portion of the path.
    directory = os.path.split(fullpath.lstrip("/"))[0]
    rebuilt = urlparse.urlunparse((scheme, netloc, directory, None, None, None))
    if not rebuilt:
        return rebuilt
    return rebuilt + "/"
def main (args): def main (args):
p = ArgumentParser("") p = ArgumentParser("Convert dumped files to a document via a template.")
p.add_argument("input", nargs="+", help="filenames") p.add_argument("input", nargs="+", help="filenames")
p.add_argument("--templates", default=None, help="templates path") p.add_argument("--templatepath", default=None, help="path to find templates, default: built-in")
p.add_argument("--template", default="index.html", help="template name, built-ins include index.html, rss.xml; default: index.html")
p.add_argument("--padinfo", default=".etherdump/settings.json", help="settings, default: .etherdump/settings.json")
# p.add_argument("--zerorevs", default=False, action="store_true", help="include pads with zero revisions, default: False (i.e. pads with no revisions are skipped)")
p.add_argument("--skip", default=None, type=int, help="skip this many items, default: None")
p.add_argument("--order", default="lastedited", help="order, possible values: padid, pad (no group name), lastedited (number of) authors, revisions, default: lastedited")
p.add_argument("--reverse", default=False, action="store_true", help="reverse order, default: False (reverse chrono)")
p.add_argument("--limit", type=int, default=0, help="limit to number of items, default: 0 (no limit)")
p.add_argument("--title", default="etherdump", help="title for document or rss feed channel title, default: etherdump")
p.add_argument("--description", default="", help="channel description, default: empty")
p.add_argument("--language", default="en-US", help="rss: feed language, default: en-US")
p.add_argument("--updatePeriod", default="daily", help="rss: updatePeriod, possible values: hourly, daily, weekly, monthly, yearly; default: daily")
p.add_argument("--updateFrequency", default=1, type=int, help="rss: update frequency within the update period (where 2 would mean twice per period); default: 1")
p.add_argument("--siteurl", default=None, help="rss: to use as channel's site link, default: the etherpad url")
p.add_argument("--feedurl", default="feed.xml", help="rss: to use as feeds own (self) link, default: feed.xml")
p.add_argument("--generator", default="https://gitlab.com/activearchives/etherdump", help="generator, default: https://gitlab.com/activearchives/etherdump")
p.add_argument("--content", default=False, action="store_true", help="rss: include content, default: False")
p.add_argument("--link", default="diffhtml,html,text", help="version to use as link, can be comma-delim list, use first avail, default: diffhtml,html,text")
p.add_argument("--linkbase", default=None, help="base url to use for links, default: try to use the feedurl")
args = p.parse_args(args) args = p.parse_args(args)
tmpath = args.templates tmpath = args.templatepath
# Default path for template is the built-in data/templates
if tmpath == None: if tmpath == None:
tmpath = os.path.split(os.path.abspath(__file__))[0] tmpath = os.path.split(os.path.abspath(__file__))[0]
tmpath = os.path.split(tmpath)[0] tmpath = os.path.split(tmpath)[0]
tmpath = os.path.join(tmpath, "data", "templates") tmpath = os.path.join(tmpath, "data", "templates")
env = Environment(loader=FileSystemLoader(tmpath)) env = Environment(loader=FileSystemLoader(tmpath))
template = env.get_template("index.html") env.filters["excerpt"] = excerpt
template = env.get_template(args.template)
def base (x): info = loadpadinfo(args.padinfo)
return re.sub(r"(\.raw\.html)|(\.diff\.html)|(\.meta\.json)|(\.raw\.txt)$", "", x)
inputs = args.input inputs = args.input
inputs.sort() inputs.sort()
@ -50,23 +104,64 @@ def main(args):
with open(p) as f: with open(p) as f:
return json.load(f) return json.load(f)
inputs = map(loadmeta, inputs) def fixdates (padmeta):
# sort by last edited (reverse) d = dateutil.parser.parse(padmeta["lastedited_iso"])
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") padmeta["lastedited"] = d
print (template.render({"timestamp": timestamp, "pads": inputs}).encode("utf-8")) padmeta["lastedited_822"] = d.strftime("%a, %d %b %Y %H:%M:%S +0000")
return padmeta
# TODO: MODIFY THIS TO MAKE THE OUTPUT JOINABLE with the collected META DATA pads = map(loadmeta, inputs)
# evt: how can the metadata become a GRAPH structure!!! with each output DOCUMENT pads = map(fixdates, pads)
# args.pads = pads
# print ("<ol>")
# for x in inputs:
# padid = x
# metapath = os.path.join(x, "{0}.meta.json".format(padid))
# if os.path.exists(metapath):
# print ("""<li><a href="{0}">{0}</a></li>""".format(x))
# with open(metapath) as f:
# meta = json.load(f)
# indexpath = os.path.join(x, "index.html")
# with open(indexpath, "w") as f:
# print ("</ol>") # args.timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
padurlbase = re.sub(r"api/1.2.9/$", "p/", info["apiurl"])
if type(padurlbase) == unicode:
padurlbase = padurlbase.encode("utf-8")
args.siteurl = args.siteurl or padurlbase
args.utcnow = datetime.utcnow().strftime("%a, %d %b %Y %H:%M:%S +0000")
# order items & apply limit
if args.order == "lastedited":
args.pads.sort(key=lambda x: x.get("lastedited_iso"), reverse=args.reverse)
elif args.order == "pad":
args.pads.sort(key=lambda x: x.get("pad"), reverse=args.reverse)
elif args.order == "padid":
args.pads.sort(key=lambda x: x.get("padid"), reverse=args.reverse)
elif args.order == "revisions":
args.pads.sort(key=lambda x: x.get("revisions"), reverse=args.reverse)
elif args.order == "authors":
args.pads.sort(key=lambda x: len(x.get("authors")), reverse=args.reverse)
else:
raise Exception("That ordering is not implemented!")
if args.limit:
args.pads = args.pads[:args.limit]
# add versions_by_type, add in full text
# add link (based on args.link)
linkversions = args.link.split(",")
linkbase = args.linkbase or url_base(args.feedurl)
# print ("linkbase", linkbase, args.linkbase, args.feedurl)
for p in pads:
versions_by_type = {}
p["versions_by_type"] = versions_by_type
for v in p["versions"]:
t = v["type"]
versions_by_type[t] = v
with open (versions_by_type["text"]["path"]) as f:
p["text"] = f.read().decode("utf-8")
# ADD IN LINK
for v in linkversions:
vdata = versions_by_type[v]
try:
if v == "pad" or os.path.exists(vdata["path"]):
p["link"] = absurl(vdata["url"], linkbase)
break
except KeyError as e:
pass
print (template.render(vars(args)).encode("utf-8"))

View File

@ -1,157 +0,0 @@
#!/usr/bin/env python
from __future__ import print_function
from argparse import ArgumentParser
import sys, json, re, os, urlparse
from datetime import datetime
from urllib import urlencode
from urllib2 import HTTPError
from jinja2 import FileSystemLoader, Environment
from common import *
from time import sleep
import dateutil.parser
"""
rss:
Generate an RSS feed from an etherdump.
TODO NEXT
add back limit and ordering parameters to create filters to make a latest changes feed!
"""
def group (items, key=lambda x: x):
    """Bucket *items* by ``key(item)`` and return the buckets as a list.

    Buckets are ordered by their sorted key, and the items inside each
    bucket are sorted as well.
    """
    buckets = {}
    for entry in items:
        buckets.setdefault(key(entry), []).append(entry)
    return [sorted(buckets[name]) for name in sorted(buckets)]
def base (x):
    """Strip a trailing etherdump version suffix from the file name *x*.

    The suffixes handled are ``.raw.html``, ``.diff.html``, ``.meta.json``
    and ``.raw.txt``; any other name comes back unchanged.
    """
    # One group around all alternatives makes the "$" anchor apply to each
    # suffix. Without the group only ".raw.txt" was anchored and the other
    # three were deleted wherever they occurred in the name.
    return re.sub(r"(\.raw\.html|\.diff\.html|\.meta\.json|\.raw\.txt)$", "", x)
def excerpt (t, chars=25):
    """Return *t* cut down to *chars* characters plus a "..." marker.

    Strings no longer than *chars* are passed through as they are.
    """
    return t[:chars] + "..." if len(t) > chars else t
def absurl (url, base=None):
    """Prefix *base* to *url* unless *url* is already absolute.

    Only an explicit ``http://`` or ``https://`` scheme (matched without
    regard to case) marks a url as absolute; a relative name that happens
    to start with the letters "http" is still joined to the base. A
    missing (None) or empty *base* leaves the url unchanged rather than
    failing on string concatenation with None.
    """
    is_absolute = url.lower().startswith(("http://", "https://"))
    if is_absolute or not base:
        return url
    return base + url
def url_base (url):
    """Compute the base ("directory") url of *url* with a trailing slash.

    The final path segment, params, query and fragment are dropped; the
    path has its leading slashes removed before it is split. When the
    rebuilt url is empty (for instance for a bare "feed.xml") the empty
    string is returned without a slash.
    """
    pieces = urlparse.urlparse(url)
    folder = os.path.dirname(pieces[2].lstrip("/"))
    result = urlparse.urlunparse((pieces[0], pieces[1], folder, None, None, None))
    return result + "/" if result else result
def main (args):
    """Render the rss.xml template from dumped pad files and print it.

    *args* is the list of command line arguments. The input file names are
    grouped per pad (by their name with the version suffix stripped), the
    ``.meta.json`` of each group is loaded, and the resulting pad records
    are sorted by ``lastedited_iso``, optionally skipped/limited, enriched
    with their text and a link, and handed to the template together with
    all parsed options. The rendered feed is printed UTF-8 encoded.
    """
    parser = ArgumentParser("Check for pads that have changed since last sync (according to .meta.json)")
    parser.add_argument("input", nargs="+", help="filenames")
    parser.add_argument("--templates", default=None, help="templates path")
    parser.add_argument("--padinfo", default=".etherdump/settings.json", help="settings, default: .etherdump/settings.json")
    # NOTE(review): --zerorevs and --type are accepted but nothing below reads them.
    parser.add_argument("--zerorevs", default=False, action="store_true", help="include pads with zero revisions, default: False (i.e. pads with no revisions are skipped)")
    parser.add_argument("--skip", default=None, type=int, help="skip this many items, default: None")
    parser.add_argument("--type", default="recentchanges", help="type of feed, default: recentchanges")
    parser.add_argument("--limit", type=int, default=10, help="number of items, default: 10")
    parser.add_argument("--chronological", default=False, action="store_true", help="order chronologically, default: False (reverse chrono)")
    parser.add_argument("--title", default="etherpad", help="rss feed channel title, default: etherpad")
    parser.add_argument("--description", default="", help="channel description, default: empty")
    parser.add_argument("--language", default="en-US", help="feed language, default: en-US")
    parser.add_argument("--updatePeriod", default="daily", help="updatePeriod, possible values: hourly, daily, weekly, monthly, yearly; default: daily")
    parser.add_argument("--updateFrequency", default=1, type=int, help="update frequency within the update period (where 2 would mean twice per period); default: 1")
    parser.add_argument("--siteurl", default=None, help="to use as channel's site link, default: the etherpad url")
    parser.add_argument("--feedurl", default="feed.xml", help="to use as feeds own (self) link, default: feed.xml")
    parser.add_argument("--generator", default="https://gitlab.com/activearchives/etherdump", help="generator, default: https://gitlab.com/activearchives/etherdump")
    parser.add_argument("--content", default=False, action="store_true", help="include content, default: False")
    parser.add_argument("--link", default="diffhtml,html,text", help="version to use as link, can be comma-delim list, use first avail, default: diffhtml,html,text")
    parser.add_argument("--linkbase", default=None, help="base url to use for links, default: try to use the feedurl")
    args = parser.parse_args(args)

    # Without --templates, fall back to <parent of this file's dir>/data/templates.
    tmpath = args.templates
    if tmpath is None:
        tmpath = os.path.split(os.path.abspath(__file__))[0]
        tmpath = os.path.split(tmpath)[0]
        tmpath = os.path.join(tmpath, "data", "templates")

    env = Environment(loader=FileSystemLoader(tmpath))
    env.filters["excerpt"] = excerpt
    template = env.get_template("rss.xml")

    info = loadpadinfo(args.padinfo)

    inputs = args.input
    inputs.sort()
    inputs = group(inputs, base)

    def loadmeta(paths):
        """Load the first .meta.json among *paths*; None when there is none."""
        for path in paths:
            if path.endswith(".meta.json"):
                with open(path) as f:
                    return json.load(f)
        return None

    def fixdates (padmeta):
        """Add parsed ("lastedited") and RFC 822 style date fields."""
        d = dateutil.parser.parse(padmeta["lastedited_iso"])
        padmeta["lastedited"] = d
        padmeta["lastedited_822"] = d.strftime("%a, %d %b %Y %H:%M:%S +0000")
        return padmeta

    # Groups without a .meta.json yield None and are dropped here; passing
    # None on to fixdates would raise a TypeError.
    pads = []
    for paths in inputs:
        meta = loadmeta(paths)
        if meta is not None:
            pads.append(fixdates(meta))
    args.pads = pads

    # args.timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # Derive the pad url base from the api url (dots escaped so that only a
    # literal "1.2.9" version segment is rewritten).
    padurlbase = re.sub(r"api/1\.2\.9/$", "p/", info["apiurl"])
    if isinstance(padurlbase, unicode):
        padurlbase = padurlbase.encode("utf-8")
    args.siteurl = args.siteurl or padurlbase
    args.utcnow = datetime.utcnow().strftime("%a, %d %b %Y %H:%M:%S +0000")

    # order items, then apply skip & limit
    args.pads.sort(key=lambda x: x.get("lastedited_iso"), reverse=not args.chronological)

    if args.skip:
        args.pads = args.pads[args.skip:]
    if args.limit:
        args.pads = args.pads[:args.limit]

    # add versions_by_type, add in full text
    # add link (based on args.link)
    linkversions = args.link.split(",")
    linkbase = args.linkbase or url_base(args.feedurl)
    # print ("linkbase", linkbase, args.linkbase, args.feedurl)
    # Only the pads that survived skip/limit reach the template, so only
    # those need their text read and link resolved.
    for pad in args.pads:
        versions_by_type = {}
        pad["versions_by_type"] = versions_by_type
        for version in pad["versions"]:
            versions_by_type[version["type"]] = version

        with open (versions_by_type["text"]["path"]) as f:
            pad["text"] = f.read().decode("utf-8")

        # ADD IN LINK: the first listed version type that is available wins.
        for linktype in linkversions:
            vdata = versions_by_type.get(linktype)
            if vdata is None:
                # This pad has no version of that type; try the next one.
                continue
            try:
                if linktype == "pad" or os.path.exists(vdata["path"]):
                    pad["link"] = absurl(vdata["url"], linkbase)
                    break
            except KeyError:
                # Version record lacks "path"/"url"; try the next type.
                continue

    print (template.render(vars(args)).encode("utf-8"))