from __future__ import print_function
from argparse import ArgumentParser
import sys, json, re, os, time
from datetime import datetime
from time import sleep
import dateutil.parser

try:
    # python2
    from urllib2 import urlopen, URLError, HTTPError
    from urllib import urlencode
    from urlparse import urlparse, urlunparse
except ImportError:
    # python3
    from urllib.parse import urlparse, urlunparse, urlencode, quote
    from urllib.request import urlopen
    from urllib.error import URLError, HTTPError

from jinja2 import FileSystemLoader, Environment
from etherdump.commands.common import *
"""
index :
Generate pages from etherdumps using a template .
Built - in templates : rss . xml , index . html
"""
def group(items, key=lambda x: x):
    """ returns a list of lists of items, grouped by a key function """
    ret = []
    keys = {}
    for item in items:
        k = key(item)
        if k not in keys:
            keys[k] = []
        keys[k].append(item)
    for k in sorted(keys):
        keys[k].sort()
        ret.append(keys[k])
    return ret
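
# Illustrative example (not from the original source; `base` is defined
# below): grouping dump files that share a base name --
#   >>> group(["pad.meta.json", "other.raw.txt", "pad.raw.txt"], key=base)
#   [['other.raw.txt'], ['pad.meta.json', 'pad.raw.txt']]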

def splitextlong(x):
    """ split "long" extensions, i.e. foo.bar.baz => ('foo', '.bar.baz') """
    m = re.search(r"^(.*?)(\..*)$", x)
    if m:
        return m.groups()
    else:
        return x, ''

def base(x):
    return splitextlong(x)[0]
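
# For example (illustrative): everything from the *first* dot is treated
# as the extension, so foo.raw.txt, foo.diff.html and foo.meta.json all
# share the base "foo" --
#   >>> splitextlong("foo.raw.txt")
#   ('foo', '.raw.txt')
#   >>> splitextlong("foo")
#   ('foo', '')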

def excerpt(t, chars=25):
    if len(t) > chars:
        t = t[:chars] + "..."
    return t
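
# For example (illustrative): the first 25 characters plus an ellipsis;
# shorter strings pass through unchanged --
#   >>> excerpt("the quick brown fox jumps over the lazy dog")
#   'the quick brown fox jumps...'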

def absurl(url, base=None):
    if not url.startswith("http"):
        return base + url
    return url
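
# For example (illustrative; example.org is a placeholder host):
#   >>> absurl("foo.diff.html", "http://example.org/p/")
#   'http://example.org/p/foo.diff.html'
#   >>> absurl("http://example.org/x")
#   'http://example.org/x'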

def url_base(url):
    (scheme, netloc, path, params, query, fragment) = urlparse(url)
    path, _ = os.path.split(path.lstrip("/"))
    ret = urlunparse((scheme, netloc, path, None, None, None))
    if ret:
        ret += "/"
    return ret
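
# For example (illustrative): the directory of the url's path, with a
# trailing slash; a bare relative name yields "", which is why --linkbase
# may be needed when --feedurl is relative --
#   >>> url_base("http://example.org/feeds/feed.xml")
#   'http://example.org/feeds/'
#   >>> url_base("feed.xml")
#   ''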

def datetimeformat(t, format='%Y-%m-%d %H:%M:%S'):
    if type(t) == str:
        dt = dateutil.parser.parse(t)
        return dt.strftime(format)
    else:
        return time.strftime(format, time.localtime(t))
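
# For example (illustrative): a parseable date string is reformatted,
# while a numeric timestamp is formatted in local time --
#   >>> datetimeformat("2015-12-01T12:30:00")
#   '2015-12-01 12:30:00'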

def main(args):
    p = ArgumentParser(description="Convert dumped files to a document via a template.")
    p.add_argument("input", nargs="+", help="files to list (.meta.json files)")
    p.add_argument("--templatepath", default=None, help="path to find templates, default: built-in")
    p.add_argument("--template", default="index.html", help="template name, built-ins include index.html, rss.xml; default: index.html")
    p.add_argument("--padinfo", default=".etherdump/settings.json", help="settings, default: ./.etherdump/settings.json")
    # p.add_argument("--zerorevs", default=False, action="store_true", help="include pads with zero revisions, default: False (i.e. pads with no revisions are skipped)")
    p.add_argument("--order", default="padid", help="order, possible values: padid, pad (no group name), lastedited, (number of) authors, revisions; default: padid")
    p.add_argument("--reverse", default=False, action="store_true", help="reverse order, default: False (reverse chrono)")
    p.add_argument("--limit", type=int, default=0, help="limit to number of items, default: 0 (no limit)")
    p.add_argument("--skip", default=None, type=int, help="skip this many items, default: None")
    p.add_argument("--content", default=False, action="store_true", help="rss: include (full) content tag, default: False")
    p.add_argument("--link", default="diffhtml,html,text", help="the link variable points to this version; can be a comma-delimited list, first available wins; default: diffhtml,html,text")
    p.add_argument("--linkbase", default=None, help="base url to use for links, default: try to use the feedurl")
    p.add_argument("--output", default=None, help="output, default: stdout")
    pg = p.add_argument_group('template variables')
    pg.add_argument("--feedurl", default="feed.xml", help="rss: to use as the feed's own (self) link, default: feed.xml")
    pg.add_argument("--siteurl", default=None, help="rss: to use as the channel's site link, default: the etherpad url")
    pg.add_argument("--title", default="etherdump", help="title for document or rss feed channel title, default: etherdump")
    pg.add_argument("--description", default="", help="rss: channel description, default: empty")
    pg.add_argument("--language", default="en-US", help="rss: feed language, default: en-US")
    pg.add_argument("--updatePeriod", default="daily", help="rss: updatePeriod, possible values: hourly, daily, weekly, monthly, yearly; default: daily")
    pg.add_argument("--updateFrequency", default=1, type=int, help="rss: update frequency within the update period (where 2 would mean twice per period); default: 1")
    pg.add_argument("--generator", default="https://gitlab.com/activearchives/etherdump", help="generator, default: https://gitlab.com/activearchives/etherdump")
    pg.add_argument("--timestamp", default=None, help="timestamp, default: now (e.g. 2015-12-01 12:30:00)")
    pg.add_argument("--next", default=None, help="next link, default: None")
    pg.add_argument("--prev", default=None, help="prev link, default: None")
    args = p.parse_args(args)

    tmpath = args.templatepath
    # default path for templates is the built-in data/templates
    if tmpath is None:
        tmpath = os.path.split(os.path.abspath(__file__))[0]
        tmpath = os.path.split(tmpath)[0]
        tmpath = os.path.join(tmpath, "data", "templates")

    env = Environment(loader=FileSystemLoader(tmpath))
    env.filters["excerpt"] = excerpt
    env.filters["datetimeformat"] = datetimeformat
    template = env.get_template(args.template)

    info = loadpadinfo(args.padinfo)

    inputs = args.input
    inputs.sort()
    # Use "base" to strip (longest) extensions
    # inputs = group(inputs, base)

    def wrappath(p):
        path = "./{0}".format(p)
        ext = os.path.splitext(p)[1][1:]
        return {
            "url": path,
            "path": path,
            "code": 200,
            "type": ext
        }
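
    # For example (illustrative), wrappath("foo.raw.txt") would return
    # {"url": "./foo.raw.txt", "path": "./foo.raw.txt", "code": 200,
    #  "type": "txt"} -- the type here is just the last filename
    # extension with the dot stripped.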

    def metaforpaths(paths):
        ret = {}
        pid = base(paths[0])
        ret['pad'] = ret['padid'] = pid
        ret['versions'] = [wrappath(x) for x in paths]
        lastedited = None
        for p in paths:
            mtime = os.stat(p).st_mtime
            if lastedited is None or mtime > lastedited:
                lastedited = mtime
        ret["lastedited_iso"] = datetime.fromtimestamp(lastedited).strftime("%Y-%m-%dT%H:%M:%S")
        # newest modification time across all the given paths
        ret["lastedited_raw"] = lastedited
        return ret
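
    # For example (illustrative), metaforpaths(["notes.txt"]) yields a
    # minimal pad record -- {'pad': 'notes', 'padid': 'notes',
    # 'versions': [...]} -- stamped with the file's modification time,
    # so unmatched files can still be listed alongside real pads.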

    def loadmeta(p):
        # only .meta.json files carry real pad metadata; other files are
        # wrapped later as "dummy" pads via metaforpaths
        if p.endswith(".meta.json"):
            with open(p) as f:
                return json.load(f)

    def fixdates(padmeta):
        d = dateutil.parser.parse(padmeta["lastedited_iso"])
        padmeta["lastedited"] = d
        padmeta["lastedited_822"] = d.strftime("%a, %d %b %Y %H:%M:%S +0000")
        return padmeta
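
    # For example (illustrative), a pad whose lastedited_iso is
    # "2015-12-01T12:30:00" gains lastedited (a datetime) and
    # lastedited_822 ("Tue, 01 Dec 2015 12:30:00 +0000"), the
    # RFC 822-style date format that RSS feeds expect.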

    pads = map(loadmeta, inputs)
    pads = [x for x in pads if x is not None]
    pads = map(fixdates, pads)
    args.pads = list(pads)

    inputs = args.input
    inputs.sort()

    removelist = []

    def has_version(padinfo, path):
        return [x for x in padinfo['versions'] if 'path' in x and x['path'] == "./" + path]

    pads_by_base = {}
    for p in args.pads:
        padbase = os.path.splitext(p['padid'])[0]
        pads_by_base[padbase] = p
    padbases = list(pads_by_base.keys())
    # sort longest first so that the longest base name wins when several
    # pad names share a common prefix
    padbases.sort(key=lambda x: len(x), reverse=True)

    def could_have_base(x, y):
        return x == y or (x.startswith(y) and x[len(y):].startswith("."))
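
    # For example (illustrative), could_have_base("foo.raw.txt", "foo") is
    # True, but could_have_base("foobar.raw.txt", "foo") is False: the
    # remainder after the base must start with a ".", so one pad name
    # being a prefix of another doesn't cause a false match.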

    def get_best_pad(x):
        for pb in padbases:
            p = pads_by_base[pb]
            if could_have_base(x, pb):
                return p

    print("pairing input files with pads", file=sys.stderr)
    for x in inputs:
        # pair the input file with a pad if possible
        xbasename = os.path.basename(x)
        p = get_best_pad(xbasename)
        if p:
            if not has_version(p, x):
                print("Grouping file {0} with pad {1}".format(x, p['padid']), file=sys.stderr)
                p['versions'].append(wrappath(x))
            else:
                print("Skipping existing version {0} ({1})...".format(x, p['padid']), file=sys.stderr)
            removelist.append(x)

    # remove the matched files from the remaining inputs
    for x in removelist:
        inputs.remove(x)

    print("Remaining files:", file=sys.stderr)
    for x in inputs:
        print(x, file=sys.stderr)
    print(file=sys.stderr)
# Add "fake" pads for remaining files
for x in inputs :
args . pads . append ( metaforpaths ( [ x ] ) )
if args . timestamp == None :
args . timestamp = datetime . now ( ) . strftime ( " % Y- % m- %d % H: % M: % S " )
padurlbase = re . sub ( r " api/1.2.9/$ " , " p/ " , info [ " apiurl " ] )
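    # e.g. an apiurl of "http://pad.example.org/api/1.2.9/" becomes the
    # public pad base "http://pad.example.org/p/" (example.org is a
    # placeholder host, illustrative only)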
    args.siteurl = args.siteurl or padurlbase
    args.utcnow = datetime.utcnow().strftime("%a, %d %b %Y %H:%M:%S +0000")

    # order items & apply limit
    if args.order == "lastedited":
        args.pads.sort(key=lambda x: x.get("lastedited_iso"), reverse=args.reverse)
    elif args.order == "pad":
        args.pads.sort(key=lambda x: x.get("pad"), reverse=args.reverse)
    elif args.order == "padid":
        args.pads.sort(key=lambda x: x.get("padid"), reverse=args.reverse)
    elif args.order == "revisions":
        args.pads.sort(key=lambda x: x.get("revisions"), reverse=args.reverse)
    elif args.order == "authors":
        args.pads.sort(key=lambda x: len(x.get("authors")), reverse=args.reverse)
    else:
        raise Exception("ordering not implemented: {0}".format(args.order))

    if args.limit:
        args.pads = args.pads[:args.limit]

    # add versions_by_type and the full text where available,
    # and compute each pad's link (based on args.link)
    linkversions = args.link.split(",")
    linkbase = args.linkbase or url_base(args.feedurl)

    for p in args.pads:
        versions_by_type = {}
        p["versions_by_type"] = versions_by_type
        for v in p["versions"]:
            t = v["type"]
            versions_by_type[t] = v

        if "text" in versions_by_type:
            try:
                with open(versions_by_type["text"]["path"]) as f:
                    p["text"] = f.read()
            except IOError:  # IOError also works on python2, which lacks FileNotFoundError
                p['text'] = ''

        # add a link to the pad as "link", using the first version type
        # from args.link that is actually available
        for v in linkversions:
            if v in versions_by_type:
                vdata = versions_by_type[v]
                try:
                    if v == "pad" or os.path.exists(vdata["path"]):
                        p["link"] = absurl(vdata["url"], linkbase)
                        break
                except KeyError:
                    pass

    if args.output:
        with open(args.output, "w") as f:
            print(template.render(vars(args)), file=f)
    else:
        print(template.render(vars(args)))