#!/usr/bin/env python
# License: AGPL
#
#
# todo:
# Capture exceptions and record HTTP status errors (e.g. 502) in the meta,
# so that an eventual index can flag the problematic pads!
# Also: provide links to text-only / html versions when the diff HTML fails
from __future__ import print_function
from etherdump import DATAPATH

# stdlib
import json, sys, os, re
from argparse import ArgumentParser
from datetime import datetime
from xml.etree import cElementTree as ET
from urllib import urlencode
from urllib2 import urlopen, HTTPError, URLError
from time import sleep
# external dependencies (use pip to install these)
import html5lib, jinja2

def filename_to_padid(t):
    t = t.replace("_", " ")
    t = re.sub(r"\.html$", "", t)
    return t
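# Illustrative example: filename_to_padid("my_pad.html") -> "my pad"
# (roughly the inverse of urlify below)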

def normalize_pad_name(n):
    if '?' in n:
        n = n.split('?', 1)[0]
    if '/' in n:
        n = n.split('/', 1)[0]
    return n
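# Illustrative examples: normalize_pad_name("notes?x=1") and
# normalize_pad_name("notes/timeslider") both give "notes"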

def urlify(t, ext=".html"):
    return t.replace(" ", "_") + ext
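# Illustrative example: urlify("my pad") -> "my_pad.html"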

def linkify(src, urlify=urlify):
    collect = []

    def s(m):
        contents = strip_tags(m.group(1))
        contents = normalize_pad_name(contents)
        collect.append(contents)
        link = urlify(contents)
        # link = link.split("?", 1)[0]
        return "[[<a class=\"wikilink\" href=\"{0}\">{1}</a>]]".format(link, contents)

    # src = re.sub(r"\[\[([\w_\- ,]+?)\]\]", s, src)
    ## question marks are ignored by etherpad, so split/strip them
    ## strip slashes as well (e.g. /timeslider)
    src = re.sub(r"\[\[(.+?)\]\]", s, src)
    return (src, collect)
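# Illustrative example: linkify('see [[my pad]]') returns
#   ('see [[<a class="wikilink" href="my_pad.html">my pad</a>]]', ['my pad'])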

def strip_tags(text):
    return re.sub(r"<.*?>", "", text)

def set_text_contents(element, text):
    """ok this isn't really general, but works for singly wrapped elements"""
    while len(element) == 1:
        element = element[0]
    element.text = text

def text_contents(element):
    return (element.text or '') + ''.join([text_contents(c) for c in element]) + (element.tail or '')

def contents(element, method="html"):
    return (element.text or '') + ''.join([ET.tostring(c, method=method) for c in element])
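# Illustrative example of the three helpers above, assuming an element
# parsed from '<a href="#"><span>hi</span></a>':
#   set_text_contents(a, "bye")  # the innermost text becomes "bye"
#   text_contents(a)             # -> 'bye' (all text, recursively)
#   contents(a)                  # -> '<span>bye</span>' (children as markup)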

def get_parent(tree, elt):
    # ElementTree elements keep no parent pointers, so scan the whole tree
    for parent in tree.iter():
        for child in parent:
            if child == elt:
                return parent

def remove_recursive(tree, elt):
    """Remove element and (any resulting) empty containing elements"""
    p = get_parent(tree, elt)
    if p is not None:
        p.remove(elt)
        if len(p) == 0 and (p.text is None or p.text.strip() == ""):
            # print ("empty parent", p, file=sys.stderr)
            remove_recursive(tree, p)
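# For example, removing the only <span> of an otherwise empty <p> with
# remove_recursive also removes the <p> (and so on upward).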

def trim_removed_spans(t):
    # remove <span class="removed"> and empty parents
    for n in t.findall(".//span[@class='removed']"):
        remove_recursive(t, n)
    # then strip any leading br's from body
    while True:
        tag = t.find("./body")[0]
        if tag.tag == "br":
            remove_recursive(t, tag)
        else:
            break

def get_template_env(tpath=None):
    paths = []
    if tpath and os.path.isdir(tpath):
        paths.append(tpath)
    # paths.append(TEMPLATES_PATH)
    loader = jinja2.FileSystemLoader(paths)
    env = jinja2.Environment(loader=loader)
    return env
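# Typical use (mirrored in main below):
#   env = get_template_env(args.templatepath)
#   colors_template = env.get_template("pad_colors.html")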

def get_group_info(gid, info):
    if 'groups' in info:
        if gid in info['groups']:
            return info['groups'][gid]
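# For example, with info loaded from a padinfo.json containing
# {"groups": {"yIRLMysh0PMsCMHc": {"name": "mygroup"}}} (illustrative values),
# get_group_info("yIRLMysh0PMsCMHc", info) returns {"name": "mygroup"}.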

def main(args):
    p = ArgumentParser("""
      _   _                   _
  ___| |_| |__   ___ _ __ __| |_   _ _ __ ___  _ __
 / _ \ __| '_ \ / _ \ '__/ _` | | | | '_ ` _ \| '_ \ 
|  __/ |_| | | |  __/ | | (_| | |_| | | | | | | |_) |
 \___|\__|_| |_|\___|_|  \__,_|\__,_|_| |_| |_| .__/
                                              |_|
""")
p . add_argument ( " padid " , default = [ ] , nargs = " * " , help = " the padid(s) to process " )
p . add_argument ( " --padinfo " , default = " padinfo.json " , help = " JSON file with login data for the pad (url, apikey etc), default: padinfo.json " )
p . add_argument ( " --path " , default = " output " , help = " path to save files, default: output " )
p . add_argument ( " --verbose " , default = False , action = " store_true " , help = " flag for verbose output " )
p . add_argument ( " --limit " , type = int , default = None )
p . add_argument ( " --allpads " , default = False , action = " store_true " , help = " flag to process all pads " )
p . add_argument ( " --templatepath " , default = os . path . join ( DATAPATH , " templates " ) , help = " directory with templates (override default files) " )
p . add_argument ( " --colors-template " , default = " pad_colors.html " , help = " pad with authorship colors template name: pad_colors.html " )
p . add_argument ( " --padlink " , default = [ ] , action = " append " , help = " give a pad link pattern, example: ' http \ : \ / \ /10 \ .1 \ .10 \ .1/p/(.*) ' " )
p . add_argument ( " --linksearch " , default = [ ] , action = " append " , help = " specify a link pattern to search for " )
p . add_argument ( " --linkreplace " , default = [ ] , action = " append " , help = " specify a replacement pattern to replace preceding linksearch " )
p . add_argument ( " --showurls " , default = False , action = " store_true " , help = " flag to display API URLs that are used (to stderr) " )
p . add_argument ( " --hidepaths " , default = False , action = " store_true " , help = " flag to not display paths " )
p . add_argument ( " --pretend " , default = False , action = " store_true " , help = " flag to not actually save " )
p . add_argument ( " --linkify " , default = False , action = " store_true " , help = " flag to process [[link]] forms (and follow when --spider is used) " )
p . add_argument ( " --spider " , default = False , action = " store_true " , help = " flag to spider pads (requires --linkify) " )
p . add_argument ( " --add-images " , default = False , action = " store_true " , help = " flag to add image tags " )
p . add_argument ( " --force " , default = False , action = " store_true " , help = " force dump (even if not updated since last dump) " )
p . add_argument ( " --authors-css " , default = None , help = " filename to save collected authorship css (nb: any existing file will be mercilessly overwritten), default: don ' t accumulate css " )
    # TODO: css from pad --- i.e. specify a padid for a stylesheet
    # p.add_argument("--css", default="styles.css", help="padid of stylesheet")
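    # The file named by --padinfo is assumed to look something like this
    # (illustrative values only; "padlink" and "groups" are optional):
    #
    # {
    #     "protocol": "http",
    #     "hostname": "localhost",
    #     "port": 9001,
    #     "apiurl": "/api/",
    #     "apiversion": "1.2.9",
    #     "apikey": "...",
    #     "padlink": ["http://localhost:9001/p/(.*)"],
    #     "groups": {"yIRLMysh0PMsCMHc": {"name": "mygroup"}}
    # }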
    args = p.parse_args(args)

    with open(args.padinfo) as f:
        info = json.load(f)
    apiurl = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info)
    # padlinkpats are for mapping internal pad links,
    # linkpats are any other link replacements; both are regexps
    padlinkpats = []
    linkpats = []  # [(pat, "\\1.html") for pat in padlinkpats]
    linkpats.extend(zip(args.linksearch, args.linkreplace))
    if "padlink" in info:
        if isinstance(info['padlink'], list):
            padlinkpats.extend(info['padlink'])
        else:
            padlinkpats.append(info['padlink'])
    padlinkpats.extend(args.padlink)
    env = get_template_env(args.templatepath)
    colors_template = env.get_template(args.colors_template)

    todo = args.padid
    done = set()
    count = 0
    data = {}
    authors_css_rules = {}
    data['apikey'] = info['apikey']

    if args.allpads:
        # push the list of all pad names on to todo
        list_url = apiurl + 'listAllPads?' + urlencode(data)
        if args.showurls:
            print(list_url, file=sys.stderr)
        results = json.load(urlopen(list_url))['data']['padIDs']
        todo.extend(results)
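        # (the API URLs built this way look something like, with a
        #  hypothetical host and key:
        #  http://localhost:9001/api/1.2.9/listAllPads?apikey=...)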

    while len(todo) > 0:
        padid = todo[0]
        todo = todo[1:]
        done.add(padid)
        data['padID'] = padid.encode("utf-8")
        if args.verbose:
            print(u"PADID \"{0}\"".format(padid).encode("utf-8"), file=sys.stderr)

        # group pads have ids like "g.yIRLMysh0PMsCMHc$padname":
        # strip off the group part and look up the group's info
        grouppat = re.compile(ur"^g\.(\w+)\$(.+)$")
        m = grouppat.search(padid)
        if m:
            group = m.group(1)
            ginfo = get_group_info(group, info)
            if not ginfo:
                print("No info for group '{0}', skipping".format(group), file=sys.stderr)
                continue
            padid = m.group(2)
        else:
            group = None
            ginfo = None
        if not args.pretend:
            try:
                if ginfo:
                    os.makedirs(os.path.join(args.path, ginfo['name']))
                else:
                    os.makedirs(args.path)
            except OSError:
                # output directory already exists
                pass

        retry = True
        tries = 1
        while retry:
            retry = False
            try:
                #                 _
                #  _ __ ___   ___| |_ __ _
                # | '_ ` _ \ / _ \ __/ _` |
                # | | | | | |  __/ || (_| |
                # |_| |_| |_|\___|\__\__,_|
2015-09-17 18:23:18 +02:00
2015-09-19 11:43:16 +02:00
                meta_url = urlify(padid, ext=".json")
                raw_url = urlify(padid, ext=".txt")
                colors_url = urlify(padid, ext=".html")

                if ginfo:
                    meta_out = "{0}/{1}/{2}".format(args.path, ginfo['name'], meta_url.encode("utf-8"))
                    raw_out = "{0}/{1}/{2}".format(args.path, ginfo['name'], raw_url.encode("utf-8"))
                    colors_out = "{0}/{1}/{2}".format(args.path, ginfo['name'], colors_url.encode("utf-8"))
                else:
                    meta_out = "{0}/{1}".format(args.path, meta_url.encode("utf-8"))
                    raw_out = "{0}/{1}".format(args.path, raw_url.encode("utf-8"))
                    colors_out = "{0}/{1}".format(args.path, colors_url.encode("utf-8"))

                if not args.pretend:
                    meta = {}
                    meta['padid'] = padid
                    revisions_url = apiurl + 'getRevisionsCount?' + urlencode(data)
                    if args.showurls:
                        print(revisions_url, file=sys.stderr)
                    meta['total_revisions'] = json.load(urlopen(revisions_url))['data']['revisions']

                    # CHECK REVISIONS (against existing meta)
                    if meta['total_revisions'] == 0:
                        if args.verbose:
                            print("pad has no revisions, skipping", file=sys.stderr)
                        continue
                    if os.path.exists(meta_out):
                        with open(meta_out) as f:
                            old_meta = json.load(f)
                        if not args.force and old_meta['total_revisions'] == meta['total_revisions']:
                            if args.verbose:
                                print("skipping (up to date)", file=sys.stderr)
                            continue

                    lastedited_url = apiurl + 'getLastEdited?' + urlencode(data)
                    if args.showurls:
                        print(lastedited_url, file=sys.stderr)
                    lastedited_raw = json.load(urlopen(lastedited_url))['data']['lastEdited']
                    meta['lastedited_raw'] = lastedited_raw
                    # lastEdited is a unix timestamp in milliseconds
                    meta['lastedited'] = datetime.fromtimestamp(int(lastedited_raw) / 1000).isoformat()

                    # author_ids (unfortunately, this is a list of internal etherpad author ids -- not the names people type)
                    authors_url = apiurl + 'listAuthorsOfPad?' + urlencode(data)
                    if args.showurls:
                        print(authors_url, file=sys.stderr)
                    meta['author_ids'] = json.load(urlopen(authors_url))['data']['authorIDs']

                    meta['colors'] = colors_url
                    meta['raw'] = raw_url
                    meta['meta'] = meta_url

                # defer output to LAST STEP (as confirmation)

                #  _ __ __ ___      __
                # | '__/ _` \ \ /\ / /
                # | | | (_| |\ V  V /
                # |_|  \__,_| \_/\_/

                text_url = apiurl + "getText?" + urlencode(data)
                if args.showurls:
                    print(text_url, file=sys.stderr)
                if not args.pretend:
                    rawText = json.load(urlopen(text_url))['data']['text']
                    if rawText.strip() == "":
                        if args.verbose:
                            print("empty text, skipping", file=sys.stderr)
                        continue
                    if not args.hidepaths:
                        print(raw_out, file=sys.stderr)
                    with open(raw_out, "w") as f:
                        f.write(rawText.encode("utf-8"))

                #  _     _             _
                # | |__ | |_ _ __ ___ | |
                # | '_ \| __| '_ ` _ \| |
                # | | | | |_| | | | | | |
                # |_| |_|\__|_| |_| |_|_|

                # todo? -- regular HTML output

                #            _
                #   ___ ___ | | ___  _ __ ___
                #  / __/ _ \| |/ _ \| '__/ __|
                # | (_| (_) | | (_) | |  \__ \
                #  \___\___/|_|\___/|_|  |___/
                if not args.hidepaths:
                    print(colors_out, file=sys.stderr)
                data['startRev'] = "0"
                colors_url = apiurl + 'createDiffHTML?' + urlencode(data)
                if args.showurls:
                    print(colors_url, file=sys.stderr)
                html = json.load(urlopen(colors_url))['data']['html']
                t = html5lib.parse(html, namespaceHTMLElements=False)
                trim_removed_spans(t)
                html = ET.tostring(t, method="html")
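                # createDiffHTML returns the pad body marked up with per-author
                # spans (styled by the <style> block extracted further down);
                # text deleted over the pad's history comes back wrapped in
                # <span class="removed">...</span>, which trim_removed_spans
                # just stripped.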

                # Stage 1: Process as text
                # process [[wikilink]] style links and (optionally) add
                # the linked page names to the spider's todo list
                if args.linkify:
                    html, links = linkify(html)
                    if args.spider:
                        for l in links:
                            if l not in todo and l not in done:
                                if l.startswith("http://") or l.startswith("https://"):
                                    if args.verbose:
                                        print("Ignoring absolute URL in [[ link ]] form", file=sys.stderr)
                                    continue
                                # if args.verbose:
                                #     print ("  link: {0}".format(l), file=sys.stderr)
                                todo.append(l)

                # Stage 2: Process as ElementTree
                t = html5lib.parse(html, namespaceHTMLElements=False)
                # apply padlinkpats: turn links to other pads into wikilinks
                for a in t.findall(".//a"):
                    href = a.attrib.get("href")
                    original_href = href
                    if href:
                        # if args.verbose:
                        #     print ("searching for PADLINK: {0}".format(href))
                        for pat in padlinkpats:
                            if re.search(pat, href) is not None:
                                # if args.verbose:
                                #     print ("  found PADLINK: {0}".format(href))
                                href = re.sub(pat, "\\1.html", href)
                                # don't clobber the outer padid: this is the linked pad
                                linked_padid = filename_to_padid(href)
                                set_text_contents(a, "[[{0}]]".format(linked_padid))
                                if linked_padid not in todo and linked_padid not in done:
                                    if args.verbose:
                                        print("  link: {0}".format(linked_padid), file=sys.stderr)
                                    todo.append(linked_padid)
                        # apply linkpats
                        for s, r in linkpats:
                            href = re.sub(s, r, href)

                        if href != original_href:
                            old_contents = text_contents(a)
                            # print ("OLD_CONTENTS {0}".format(old_contents))
                            if old_contents == original_href:
                                if args.verbose:
                                    print("  Updating href IN TEXT", file=sys.stderr)
                                set_text_contents(a, href)
                            if args.verbose:
                                print("  Changed href from {0} to {1}".format(original_href, href), file=sys.stderr)
                            a.attrib['href'] = href

                        # SHOWIMAGES: inject img tag for (local) images
                        if args.add_images:
                            ext = os.path.splitext(href)[1].lower().lstrip(".")
                            if ext in ("png", "gif", "jpeg", "jpg"):
                                # ap = _parent(a)
                                print("Adding img '{0}'".format(href), file=sys.stderr)
                                img = ET.SubElement(a, "img")
                                br = ET.SubElement(a, "br")
                                # SubElement appends, so move the img & br to the front of the link
                                a.remove(img); a.insert(0, img)
                                a.remove(br); a.insert(1, br)
                                img.attrib['src'] = href

                # extract the style tag (with authorship colors)
                style = t.find(".//style")
                if style is not None:
                    if args.authors_css:
                        # collect this pad's author styles into the shared rules
                        for i in style.text.splitlines():
                            if len(i):
                                selector, rule = i.split(' ', 1)
                                authors_css_rules[selector] = rule
                        # replace the individual style with a ref to the authors-css
                        style = '<link rel="stylesheet" type="text/css" href="{0}">'.format(args.authors_css)
                    else:
                        style = ET.tostring(style, method="html")
                else:
                    style = ""

                # and extract the contents of the body
                html = contents(t.find(".//body"))

                if not args.pretend:
                    with open(colors_out, "w") as f:
                        # f.write(html.encode("utf-8"))
                        f.write(colors_template.render(
                            html=html,
                            style=style,
                            revision=meta['total_revisions'],
                            padid=padid,
                            timestamp=datetime.now(),
                            meta_url=meta_url,
                            raw_url=raw_url,
                            colors_url=colors_url,
                            lastedited=meta['lastedited']
                        ).encode("utf-8"))

                    # OUTPUT METADATA (finally, as confirmation of a completed dump)
                    if not args.hidepaths:
                        print(meta_out, file=sys.stderr)
                    with open(meta_out, "w") as f:
                        json.dump(meta, f)

                #  _
                # | | ___   ___  _ __
                # | |/ _ \ / _ \| '_ \
                # | | (_) | (_) | |_) |
                # |_|\___/ \___/| .__/
                #               |_|
                count += 1
                if args.limit and count >= args.limit:
                    break

            # except HTTPError as e:
            #     retry = True
            # except TypeError as e:
            #     print ("TypeError, skipping!", file=sys.stderr)
            except Exception as e:
                print("[{0}] Exception: {1}".format(tries, e), file=sys.stderr)
                sleep(3)
                retry = True

            if retry:
                tries += 1
                if tries > 5:
                    print("GIVING UP", file=sys.stderr)
                    retry = False

    # Write the unified CSS with authors
    if args.authors_css:
        authors_css_path = os.path.join(args.path, args.authors_css)
        print(authors_css_path, file=sys.stderr)
        with open(authors_css_path, 'w') as css:
            for selector, rule in sorted(authors_css_rules.items()):
                css.write(selector + ' ' + rule + '\n')
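
# Example invocations, assuming a console script or wrapper passes
# sys.argv[1:] to main() (pad and file names are hypothetical):
#   etherdump mypad                                # dump a single pad
#   etherdump --allpads --authors-css authors.css  # dump everything, shared css
#   etherdump start --linkify --spider             # follow [[links]] from "start"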