@@ -1,6 +1,11 @@
 #!/usr/bin/env python
 # License: AGPL
 #
+# todo:
+# Capture exceptions... add HTTP status errors (502) to meta!!
+# so that an eventual index can show the problematic pages!
+# Also: provide links to text only / html versions when diff HTML fails
+#
 from __future__ import print_function
 from etherdump import DATAPATH
@@ -12,6 +17,7 @@ from datetime import datetime
 from xml.etree import cElementTree as ET
 from urllib import urlencode
 from urllib2 import urlopen, HTTPError, URLError
+from time import sleep
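+# nb: sleep is used in the dump loop below to back off between retries of failing API calls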
 # external dependencies (use pip to install these)
 import html5lib, jinja2
@@ -102,6 +108,11 @@ def get_template_env (tpath=None):
     env = jinja2.Environment(loader=loader)
     return env
 
+def get_group_info(gid, info):
+    if 'groups' in info:
+        if gid in info['groups']:
+            return info['groups'][gid]
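+# nb: expects the parsed padinfo settings to carry a "groups" mapping keyed by
+# group id -- shape assumed from usage, e.g.:
+#   {"groups": {"yIRLMysh0PMsCMHc": {"name": "mygroup"}}}
+# (ginfo['name'] is used below as the output subdirectory for the group's pads)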
+
 def main(args):
     p = ArgumentParser("""
        _   _                 _
@@ -117,7 +128,6 @@ def main(args):
     p.add_argument("--verbose", default=False, action="store_true", help="flag for verbose output")
     p.add_argument("--limit", type=int, default=None)
     p.add_argument("--allpads", default=False, action="store_true", help="flag to process all pads")
-    p.add_argument("--spider", default=False, action="store_true", help="flag to spider pads")
     p.add_argument("--templatepath", default=os.path.join(DATAPATH, "templates"), help="directory with templates (override default files)")
     p.add_argument("--colors-template", default="pad_colors.html", help="pad with authorship colors template name: pad_colors.html")
     p.add_argument("--padlink", default=[], action="append", help="give a pad link pattern, example: 'http\:\/\/10\.1\.10\.1/p/(.*)'")
@@ -126,8 +136,11 @@ def main(args):
     p.add_argument("--showurls", default=False, action="store_true", help="flag to display API URLs that are used (to stderr)")
     p.add_argument("--hidepaths", default=False, action="store_true", help="flag to not display paths")
     p.add_argument("--pretend", default=False, action="store_true", help="flag to not actually save")
+    p.add_argument("--linkify", default=False, action="store_true", help="flag to process [[link]] forms (and follow when --spider is used)")
+    p.add_argument("--spider", default=False, action="store_true", help="flag to spider pads (requires --linkify)")
     p.add_argument("--add-images", default=False, action="store_true", help="flag to add image tags")
-    p.add_argument("--authors-css", default="authors.css", help="filename to save collected authorship css (nb: etherdump will overwrite this file!)")
+    p.add_argument("--force", default=False, action="store_true", help="force dump (even if not updated since last dump)")
+    p.add_argument("--authors-css", default=None, help="filename to save collected authorship css (nb: any existing file will be mercilessly overwritten), default: don't accumulate css")
     # TODO css from pad --- ie specify a padid for a stylesheet!!!!!!
     # p.add_argument("--css", default="styles.css", help="padid of stylesheet")
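+    # e.g. a full run combining the new options might look like:
+    #   etherdump --linkify --spider --force --authors-css authors.css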
@@ -175,205 +188,270 @@ def main(args):
         done.add(padid)
         data['padID'] = padid.encode("utf-8")
         if args.verbose:
-            print ("PADID \"{0}\"".format(padid).encode("utf-8"), file=sys.stderr)
+            print (u"PADID \"{0}\"".format(padid).encode("utf-8"), file=sys.stderr)
+
+        # g.yIRLMysh0PMsCMHc$
+        grouppat = re.compile(ur"^g\.(\w+)\$(.+)$")
+        m = grouppat.search(padid)
+        if m:
+            group = m.group(1)
+            ginfo = get_group_info(group, info)
+            if not ginfo:
+                print ("No info for group '{0}', skipping".format(group), file=sys.stderr)
+                continue
+            padid = m.group(2)
+        else:
+            group = None
+            ginfo = None
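+        # nb: group pads have ids like "g.yIRLMysh0PMsCMHc$padname"; the two
+        # capture groups split this into the group id and the bare pad name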
+
         if not args.pretend:
             try:
-                os.makedirs(args.path)
+                if ginfo:
+                    os.makedirs(os.path.join(args.path, ginfo['name']))
+                else:
+                    os.makedirs(args.path)
             except OSError:
                 pass
 
-        try:
-            #                 _
-            #  _ __ ___   ___| |_ __ _
-            # | '_ ` _ \ / _ \ __/ _` |
-            # | | | | | |  __/ || (_| |
-            # |_| |_| |_|\___|\__\__,_|
-
-            meta_url = urlify(padid, ext=".json")
-            meta_out = "{0}/{1}".format(args.path, meta_url.encode("utf-8"))
-            raw_url = urlify(padid, ext=".txt")
-            raw_out = "{0}/{1}".format(args.path, raw_url.encode("utf-8"))
-            colors_url = urlify(padid, ext=".html")
-            colors_out = "{0}/{1}".format(args.path, colors_url.encode("utf-8"))
-
-            if not args.hidepaths:
-                print (meta_out, file=sys.stderr)
-            if not args.pretend:
-                meta = {}
-                meta['padid'] = padid
-                revisions_url = apiurl+'getRevisionsCount?'+urlencode(data)
-                if args.showurls:
-                    print (revisions_url, file=sys.stderr)
-                meta['total_revisions'] = json.load(urlopen(revisions_url))['data']['revisions']
-                lastedited_url = apiurl+'getLastEdited?'+urlencode(data)
-                if args.showurls:
-                    print (lastedited_url, file=sys.stderr)
-                lastedited_raw = json.load(urlopen(lastedited_url))['data']['lastEdited']
-                meta['lastedited_raw'] = lastedited_raw
-                meta['lastedited'] = datetime.fromtimestamp(int(lastedited_raw)/1000).isoformat()
-                # author_ids (unfortunately, this is a list of internal etherpad author ids -- not the names ppl type)
-                authors_url = apiurl+'listAuthorsOfPad?'+urlencode(data)
-                if args.showurls:
-                    print (authors_url, file=sys.stderr)
-                meta['author_ids'] = json.load(urlopen(authors_url))['data']['authorIDs']
-                meta['colors'] = colors_url
-                meta['raw'] = raw_url
-                meta['meta'] = meta_url
-                with open(meta_out, "w") as f:
-                    json.dump(meta, f)
-
-            #  _ __ __ ___      __
-            # | '__/ _` \ \ /\ / /
-            # | | | (_| |\ V  V /
-            # |_|  \__,_| \_/\_/
-
-            if not args.hidepaths:
-                print (raw_out, file=sys.stderr)
-            text_url = apiurl+"getText?"+urlencode(data)
-            if args.showurls:
-                print (text_url, file=sys.stderr)
-            if not args.pretend:
-                rawText = json.load(urlopen(text_url))['data']['text']
-                with open(raw_out, "w") as f:
-                    f.write(rawText.encode("utf-8"))
-
-            #  _     _             _
-            # | |__ | |_ _ __ ___ | |
-            # | '_ \| __| '_ ` _ \| |
-            # | | | | |_| | | | | | |
-            # |_| |_|\__|_| |_| |_|_|
-
-            # todo ? -- regular HTML output
-
-            #            _
-            #   ___ ___ | | ___  _ __ ___
-            #  / __/ _ \| |/ _ \| '__/ __|
-            # | (_| (_) | | (_) | |  \__ \
-            #  \___\___/|_|\___/|_|  |___/
-
-            if not args.hidepaths:
-                print (colors_out, file=sys.stderr)
-            data['startRev'] = "0"
-            colors_url = apiurl+'createDiffHTML?'+urlencode(data)
-            if args.showurls:
-                print (colors_url, file=sys.stderr)
-            html = json.load(urlopen(colors_url))['data']['html']
-            t = html5lib.parse(html, namespaceHTMLElements=False)
-            trim_removed_spans(t)
-            html = ET.tostring(t, method="html")
-
-            # Stage 1: Process as text
-            # Process [[wikilink]] style links
-            # and (optionally) add linked page names to spider todo list
-            html, links = linkify(html)
-            if args.spider:
-                for l in links:
-                    if l not in todo and l not in done:
-                        if l.startswith("http://") or l.startswith("https://"):
-                            if args.verbose:
-                                print ("Ignoring absolute URL in [[ link ]] form", file=sys.stderr)
-                            continue
-                        # if args.verbose:
-                        #     print ("  link: {0}".format(l), file=sys.stderr)
-                        todo.append(l)
-
-            # Stage 2: Process as ElementTree
-            #
-            t = html5lib.parse(html, namespaceHTMLElements=False)
-            # apply linkpats
-            for a in t.findall(".//a"):
-                href = a.attrib.get("href")
-                original_href = href
-                if href:
-                    # if args.verbose:
-                    #     print ("searching for PADLINK: {0}".format(href))
-                    for pat in padlinkpats:
-                        if re.search(pat, href) != None:
-                            # if args.verbose:
-                            #     print ("  found PADLINK: {0}".format(href))
-                            href = re.sub(pat, "\\1.html", href)
-                            padid = filename_to_padid(href)
-                            set_text_contents(a, "[[{0}]]".format(padid))
-                            if padid not in todo and padid not in done:
-                                if args.verbose:
-                                    print ("  link: {0}".format(padid), file=sys.stderr)
-                                todo.append(padid)
-                    # apply linkpats
-                    for s, r in linkpats:
-                        href = re.sub(s, r, href)
-                    if href != original_href:
-                        old_contents = text_contents(a)
-                        # print ("OLD_CONTENTS {0}".format(old_contents))
-                        if old_contents == original_href:
-                            if args.verbose:
-                                print ("  Updating href IN TEXT", file=sys.stderr)
-                            set_text_contents(a, href)
-                        if original_href != href:
-                            if args.verbose:
-                                print ("  Changed href from {0} to {1}".format(original_href, href), file=sys.stderr)
-                            a.attrib['href'] = href
-
-                    # SHOWIMAGES : inject img tag for (local) images
-                    if args.add_images:
-                        ext = os.path.splitext(href)[1].lower().lstrip(".")
-                        if ext in ("png", "gif", "jpeg", "jpg"):
-                            # ap = _parent(a)
-                            print ("Adding img '{0}'".format(href), file=sys.stderr)
-                            img = ET.SubElement(a, "img")
-                            br = ET.SubElement(a, "br")
-                            a.remove(img); a.insert(0, img)
-                            a.remove(br); a.insert(1, br)
-                            img.attrib['src'] = href
-
-            # extract the style tag (with authorship colors)
-            style = t.find(".//style")
-            if style != None:
-                if args.authors_css:
-                    for i in style.text.splitlines():
-                        if len(i):
-                            selector, rule = i.split(' ', 1)
-                            authors_css_rules[selector] = rule
-                    style = ''  # strip the individual style tag from each page (only exports to authors-css file)
-                    # nb: it's up to the template to refer to the authors-css file
-                else:
-                    style = ET.tostring(style, method="html")
-            else:
-                style = ""
-            # and extract the contents of the body
-            html = contents(t.find(".//body"))
-
-            if not args.pretend:
-                with open(colors_out, "w") as f:
-                    # f.write(html.encode("utf-8"))
-                    f.write(colors_template.render(
-                        html = html,
-                        style = style,
-                        revision = meta['total_revisions'],
-                        padid = padid,
-                        timestamp = datetime.now(),
-                        meta_url = meta_url,
-                        raw_url = raw_url,
-                        colors_url = colors_url,
-                        lastedited = meta['lastedited']
-                    ).encode("utf-8"))
-
-            #  _
-            # | | ___   ___  _ __
-            # | |/ _ \ / _ \| '_ \
-            # | | (_) | (_) | |_) |
-            # |_|\___/ \___/| .__/
-            #               |_|
-
-            count += 1
-            if args.limit and count >= args.limit:
-                break
-        except TypeError:
-            print ("ERROR, skipping!", file=sys.stderr)
+        retry = True
+        tries = 1
+        while retry:
+            retry = False
+            try:
+                #                 _
+                #  _ __ ___   ___| |_ __ _
+                # | '_ ` _ \ / _ \ __/ _` |
+                # | | | | | |  __/ || (_| |
+                # |_| |_| |_|\___|\__\__,_|
+
+                meta_url = urlify(padid, ext=".json")
+                raw_url = urlify(padid, ext=".txt")
+                colors_url = urlify(padid, ext=".html")
+
+                if ginfo:
+                    meta_out = "{0}/{1}/{2}".format(args.path, ginfo['name'], meta_url.encode("utf-8"))
+                    raw_out = "{0}/{1}/{2}".format(args.path, ginfo['name'], raw_url.encode("utf-8"))
+                    colors_out = "{0}/{1}/{2}".format(args.path, ginfo['name'], colors_url.encode("utf-8"))
+                else:
+                    meta_out = "{0}/{1}".format(args.path, meta_url.encode("utf-8"))
+                    raw_out = "{0}/{1}".format(args.path, raw_url.encode("utf-8"))
+                    colors_out = "{0}/{1}".format(args.path, colors_url.encode("utf-8"))
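+                # nb: pads that belong to a group are written to a subfolder
+                # named after the group (ginfo['name'])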
+
+                if not args.pretend:
+                    meta = {}
+                    meta['padid'] = padid
+                    revisions_url = apiurl+'getRevisionsCount?'+urlencode(data)
+                    if args.showurls:
+                        print (revisions_url, file=sys.stderr)
+                    meta['total_revisions'] = json.load(urlopen(revisions_url))['data']['revisions']
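+                    # nb: two skip conditions follow: pads with no revisions at
+                    # all, and pads whose revision count matches the previously
+                    # dumped meta (--force bypasses the latter)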
+
+                    # CHECK REVISIONS (against existing meta)
+                    if meta['total_revisions'] == 0:
+                        if args.verbose:
+                            print ("  pad has no revisions, skipping", file=sys.stderr)
+                        continue
+                    if os.path.exists(meta_out):
+                        with open(meta_out) as f:
+                            old_meta = json.load(f)
+                        if not args.force and old_meta['total_revisions'] == meta['total_revisions']:
+                            if args.verbose:
+                                print ("  skipping (up to date)", file=sys.stderr)
+                            continue
+
+                    lastedited_url = apiurl+'getLastEdited?'+urlencode(data)
+                    if args.showurls:
+                        print (lastedited_url, file=sys.stderr)
+                    lastedited_raw = json.load(urlopen(lastedited_url))['data']['lastEdited']
+                    meta['lastedited_raw'] = lastedited_raw
+                    meta['lastedited'] = datetime.fromtimestamp(int(lastedited_raw)/1000).isoformat()
+                    # author_ids (unfortunately, this is a list of internal etherpad author ids -- not the names ppl type)
+                    authors_url = apiurl+'listAuthorsOfPad?'+urlencode(data)
+                    if args.showurls:
+                        print (authors_url, file=sys.stderr)
+                    meta['author_ids'] = json.load(urlopen(authors_url))['data']['authorIDs']
+                    meta['colors'] = colors_url
+                    meta['raw'] = raw_url
+                    meta['meta'] = meta_url
+                    # defer output to LAST STEP (as confirmation)
+
+                #  _ __ __ ___      __
+                # | '__/ _` \ \ /\ / /
+                # | | | (_| |\ V  V /
+                # |_|  \__,_| \_/\_/
+
+                text_url = apiurl+"getText?"+urlencode(data)
+                if args.showurls:
+                    print (text_url, file=sys.stderr)
+                if not args.pretend:
+                    rawText = json.load(urlopen(text_url))['data']['text']
+                    if rawText.strip() == "":
+                        if args.verbose:
+                            print ("  empty text, skipping", file=sys.stderr)
+                        continue
+                    if not args.hidepaths:
+                        print (raw_out, file=sys.stderr)
+                    with open(raw_out, "w") as f:
+                        f.write(rawText.encode("utf-8"))
+
+                #  _     _             _
+                # | |__ | |_ _ __ ___ | |
+                # | '_ \| __| '_ ` _ \| |
+                # | | | | |_| | | | | | |
+                # |_| |_|\__|_| |_| |_|_|
+
+                # todo ? -- regular HTML output
+
+                #            _
+                #   ___ ___ | | ___  _ __ ___
+                #  / __/ _ \| |/ _ \| '__/ __|
+                # | (_| (_) | | (_) | |  \__ \
+                #  \___\___/|_|\___/|_|  |___/
+
+                if not args.hidepaths:
+                    print (colors_out, file=sys.stderr)
+                data['startRev'] = "0"
+                colors_url = apiurl+'createDiffHTML?'+urlencode(data)
+                if args.showurls:
+                    print (colors_url, file=sys.stderr)
+                html = json.load(urlopen(colors_url))['data']['html']
+                t = html5lib.parse(html, namespaceHTMLElements=False)
+                trim_removed_spans(t)
+                html = ET.tostring(t, method="html")
+
+                # Stage 1: Process as text
+                # Process [[wikilink]] style links
+                # and (optionally) add linked page names to spider todo list
+                if args.linkify:
+                    html, links = linkify(html)
+                    if args.spider:
+                        for l in links:
+                            if l not in todo and l not in done:
+                                if l.startswith("http://") or l.startswith("https://"):
+                                    if args.verbose:
+                                        print ("Ignoring absolute URL in [[ link ]] form", file=sys.stderr)
+                                    continue
+                                # if args.verbose:
+                                #     print ("  link: {0}".format(l), file=sys.stderr)
+                                todo.append(l)
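+                # nb: without --linkify the [[...]] forms are left as-is, and
+                # --spider alone has no effect (it only follows [[links]])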
+
+                # Stage 2: Process as ElementTree
+                #
+                t = html5lib.parse(html, namespaceHTMLElements=False)
+                # apply linkpats
+                for a in t.findall(".//a"):
+                    href = a.attrib.get("href")
+                    original_href = href
+                    if href:
+                        # if args.verbose:
+                        #     print ("searching for PADLINK: {0}".format(href))
+                        for pat in padlinkpats:
+                            if re.search(pat, href) != None:
+                                # if args.verbose:
+                                #     print ("  found PADLINK: {0}".format(href))
+                                href = re.sub(pat, "\\1.html", href)
+                                padid = filename_to_padid(href)
+                                set_text_contents(a, "[[{0}]]".format(padid))
+                                if padid not in todo and padid not in done:
+                                    if args.verbose:
+                                        print ("  link: {0}".format(padid), file=sys.stderr)
+                                    todo.append(padid)
+                        # apply linkpats
+                        for s, r in linkpats:
+                            href = re.sub(s, r, href)
+                        if href != original_href:
+                            old_contents = text_contents(a)
+                            # print ("OLD_CONTENTS {0}".format(old_contents))
+                            if old_contents == original_href:
+                                if args.verbose:
+                                    print ("  Updating href IN TEXT", file=sys.stderr)
+                                set_text_contents(a, href)
+                            if original_href != href:
+                                if args.verbose:
+                                    print ("  Changed href from {0} to {1}".format(original_href, href), file=sys.stderr)
+                                a.attrib['href'] = href
+
+                        # SHOWIMAGES : inject img tag for (local) images
+                        if args.add_images:
+                            ext = os.path.splitext(href)[1].lower().lstrip(".")
+                            if ext in ("png", "gif", "jpeg", "jpg"):
+                                # ap = _parent(a)
+                                print ("Adding img '{0}'".format(href), file=sys.stderr)
+                                img = ET.SubElement(a, "img")
+                                br = ET.SubElement(a, "br")
+                                a.remove(img); a.insert(0, img)
+                                a.remove(br); a.insert(1, br)
+                                img.attrib['src'] = href
+
+                # extract the style tag (with authorship colors)
+                style = t.find(".//style")
+                if style != None:
+                    if args.authors_css:
+                        for i in style.text.splitlines():
+                            if len(i):
+                                selector, rule = i.split(' ', 1)
+                                authors_css_rules[selector] = rule
+                        # replace individual style with a ref to the authors-css
+                        style = '<link rel="stylesheet" type="text/css" href="{0}">'.format(args.authors_css)
+                    else:
+                        style = ET.tostring(style, method="html")
+                else:
+                    style = ""
+                # and extract the contents of the body
+                html = contents(t.find(".//body"))
+
+                if not args.pretend:
+                    with open(colors_out, "w") as f:
+                        # f.write(html.encode("utf-8"))
+                        f.write(colors_template.render(
+                            html = html,
+                            style = style,
+                            revision = meta['total_revisions'],
+                            padid = padid,
+                            timestamp = datetime.now(),
+                            meta_url = meta_url,
+                            raw_url = raw_url,
+                            colors_url = colors_url,
+                            lastedited = meta['lastedited']
+                        ).encode("utf-8"))
+
+                    # OUTPUT METADATA (finally)
+                    if not args.hidepaths:
+                        print (meta_out, file=sys.stderr)
+                    with open(meta_out, "w") as f:
+                        json.dump(meta, f)
+
+                #  _
+                # | | ___   ___  _ __
+                # | |/ _ \ / _ \| '_ \
+                # | | (_) | (_) | |_) |
+                # |_|\___/ \___/| .__/
+                #               |_|
+
+                count += 1
+                if args.limit and count >= args.limit:
+                    break
+            # except HTTPError as e:
+            #     retry = True
+            # except TypeError as e:
+            #     print ("TypeError, skipping!", file=sys.stderr)
+            except Exception as e:
+                print ("[{0}] Exception: {1}".format(tries, e), file=sys.stderr)
+                sleep(3)
+                retry = True
+
+            if retry:
+                tries += 1
+                if tries > 5:
+                    print ("GIVING UP", file=sys.stderr)
+                    retry = False
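+            # nb: a pad is dropped after 5 failed tries; per the todo at the top
+            # of the file, recording the HTTP error (e.g. 502) in meta would let
+            # an index flag these problematic pages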
 
     # Write the unified CSS with authors
     if args.authors_css: