#!/usr/bin/env python
from __future__ import print_function
from argparse import ArgumentParser
import sys , json , re , os
from datetime import datetime
from urllib import urlencode , quote
from urllib2 import HTTPError
from common import *
from time import sleep
from html5tidy import html5tidy
import html5lib
from xml . etree import ElementTree as ET
# debugging
# import ElementTree as ET
"""
pull ( meta ) :
Update meta data files for those that have changed .
Check for changed pads by looking at revisions & comparing to existing
todo . . .
use / prefer public interfaces ? ( export functions )
"""
def try_deleting(files):
    """Best-effort removal of every path in *files*.

    A path that does not exist is ignored silently. Any other failure
    (permissions, path is a directory, ...) is reported on stderr, because
    this function is used to purge already-published files and a silent
    failure would leave them in place. The function never raises for a
    failed deletion.

    files -- iterable of filesystem paths to remove
    """
    import errno  # local import: the module header does not import errno

    for path in files:
        try:
            os.remove(path)
        except OSError as err:
            if err.errno != errno.ENOENT:
                # {0!r} keeps the message ASCII-safe for non-ASCII paths.
                sys.stderr.write(
                    "Could not delete {0!r}: {1}\n".format(path, err.strerror))
def main(args):
    """Check which pads changed since the last sync and dump the requested files.

    For every pad (those named on the command line, otherwise all pads the
    API lists) the stored PADID.meta.json is compared with the pad's current
    revision count. Unchanged pads are skipped unless --force is given; pads
    with zero revisions are skipped unless --zerorevs is given. For the
    remaining pads the selected outputs (--meta, --text, --html, --dhtml or
    --all) are written next to the meta path. A pad whose text contains the
    --nopublish magic word has its existing dump files deleted instead.

    args -- list of command line arguments (without the program name)

    A summary line with the number of loaded pads is printed on stderr.
    """
    parser = ArgumentParser("Check for pads that have changed since last sync (according to .meta.json)")
    parser.add_argument("padid", nargs="*", default=[])
    parser.add_argument("--padinfo", default=".etherdump/settings.json", help="settings, default: .etherdump/settings.json")
    parser.add_argument("--zerorevs", default=False, action="store_true", help="include pads with zero revisions, default: False (i.e. pads with no revisions are skipped)")
    parser.add_argument("--pub", default="p", help="folder to store files for public pads, default: p")
    parser.add_argument("--group", default="g", help="folder to store files for group pads, default: g")
    parser.add_argument("--skip", default=None, type=int, help="skip this many items, default: None")
    parser.add_argument("--meta", default=False, action="store_true", help="download meta to PADID.meta.json, default: False")
    parser.add_argument("--text", default=False, action="store_true", help="download text to PADID.txt, default: False")
    parser.add_argument("--html", default=False, action="store_true", help="download html to PADID.html, default: False")
    parser.add_argument("--dhtml", default=False, action="store_true", help="download dhtml to PADID.dhtml, default: False")
    parser.add_argument("--all", default=False, action="store_true", help="download all files (meta, text, html, dhtml), default: False")
    parser.add_argument("--folder", default=False, action="store_true", help="dump files in a folder named PADID (meta, text, html, dhtml), default: False")
    parser.add_argument("--output", default=False, action="store_true", help="output changed padids on stdout")
    parser.add_argument("--force", default=False, action="store_true", help="reload, even if revisions count matches previous")
    parser.add_argument("--nopublish", default="__NOPUBLISH__", help="no publish magic word, default: __NOPUBLISH__")
    args = parser.parse_args(args)

    info = loadpadinfo(args.padinfo)
    apiurl = info['apiurl']
    # Query parameters shared by every API request; 'padID' is set per pad.
    data = {'apikey': info['apikey']}

    def api(call, params):
        """Request API function *call* with *params*; return the decoded JSON."""
        return getjson(apiurl + call + '?' + urlencode(params))

    if args.padid:
        padids = args.padid
    else:
        padids = api('listAllPads', data)['data']['padIDs']
    padids.sort()
    numpads = len(padids)

    # Browser URL prefix of the pads; it does not depend on the pad, so it is
    # computed once instead of once per loop iteration.
    padurlbase = re.sub(r"api/1.2.9/$", "p/", apiurl)
    if isinstance(padurlbase, unicode):
        padurlbase = padurlbase.encode("utf-8")

    count = 0
    for i, padid in enumerate(padids):
        if args.skip is not None and i < args.skip:
            continue
        progressbar(i, numpads, padid)

        padid_utf8 = padid.encode("utf-8")
        data['padID'] = padid_utf8
        pathbase = padpath(padid, args.pub, args.group)
        if args.folder:
            pathbase = os.path.join(pathbase, padid_utf8)
        metapath = pathbase + ".meta.json"

        revisions = None
        tries = 1
        skip = False
        meta = {}

        # Collect the meta data; an HTTPError restarts the whole attempt,
        # and the pad is given up after the third failure.
        while True:
            try:
                if os.path.exists(metapath):
                    with open(metapath) as f:
                        meta.update(json.load(f))
                    revisions = api('getRevisionsCount', data)['data']['revisions']
                    # .get(): a stored meta file without a 'revisions' entry
                    # counts as changed instead of raising KeyError.
                    if meta.get('revisions') == revisions and not args.force:
                        skip = True
                        break

                meta['padid'] = padid_utf8
                versions = meta["versions"] = []
                versions.append({
                    # this quote was really important for dealing with rogue
                    # chars like \xa0 in a padid
                    "url": padurlbase + quote(padid_utf8),
                    "type": "pad",
                    "code": 200,
                })

                if revisions is None:
                    meta['revisions'] = api('getRevisionsCount', data)['data']['revisions']
                else:
                    meta['revisions'] = revisions

                if meta['revisions'] == 0 and not args.zerorevs:
                    skip = True
                    break

                # todo: load more metadata!
                meta['group'], meta['pad'] = splitpadname(padid)
                meta['pathbase'] = pathbase
                meta['lastedited_raw'] = int(api('getLastEdited', data)['data']['lastEdited'])
                # lastEdited is divided by 1000 before being used as a timestamp.
                meta['lastedited_iso'] = datetime.fromtimestamp(int(meta['lastedited_raw']) / 1000).isoformat()
                meta['author_ids'] = api('listAuthorsOfPad', data)['data']['authorIDs']
                break
            except HTTPError:
                tries += 1
                if tries > 3:
                    print("Too many failures ({0}), skipping".format(padid).encode("utf-8"), file=sys.stderr)
                    skip = True
                    break
                else:
                    sleep(3)

        if skip:
            continue

        count += 1

        if args.output:
            print(padid_utf8)

        if args.all or (args.meta or args.text or args.html or args.dhtml):
            # Make sure the target folder exists; failure (typically because
            # it already exists) is ignored.
            try:
                os.makedirs(os.path.split(metapath)[0])
            except OSError:
                pass

        if args.all or args.text:
            text = api('getText', data)
            ver = {"type": "text"}
            versions.append(ver)
            ver["code"] = text["_code"]
            if text["_code"] == 200:
                text = text['data']['text']

                ##########################################
                ## ENFORCE __NOPUBLISH__ MAGIC WORD
                ##########################################
                if args.nopublish and args.nopublish in text:
                    # NEED TO PURGE ANY EXISTING DOCS
                    # NOTE(review): at this point the pad has already been
                    # counted and (with --output) printed above.
                    try_deleting((pathbase + ".raw.txt", pathbase + ".raw.html", pathbase + ".diff.html", pathbase + ".meta.json"))
                    continue

                ver["path"] = pathbase + ".raw.txt"
                ver["url"] = quote(ver["path"])
                with open(ver["path"], "w") as f:
                    f.write(text.encode("utf-8"))
                # once the content is settled, compute a hash
                # and link it in the metadata!

        # <link> entries handed to html5tidy for the generated HTML files.
        # todo, make this process reflect which files actually were made
        versionbaseurl = quote(padid_utf8)
        links = [
            {"href": "/styles.css", "rel": "stylesheet"},
            {"href": versions[0]["url"], "rel": "alternate", "type": "text/html", "title": "Etherpad"},
            {"href": versionbaseurl + ".raw.txt", "rel": "alternate", "type": "text/plain", "title": "Plain text"},
            {"href": versionbaseurl + ".raw.html", "rel": "alternate", "type": "text/html", "title": "HTML"},
            {"href": versionbaseurl + ".diff.html", "rel": "alternate", "type": "text/html", "title": "HTML with author colors"},
            {"href": versionbaseurl + ".meta.json", "rel": "alternate", "type": "application/json", "title": "Meta data"},
            {"href": "/", "rel": "search", "type": "text/html", "title": "Index"},
        ]

        if args.all or args.dhtml:
            # startRev goes into a copy so that it does not leak into the
            # shared parameters used by every later request.
            html = api('createDiffHTML', dict(data, startRev="0"))
            ver = {"type": "diffhtml"}
            versions.append(ver)
            ver["code"] = html["_code"]
            if html["_code"] == 200:
                html = html['data']['html']
                ver["path"] = pathbase + ".diff.html"
                ver["url"] = quote(ver["path"])
                doc = html5lib.parse(html.encode("utf-8"), encoding="utf-8", namespaceHTMLElements=False)
                html5tidy(doc, indent=True, title=padid, scripts="/versions.js", links=links)
                with open(ver["path"], "w") as f:
                    print(ET.tostring(doc, method="html", encoding="utf-8"), file=f)

        # Process text, html, dhtml, all options
        if args.all or args.html:
            html = api('getHTML', data)
            ver = {"type": "html"}
            versions.append(ver)
            ver["code"] = html["_code"]
            if html["_code"] == 200:
                html = html['data']['html']
                ver["path"] = pathbase + ".raw.html"
                ver["url"] = quote(ver["path"])
                doc = html5lib.parse(html, namespaceHTMLElements=False)
                html5tidy(doc, indent=True, title=padid, scripts="/versions.js", links=links)
                with open(ver["path"], "w") as f:
                    print(ET.tostring(doc, method="html", encoding="utf-8"), file=f)

        # output meta
        if args.all or args.meta:
            ver = {"type": "meta"}
            versions.append(ver)
            ver["path"] = metapath
            ver["url"] = quote(metapath)
            with open(metapath, "w") as f:
                json.dump(meta, f, indent=2)

    print("\n{0} pad(s) loaded".format(count), file=sys.stderr)