2015-11-13 11:03:57 +01:00
from __future__ import print_function
from argparse import ArgumentParser
import sys , json , re , os
from datetime import datetime
2018-01-12 14:42:55 +01:00
try :
# python2
from urllib2 import urlopen , URLError , HTTPError
from urllib import urlencode
except ImportError :
# python3
from urllib . parse import urlencode , quote
from urllib . request import urlopen , URLError , HTTPError
from etherdump . commands . common import *
2015-12-03 14:08:25 +01:00
from time import sleep
2018-01-12 14:42:55 +01:00
from etherdump . commands . html5tidy import html5tidy
2016-01-14 18:29:34 +01:00
import html5lib
2018-01-19 12:48:57 +01:00
from xml . etree import ElementTree as ET
from fnmatch import fnmatch
2016-01-15 12:06:21 +01:00
# debugging
# import ElementTree as ET
2015-11-13 11:03:57 +01:00
"""
pull(meta):
    Update meta data files for those pads that have changed.
    Check for changed pads by looking at revisions & comparing to existing.

todo...
use/prefer public interfaces? (export functions)
"""
2016-01-15 16:44:49 +01:00
def try_deleting(files):
    """Best-effort removal of every path in ``files``.

    Never raises: a path that does not exist is skipped silently, while any
    other failure (permissions, path is a directory, ...) is reported on
    stderr so that a file which was supposed to be purged does not stay
    behind unnoticed.

    :param files: iterable of file system paths to remove.
    """
    # Local import keeps this block self-contained; errno exists on both
    # Python 2 and 3 (unlike FileNotFoundError).
    import errno

    for path in files:
        try:
            os.remove(path)
        except OSError as err:
            if err.errno == errno.ENOENT:
                # Nothing to delete, which is the expected common case.
                continue
            print("Could not delete {0}: {1}".format(path, err), file=sys.stderr)
2015-11-13 11:03:57 +01:00
def _api_url(info, method, params):
    """Return the local API URL for ``method`` with ``params`` as its query string."""
    return info['localapiurl'] + method + '?' + urlencode(params)


def _write_tidied_html(path, html, title, script, links):
    """Parse ``html``, run it through html5tidy and write the result to ``path``."""
    doc = html5lib.parse(html, treebuilder="etree", namespaceHTMLElements=False)
    html5tidy(doc, indent=True, title=title, scripts=script, links=links)
    with open(path, "w") as f:
        print(ET.tostring(doc, method="html", encoding="unicode"), file=f)


def main(args):
    """Download metadata, text and HTML dumps for pads that have changed.

    For every selected pad the current revision count is compared with the
    one stored in its ``.meta.json`` file; unchanged pads are skipped unless
    ``--force`` is given. For the remaining pads the requested outputs
    (``--meta``, ``--text``, ``--html``, ``--dhtml`` or ``--all``) are fetched
    through the API described by the settings file and written next to each
    other using a common path base.

    :param args: list of command line arguments, handed to
        ``ArgumentParser.parse_args``.

    Side effects: performs HTTP requests via ``getjson``, creates and deletes
    files, prints changed pad ids on stdout (``--output``) and progress and
    diagnostics on stderr.
    """
    parser = ArgumentParser("Check for pads that have changed since last sync (according to .meta.json)")

    parser.add_argument("padid", nargs="*", default=[])
    parser.add_argument("--glob", default=False, help="download pads matching a glob pattern")
    parser.add_argument("--padinfo", default=".etherdump/settings.json", help="settings, default: .etherdump/settings.json")
    parser.add_argument("--zerorevs", default=False, action="store_true", help="include pads with zero revisions, default: False (i.e. pads with no revisions are skipped)")
    parser.add_argument("--pub", default="p", help="folder to store files for public pads, default: p")
    parser.add_argument("--group", default="g", help="folder to store files for group pads, default: g")
    parser.add_argument("--skip", default=None, type=int, help="skip this many items, default: None")
    parser.add_argument("--meta", default=False, action="store_true", help="download meta to PADID.meta.json, default: False")
    parser.add_argument("--text", default=False, action="store_true", help="download text to PADID.txt, default: False")
    parser.add_argument("--html", default=False, action="store_true", help="download html to PADID.html, default: False")
    parser.add_argument("--dhtml", default=False, action="store_true", help="download dhtml to PADID.diff.html, default: False")
    parser.add_argument("--all", default=False, action="store_true", help="download all files (meta, text, html, dhtml), default: False")
    parser.add_argument("--folder", default=False, action="store_true", help="dump files in a folder named PADID (meta, text, html, dhtml), default: False")
    parser.add_argument("--output", default=False, action="store_true", help="output changed padids on stdout")
    parser.add_argument("--force", default=False, action="store_true", help="reload, even if revisions count matches previous")
    parser.add_argument("--no-raw-ext", default=False, action="store_true", help="save plain text as padname with no (additional) extension")
    parser.add_argument("--fix-names", default=False, action="store_true", help="normalize padid's (no spaces, special control chars) for use in file names")
    parser.add_argument("--filter-ext", default=None, help="filter pads by extension")

    parser.add_argument("--css", default="/styles.css", help="add css url to output pages, default: /styles.css")
    parser.add_argument("--script", default="/versions.js", help="add script url to output pages, default: /versions.js")

    parser.add_argument("--nopublish", default="__NOPUBLISH__", help="no publish magic word, default: __NOPUBLISH__")
    parser.add_argument("--publish", default="__PUBLISH__", help="the publish magic word, default: __PUBLISH__")
    parser.add_argument("--publish-opt-in", default=False, action="store_true", help="ensure `--publish` is honoured instead of `--nopublish`")

    args = parser.parse_args(args)

    raw_ext = "" if args.no_raw_ext else ".raw.txt"

    info = loadpadinfo(args.padinfo)
    # Query parameters shared by every API call; 'padID' is replaced per pad.
    data = {'apikey': info['apikey']}

    if args.padid:
        padids = args.padid
    else:
        padids = getjson(_api_url(info, 'listAllPads', data))['data']['padIDs']
        if args.glob:
            padids = [x for x in padids if fnmatch(x, args.glob)]
    padids.sort()
    numpads = len(padids)

    # Public pad URL prefix, derived from the API URL; identical for all pads.
    padurlbase = re.sub(r"api/1.2.9/$", "p/", info["apiurl"])

    count = 0
    for i, padid in enumerate(padids):
        if args.skip is not None and i < args.skip:
            continue
        progressbar(i, numpads, padid)

        data['padID'] = padid.encode("utf-8")
        pathbase = padpath(padid, args.pub, args.group, args.fix_names)
        if args.folder:
            # Join with the text pad id: os.path.join refuses to mix str and
            # bytes components, so the encoded id must not be used here.
            pathbase = os.path.join(pathbase, padid)
        metapath = pathbase + ".meta.json"

        meta = {}
        tries = 1
        skip = False
        while True:
            try:
                previous = None
                if os.path.exists(metapath):
                    with open(metapath) as f:
                        meta.update(json.load(f))
                    # .get: an older or partial meta file may lack the key.
                    previous = meta.get('revisions')
                revisions = getjson(_api_url(info, 'getRevisionsCount', data))['data']['revisions']
                if previous is not None and previous == revisions and not args.force:
                    skip = True
                    break

                meta['padid'] = padid
                versions = meta["versions"] = []
                versions.append({
                    "url": padurlbase + quote(padid),
                    "type": "pad",
                    "code": 200
                })
                meta['revisions'] = revisions
                if revisions == 0 and not args.zerorevs:
                    skip = True
                    break

                # todo: load more metadata!
                meta['group'], meta['pad'] = splitpadname(padid)
                meta['pathbase'] = pathbase
                meta['lastedited_raw'] = int(getjson(_api_url(info, 'getLastEdited', data))['data']['lastEdited'])
                # lastEdited is divided by 1000 before being used as a timestamp.
                meta['lastedited_iso'] = datetime.fromtimestamp(int(meta['lastedited_raw']) / 1000).isoformat()
                meta['author_ids'] = getjson(_api_url(info, 'listAuthorsOfPad', data))['data']['authorIDs']
                break
            except HTTPError:
                # Retry up to three attempts in total, pausing between them.
                tries += 1
                if tries > 3:
                    print("Too many failures ({0}), skipping".format(padid), file=sys.stderr)
                    skip = True
                    break
                sleep(3)
            except TypeError:
                # Raised when a response has no usable 'data' member to index.
                print("Type Error loading pad {0} (phantom pad?), skipping".format(padid), file=sys.stderr)
                skip = True
                break

        if skip:
            continue

        count += 1

        if args.output:
            print(padid)

        if args.all or (args.meta or args.text or args.html or args.dhtml):
            try:
                os.makedirs(os.path.split(metapath)[0])
            except OSError:
                # Directory already exists (or there is no directory part).
                pass

        # Every file that may exist for this pad; purged when publishing is refused.
        purge = (pathbase + raw_ext, pathbase + ".raw.html", pathbase + ".diff.html", pathbase + ".meta.json")

        if args.all or args.text:
            resp = getjson(_api_url(info, 'getText', data))
            ver = {"type": "text"}
            versions.append(ver)
            ver["code"] = resp["_code"]
            if resp["_code"] == 200:
                text = resp['data']['text']
                # NOTE: at this point the pad has already been counted and,
                # with --output, printed.
                ##########################################
                ## ENFORCE __NOPUBLISH__ MAGIC WORD
                ##########################################
                if args.nopublish and args.nopublish in text:
                    # NEED TO PURGE ANY EXISTING DOCS
                    try_deleting(purge)
                    continue
                ##########################################
                ## ENFORCE __PUBLISH__ MAGIC WORD
                ##########################################
                if args.publish_opt_in and args.publish not in text:
                    try_deleting(purge)
                    continue
                ver["path"] = pathbase + raw_ext
                ver["url"] = quote(ver["path"])
                with open(ver["path"], "w") as f:
                    f.write(text)
                # once the content is settled, compute a hash
                # and link it in the metadata!

        links = []
        if args.css:
            links.append({"href": args.css, "rel": "stylesheet"})
        # todo, make this process reflect which files actually were made
        versionbaseurl = quote(padid)
        links.append({"href": versions[0]["url"], "rel": "alternate", "type": "text/html", "title": "Etherpad"})
        if args.all or args.text:
            links.append({"href": versionbaseurl + raw_ext, "rel": "alternate", "type": "text/plain", "title": "Plain text"})
        if args.all or args.html:
            links.append({"href": versionbaseurl + ".raw.html", "rel": "alternate", "type": "text/html", "title": "HTML"})
        if args.all or args.dhtml:
            links.append({"href": versionbaseurl + ".diff.html", "rel": "alternate", "type": "text/html", "title": "HTML with author colors"})
        if args.all or args.meta:
            links.append({"href": versionbaseurl + ".meta.json", "rel": "alternate", "type": "application/json", "title": "Meta data"})

        if args.all or args.dhtml:
            # Use a copy so that startRev does not leak into later API calls.
            resp = getjson(_api_url(info, 'createDiffHTML', dict(data, startRev="0")))
            ver = {"type": "diffhtml"}
            versions.append(ver)
            ver["code"] = resp["_code"]
            if resp["_code"] == 200:
                try:
                    html = resp['data']['html']
                    ver["path"] = pathbase + ".diff.html"
                    ver["url"] = quote(ver["path"])
                    _write_tidied_html(ver["path"], html, padid, args.script, links)
                except TypeError:
                    # Malformed / incomplete response: record the message
                    # (such as "internal error") in the metadata. The message
                    # is read from the response itself, not from the
                    # extracted html string.
                    ver["message"] = resp.get("message")

        # Process text, html, dhtml, all options
        if args.all or args.html:
            resp = getjson(_api_url(info, 'getHTML', data))
            ver = {"type": "html"}
            versions.append(ver)
            ver["code"] = resp["_code"]
            if resp["_code"] == 200:
                html = resp['data']['html']
                ver["path"] = pathbase + ".raw.html"
                ver["url"] = quote(ver["path"])
                _write_tidied_html(ver["path"], html, padid, args.script, links)

        # output meta
        if args.all or args.meta:
            ver = {"type": "meta"}
            versions.append(ver)
            ver["path"] = metapath
            ver["url"] = quote(metapath)
            with open(metapath, "w") as f:
                json.dump(meta, f, indent=2)

    print("\n{0} pad(s) loaded".format(count), file=sys.stderr)