@@ -1,18 +1,20 @@
 """Check for pads that have changed since last sync (according to .meta.json)"""
 
 import json
+import math
 import os
 import re
 import sys
 from argparse import ArgumentParser
 from datetime import datetime
 from fnmatch import fnmatch
-from time import sleep
 from urllib.parse import quote, urlencode
 from urllib.request import HTTPError
 from xml.etree import ElementTree as ET
 
+import asks
 import html5lib
+import trio
 from tqdm import tqdm
 
 from etherpump.commands.common import *  # noqa
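
The hunk above swaps the blocking HTTP/time stack for asks and trio. The awaitable agetjson helper used throughout the new code arrives via the star import from etherpump.commands.common; the following is a minimal hypothetical sketch of such a helper, assuming asks' Session.get() returns a response exposing .json() and .status_code (the real implementation lives in common.py):

    # Hypothetical sketch, not the actual etherpump helper.
    async def agetjson(session, url):
        response = await session.get(url)
        result = response.json()
        # Assumption: "_code" mirrors how the sync getjson records HTTP status.
        result["_code"] = response.status_code
        return result
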
@@ -36,170 +38,160 @@ def try_deleting(files):
             pass
 
 
-def main(args):
-    p = ArgumentParser(
+def build_argument_parser(args):
+    parser = ArgumentParser(
         "Check for pads that have changed since last sync (according to .meta.json)"
     )
-    p.add_argument("padid", nargs="*", default=[])
-    p.add_argument(
+    parser.add_argument("padid", nargs="*", default=[])
+    parser.add_argument(
         "--glob", default=False, help="download pads matching a glob pattern"
     )
-    p.add_argument(
+    parser.add_argument(
         "--padinfo",
         default=".etherpump/settings.json",
         help="settings, default: .etherpump/settings.json",
     )
-    p.add_argument(
+    parser.add_argument(
         "--zerorevs",
         default=False,
         action="store_true",
         help="include pads with zero revisions, default: False (i.e. pads with no revisions are skipped)",
     )
-    p.add_argument(
+    parser.add_argument(
         "--pub",
         default="p",
         help="folder to store files for public pads, default: p",
     )
-    p.add_argument(
+    parser.add_argument(
         "--group",
         default="g",
         help="folder to store files for group pads, default: g",
     )
-    p.add_argument(
+    parser.add_argument(
         "--skip",
         default=None,
         type=int,
         help="skip this many items, default: None",
     )
-    p.add_argument(
+    parser.add_argument(
+        "--connection",
+        default=5,
+        type=int,
+        help="number of connections to run concurrently",
+    )
+    parser.add_argument(
         "--meta",
         default=False,
         action="store_true",
         help="download meta to PADID.meta.json, default: False",
     )
-    p.add_argument(
+    parser.add_argument(
         "--text",
         default=False,
         action="store_true",
         help="download text to PADID.txt, default: False",
     )
-    p.add_argument(
+    parser.add_argument(
         "--html",
         default=False,
         action="store_true",
         help="download html to PADID.html, default: False",
     )
-    p.add_argument(
+    parser.add_argument(
         "--dhtml",
         default=False,
         action="store_true",
         help="download dhtml to PADID.diff.html, default: False",
     )
-    p.add_argument(
+    parser.add_argument(
         "--all",
         default=False,
         action="store_true",
         help="download all files (meta, text, html, dhtml), default: False",
     )
-    p.add_argument(
+    parser.add_argument(
         "--folder",
         default=False,
         action="store_true",
         help="dump files in a folder named PADID (meta, text, html, dhtml), default: False",
     )
-    p.add_argument(
+    parser.add_argument(
         "--output",
         default=False,
         action="store_true",
         help="output changed padids on stdout",
     )
-    p.add_argument(
+    parser.add_argument(
         "--force",
         default=False,
         action="store_true",
         help="reload, even if revisions count matches previous",
     )
-    p.add_argument(
+    parser.add_argument(
         "--no-raw-ext",
         default=False,
         action="store_true",
         help="save plain text as padname with no (additional) extension",
     )
-    p.add_argument(
+    parser.add_argument(
         "--fix-names",
         default=False,
         action="store_true",
         help="normalize padid's (no spaces, special control chars) for use in file names",
     )
-    p.add_argument(
+    parser.add_argument(
         "--filter-ext", default=None, help="filter pads by extension"
     )
-    p.add_argument(
+    parser.add_argument(
         "--css",
         default="/styles.css",
         help="add css url to output pages, default: /styles.css",
     )
-    p.add_argument(
+    parser.add_argument(
         "--script",
         default="/versions.js",
         help="add script url to output pages, default: /versions.js",
     )
-    p.add_argument(
+    parser.add_argument(
         "--nopublish",
         default="__NOPUBLISH__",
         help="no publish magic word, default: __NOPUBLISH__",
     )
-    p.add_argument(
+    parser.add_argument(
         "--publish",
         default="__PUBLISH__",
         help="the publish magic word, default: __PUBLISH__",
     )
-    p.add_argument(
+    parser.add_argument(
         "--publish-opt-in",
         default=False,
         action="store_true",
         help="ensure `--publish` is honoured instead of `--nopublish`",
     )
-    args = p.parse_args(args)
-    raw_ext = ".raw.txt"
-    if args.no_raw_ext:
-        raw_ext = ""
-    info = loadpadinfo(args.padinfo)
-    data = {}
-    data['apikey'] = info['apikey']
+    return parser
+
+
+async def get_padids(args, info, data, session):
     if args.padid:
         padids = args.padid
     elif args.glob:
-        padids = getjson(
-            info['localapiurl'] + 'listAllPads?' + urlencode(data)
-        )['data']['padIDs']
+        url = info['localapiurl'] + 'listAllPads?' + urlencode(data)
+        padids = await agetjson(session, url)
+        padids = padids['data']['padIDs']
         padids = [x for x in padids if fnmatch(x, args.glob)]
     else:
-        padids = getjson(
-            info['localapiurl'] + 'listAllPads?' + urlencode(data)
-        )['data']['padIDs']
+        url = info['localapiurl'] + 'listAllPads?' + urlencode(data)
+        padids = await agetjson(session, url)
+        padids = padids['data']['padIDs']
+
     padids.sort()
-    numpads = len(padids)
-    # maxmsglen = 0
-    count = 0
-    progress_kwargs = {}
-    if not istty():
-        progress_kwargs.update(dict(disable=True))
-    progress_pads = tqdm(iterable=padids, total=len(padids), **progress_kwargs)
-    for i, padid in enumerate(progress_pads):
-        if args.skip != None and i < args.skip:
-            continue
+    return padids
+
+
+async def handle_pad(args, padid, data, info, session):
+    raw_ext = ".raw.txt"
+    if args.no_raw_ext:
+        raw_ext = ""
+
     data['padID'] = padid
     p = padpath(padid, args.pub, args.group, args.fix_names)
@@ -218,11 +210,11 @@ def main(args):
             if os.path.exists(metapath):
                 with open(metapath) as f:
                     meta.update(json.load(f))
-                revisions = getjson(
-                    info['localapiurl']
-                    + 'getRevisionsCount?'
-                    + urlencode(data)
-                )['data']['revisions']
+                url = (
+                    info['localapiurl'] + 'getRevisionsCount?' + urlencode(data)
+                )
+                response = await agetjson(session, url)
+                revisions = response['data']['revisions']
             if meta['revisions'] == revisions and not args.force:
                 skip = True
                 break
@@ -230,19 +222,15 @@ def main(args):
             meta['padid'] = padid
             versions = meta["versions"] = []
             versions.append(
-                {
-                    "url": padurlbase + quote(padid),
-                    "type": "pad",
-                    "code": 200,
-                }
+                {"url": padurlbase + quote(padid), "type": "pad", "code": 200,}
             )
 
-            if revisions == None:
-                meta['revisions'] = getjson(
-                    info['localapiurl']
-                    + 'getRevisionsCount?'
-                    + urlencode(data)
-                )['data']['revisions']
+            if revisions is None:
+                url = (
+                    info['localapiurl'] + 'getRevisionsCount?' + urlencode(data)
+                )
+                response = await agetjson(session, url)
+                meta['revisions'] = response['data']['revisions']
             else:
                 meta['revisions'] = revisions
 
@@ -253,17 +241,19 @@ def main(args):
             # todo: load more metadata!
             meta['group'], meta['pad'] = splitpadname(padid)
             meta['pathbase'] = p
-            meta['lastedited_raw'] = int(
-                getjson(
-                    info['localapiurl'] + 'getLastEdited?' + urlencode(data)
-                )['data']['lastEdited']
-            )
+
+            url = info['localapiurl'] + 'getLastEdited?' + urlencode(data)
+            response = await agetjson(session, url)
+            meta['lastedited_raw'] = int(response['data']['lastEdited'])
+
             meta['lastedited_iso'] = datetime.fromtimestamp(
                 int(meta['lastedited_raw']) / 1000
             ).isoformat()
-            meta['author_ids'] = getjson(
-                info['localapiurl'] + 'listAuthorsOfPad?' + urlencode(data)
-            )['data']['authorIDs']
+
+            url = info['localapiurl'] + 'listAuthorsOfPad?' + urlencode(data)
+            response = await agetjson(session, url)
+            meta['author_ids'] = response['data']['authorIDs']
+
             break
         except HTTPError as e:
             tries += 1
@@ -275,7 +265,7 @@ def main(args):
                 skip = True
                 break
             else:
-                sleep(3)
+                await trio.sleep(3)
         except TypeError as e:
             print(
                 "Type Error loading pad {0} (phantom pad?), skipping".format(
@@ -287,9 +277,7 @@ def main(args):
             break
 
     if skip:
-        continue
+        return
 
-    count += 1
-
     if args.output:
         print(padid)
@@ -301,7 +289,8 @@ def main(args):
            pass
 
    if args.all or args.text:
-        text = getjson(info['localapiurl'] + 'getText?' + urlencode(data))
+        url = info['localapiurl'] + 'getText?' + urlencode(data)
+        text = await agetjson(session, url)
        ver = {"type": "text"}
        versions.append(ver)
        ver["code"] = text["_code"]
@@ -321,7 +310,7 @@ def main(args):
                    p + ".meta.json",
                )
            )
-            continue
+            return
 
        ##########################################
        ## ENFORCE __PUBLISH__ MAGIC WORD
@@ -335,7 +324,7 @@ def main(args):
                    p + ".meta.json",
                )
            )
-            continue
+            return
 
        ver["path"] = p + raw_ext
        ver["url"] = quote(ver["path"])
@@ -394,13 +383,10 @@ def main(args):
                }
            )
-        # links.append({"href":"/", "rel":"search", "type":"text/html", "title":"Index"})
 
        if args.all or args.dhtml:
            data['startRev'] = "0"
-            html = getjson(
-                info['localapiurl'] + 'createDiffHTML?' + urlencode(data)
-            )
+            url = info['localapiurl'] + 'createDiffHTML?' + urlencode(data)
+            html = await agetjson(session, url)
            ver = {"type": "diffhtml"}
            versions.append(ver)
            ver["code"] = html["_code"]
@@ -409,7 +395,6 @@ def main(args):
            html = html['data']['html']
            ver["path"] = p + ".diff.html"
            ver["url"] = quote(ver["path"])
-            # doc = html5lib.parse(html, treebuilder="etree", override_encoding="utf-8", namespaceHTMLElements=False)
            doc = html5lib.parse(
                html, treebuilder="etree", namespaceHTMLElements=False
            )
@@ -426,14 +411,12 @@ def main(args):
                        file=f,
                    )
            except TypeError:
+                # Malformed / incomplete response, record the message (such as "internal error") in the metadata and write NO file!
                ver["message"] = html["message"]
-                # with open(ver["path"], "w") as f:
-                #     print ("""<pre>{0}</pre>""".format(json.dumps(html, indent=2)), file=f)
 
        # Process text, html, dhtml, all options
        if args.all or args.html:
-            html = getjson(info['localapiurl'] + 'getHTML?' + urlencode(data))
+            url = info['localapiurl'] + 'getHTML?' + urlencode(data)
+            html = await agetjson(session, url)
            ver = {"type": "html"}
            versions.append(ver)
            ver["code"] = html["_code"]
@@ -445,16 +428,11 @@ def main(args):
                html, treebuilder="etree", namespaceHTMLElements=False
            )
            html5tidy(
-                doc,
-                indent=True,
-                title=padid,
-                scripts=args.script,
-                links=links,
+                doc, indent=True, title=padid, scripts=args.script, links=links,
            )
            with open(ver["path"], "w") as f:
                print(
-                    ET.tostring(doc, method="html", encoding="unicode"),
-                    file=f,
+                    ET.tostring(doc, method="html", encoding="unicode"), file=f,
                )
 
        # output meta
@@ -465,3 +443,35 @@ def main(args):
        ver["url"] = quote(metapath)
        with open(metapath, "w") as f:
            json.dump(meta, f, indent=2)
+
+
+async def handle_pad_chunk(args, padids, data, info, session):
+    progress_kwargs = {}
+    if not istty():
+        progress_kwargs.update(dict(disable=True))
+    padids = tqdm(iterable=padids, total=len(padids), **progress_kwargs,)
+
+    for padid in padids:
+        await handle_pad(args, padid, data, info, session)
+
+
+async def handle_pads(args):
+    session = asks.Session(connections=args.connection)
+    info = loadpadinfo(args.padinfo)
+    data = {'apikey': info['apikey']}
+
+    padids = await get_padids(args, info, data, session)
+    if args.skip:
+        padids = padids[args.skip : len(padids)]
+
+    CHUNK_SIZE = math.ceil(len(padids) / 3)
+    async with trio.open_nursery() as nursery:
+        for padids in chunks(padids, CHUNK_SIZE):
+            _args = (args, padids, data, info, session)
+            nursery.start_soon(handle_pad_chunk, *_args)
+
+
+def main(args):
+    p = build_argument_parser(args)
+    args = p.parse_args(args)
+    trio.run(handle_pads, args)
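
Taken together, the new entry point builds the parser, then hands control to trio: get_padids fetches the pad list once, the list is split into three roughly equal chunks, and each chunk is processed by its own handle_pad_chunk task inside a nursery. A condensed, runnable sketch of that fan-out shape, with chunks() written out here as a stand-in for the helper etherpump imports from commands.common, and worker/run_all as illustrative names:

    import math
    import trio

    def chunks(lst, size):
        # Stand-in for the chunks() helper from etherpump.commands.common.
        for i in range(0, len(lst), size):
            yield lst[i : i + size]

    async def worker(name, items):
        for item in items:
            await trio.sleep(0)  # placeholder for the per-pad download work
            print(name, "handled", item)

    async def run_all(items):
        chunk_size = math.ceil(len(items) / 3)  # three tasks, as in the diff
        async with trio.open_nursery() as nursery:
            for n, chunk in enumerate(chunks(items, chunk_size)):
                nursery.start_soon(worker, "task-{0}".format(n), chunk)

    trio.run(run_all, ["pad-{0}".format(i) for i in range(7)])

One design note: because each chunk is walked sequentially by its task, the --connection session limit caps how many requests are in flight at once, and the nursery guarantees main() does not return until every chunk has been fully processed or an exception has propagated.
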