from __future__ import print_function
import re , os , json , sys
from math import ceil , floor
from time import sleep
try :
# python2
from urlparse import urlparse , urlunparse
from urllib2 import urlopen , URLError , HTTPError
from urllib import urlencode
from urllib import quote_plus , unquote_plus
from htmlentitydefs import name2codepoint
input = raw_input
except ImportError :
# python3
from urllib . parse import urlparse , urlunparse , urlencode , quote_plus , unquote_plus
from urllib . request import urlopen , URLError , HTTPError
from html . entities import name2codepoint
# Matches a leading "g.<word characters>$" prefix and captures the word part.
groupnamepat = re.compile(r"^g\.(\w+)\$")


def splitpadname(padid):
    """Split a pad id into a ``(group, name)`` tuple.

    When *padid* starts with ``g.<group>$`` the captured group name and the
    remainder of the id after the ``$`` are returned.  Otherwise the group is
    the empty string and the name is *padid* unchanged.
    """
    found = groupnamepat.match(padid)
    if found is None:
        return (u"", padid)
    return (found.group(1), padid[found.end():])
def padurl(padid):
    """Return the URL for *padid*; at present this is the id itself, unchanged."""
    return padid
def padpath(padid, pub_path=u"", group_path=u"", normalize=False):
    """Build the filesystem path under which a pad is stored.

    The pad id is split with ``splitpadname``.  The pad name is URL-quoted
    with ``quote_plus`` and joined below ``group_path/<group>`` when the id
    carries a group prefix, or below ``pub_path`` otherwise.

    :param padid: pad id, optionally prefixed with ``g.<group>$``.
    :param pub_path: base directory for pads without a group.
    :param group_path: base directory for pads that belong to a group.
    :param normalize: when true, spaces in the pad name become ``_`` and the
        characters ``( ) ? '`` are dropped before quoting.
    :return: the joined path.
    """
    g, p = splitpadname(padid)
    if normalize:
        # Normalise BEFORE quoting: quote_plus turns " " into "+" and the
        # other characters into %XX escapes, so replacing afterwards can
        # never match anything.
        p = p.replace(" ", "_")
        for unwanted in "()?'":
            p = p.replace(unwanted, "")
    if not isinstance(p, str):
        # Python 2 ``unicode``: quote_plus needs UTF-8 bytes there.
        p = p.encode("utf-8")
    p = quote_plus(p)
    if g:
        return os.path.join(group_path, g, p)
    return os.path.join(pub_path, p)
def padpath2id(path):
    """Turn a pad file path back into a pad id.

    The last path component is URL-unquoted (``unquote_plus``) to give the
    pad name.  If the directory part is non-empty, its last ``/``-separated
    component is used as a group name and the result is ``<group>$<name>``;
    otherwise the result is just the name.

    NOTE(review): no ``g.`` prefix is added, so the result does not match
    ``groupnamepat`` -- confirm with callers whether that is intended.

    :param path: path as text or UTF-8 encoded bytes.
    :return: the pad id as a text (unicode) string.
    """
    if not isinstance(path, str):
        # Convert to the native ``str`` type so unquote_plus decodes UTF-8
        # correctly: Python 2 ``unicode`` -> bytes, Python 3 ``bytes`` -> text.
        path = path.encode("utf-8") if str is bytes else path.decode("utf-8")
    dd, p = os.path.split(path)
    gname = dd.split("/")[-1]
    p = unquote_plus(p)
    padid = "{0}${1}".format(gname, p) if gname else p
    if isinstance(padid, bytes):
        # Only reached on Python 2, where the native str is a byte string.
        padid = padid.decode("utf-8")
    return padid
def getjson(url, max_retry=3, retry_sleep_time=3):
    """Fetch *url* and return its JSON object merged into a result dict.

    On success the decoded JSON object is merged into the result, which also
    carries ``_retries`` (number of failed attempts), ``_code`` (value of the
    response's ``getcode()``) and, when the final URL differs from the
    requested one, ``_url``.

    An ``HTTPError`` is reported on stderr, recorded in ``_code`` and retried
    up to *max_retry* times, sleeping *retry_sleep_time* seconds after each
    failure.  When all attempts fail, the dict holding only ``_retries`` and
    ``_code`` is returned.

    A URL that ``urlopen`` rejects with ``ValueError`` (for example one
    without a scheme) is retried once with ``http://localhost`` prepended.

    :raises ValueError: if the URL is still rejected after prefixing, or if
        the response body is not valid UTF-8 encoded JSON.
    """
    ret = {"_retries": 0}
    prefixed = False
    while ret["_retries"] <= max_retry:
        try:
            f = urlopen(url)
        except ValueError:
            # Only the urlopen call is guarded, so a JSON or UTF-8 decoding
            # error further down can no longer corrupt the URL.
            if prefixed:
                raise
            url = "http://localhost" + url
            prefixed = True
            continue
        except HTTPError as e:
            print("HTTPError {0}".format(e), file=sys.stderr)
            ret["_code"] = e.code
            ret["_retries"] += 1
            if retry_sleep_time:
                sleep(retry_sleep_time)
            continue
        try:
            data = f.read().decode("utf-8")
            rurl = f.geturl()
            code = f.getcode()
        finally:
            # Release the connection even when reading or decoding fails.
            f.close()
        ret.update(json.loads(data))
        ret["_code"] = code
        if rurl != url:
            ret["_url"] = rurl
        return ret
    return ret
def loadpadinfo(p):
    """Read the JSON document stored at path *p* and return the decoded value."""
    with open(p) as handle:
        return json.load(handle)
def progressbar(i, num, label="", file=sys.stderr):
    """Draw a 20-character text progress bar on *file*.

    The line starts with a carriage return so successive calls overwrite one
    another on a terminal.

    :param i: zero-based index of the current item; shown as ``i + 1``.
    :param num: total number of items.  ``0`` is drawn as an empty bar
        instead of raising ``ZeroDivisionError``.
    :param label: text appended after the counter.
    :param file: stream to write to (defaults to ``sys.stderr``).
    """
    p = float(i) / num if num else 0.0
    bars = int(ceil(p * 20))
    bar = ("*" * bars) + ("-" * (20 - bars))
    msg = u"\r{0} {1}/{2} {3}... ".format(bar, (i + 1), num, label)
    # Honour the ``file`` argument; it used to be ignored in favour of stderr.
    file.write(msg)
    file.flush()
# Adapted from a function published by Python developer Fredrik Lundh (author
# of elementtree, among other things) on his website; it works with decimal,
# hex and named entities.
##
# Removes HTML or XML character references and entities from a text string.
#
# @param text The HTML (or XML) source text.
# @return The plain text, as a Unicode string, if necessary.
def unescape(text):
    """Replace HTML/XML character references and named entities in *text*.

    Decimal (``&#65;``), hexadecimal (``&#x41;``) and named (``&amp;``)
    references are replaced by the character they denote.  Anything that
    cannot be resolved (unknown name, malformed or out-of-range number) is
    left in the text unchanged.
    """
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3 has no unichr; chr covers all of Unicode

    def fixup(m):
        ref = m.group(0)
        try:
            if ref[:3] == "&#x":
                # hexadecimal character reference
                return to_char(int(ref[3:-1], 16))
            if ref[:2] == "&#":
                # decimal character reference
                return to_char(int(ref[2:-1]))
            # named entity
            return to_char(name2codepoint[ref[1:-1]])
        except (ValueError, OverflowError, KeyError):
            # Not a number, code point out of range, or unknown entity name.
            return ref  # leave as is

    return re.sub(r"&#?\w+;", fixup, text)