from __future__ import print_function

import json
import os
import re
import sys
from math import ceil, floor
from time import sleep

try:  # Python 2 module layout
    from urllib import quote_plus, unquote_plus
    from urllib2 import urlopen, HTTPError
except ImportError:  # Python 3 moved these names into the urllib package
    from urllib.error import HTTPError
    from urllib.parse import quote_plus, unquote_plus
    from urllib.request import urlopen
# Matches ids of the form "g.<group>$<name>": captures <group> and ends right
# after the "$", so everything past the match is the pad name.
groupnamepat = re.compile(r"^g\.(\w+)\$")


def splitpadname(padid):
    """Split a pad id into a ``(group, name)`` tuple.

    Ids of the form ``g.<group>$<name>`` yield ``(<group>, <name>)`` with the
    ``g.`` prefix and the ``$`` separator removed.  Any other id is returned
    whole as the name, paired with an empty group string.
    """
    m = groupnamepat.match(padid)
    if m is None:
        # The group must be an *empty* string so callers can test it for truth.
        return (u"", padid)
    return (m.group(1), padid[m.end():])
def padurl(padid):
    """Return the URL form of a pad id.

    No transformation is applied: the id is handed back exactly as received.
    """
    url = padid
    return url
def padpath(padid, pub_path=u"", group_path=u""):
    """Return the filesystem path used for a pad id.

    The id is split with ``splitpadname``.  The pad name is escaped with
    ``quote_plus`` (which also escapes ``/``), so it forms a single path
    component.  A pad with a group maps to ``group_path/<group>/<name>``;
    any other pad maps to ``pub_path/<name>``.
    """
    group, name = splitpadname(padid)
    if sys.version_info[0] == 2:
        # Python 2's quote_plus() works on byte strings, so text is encoded
        # as UTF-8 first.  Python 3's quote_plus() takes text and encodes it
        # as UTF-8 itself, so nothing needs converting there.
        text_type = type(u"")
        if isinstance(group, text_type):
            group = group.encode("utf-8")
        if isinstance(name, text_type):
            name = name.encode("utf-8")
    name = quote_plus(name)
    if group:
        return os.path.join(group_path, group, name)
    return os.path.join(pub_path, name)
def padpath2id(path):
    """Rebuild a pad id from a path.

    The last path component is unescaped with ``unquote_plus`` and used as
    the pad name.  If the path has a parent directory, that directory's name
    is treated as the group and the result is ``<group>$<name>``; otherwise
    the result is just ``<name>``.  The id is returned as text.

    NOTE(review): ``splitpadname`` strips the ``g.`` prefix from group ids
    and this function does not put it back, so a group pad id does not
    round-trip through ``padpath`` -- confirm whether that is intended.
    """
    py2 = sys.version_info[0] == 2
    if py2:
        # Python 2: do the work on UTF-8 byte strings, decode once at the end.
        if isinstance(path, type(u"")):
            path = path.encode("utf-8")
    elif isinstance(path, bytes):
        # Python 3: unquote_plus() needs text.
        path = path.decode("utf-8")
    parent, leaf = os.path.split(path)
    # basename() honours the platform separator, matching os.path.join()
    # on the writing side; on POSIX it equals parent.split("/")[-1].
    gname = os.path.basename(parent)
    leaf = unquote_plus(leaf)
    padid = "{0}${1}".format(gname, leaf) if gname else leaf
    return padid.decode("utf-8") if py2 else padid
def getjson(url, max_retry=3, retry_sleep_time=0.5):
    """Fetch *url* and return its JSON object as a dict, retrying on HTTP errors.

    The request is attempted at most ``max_retry + 1`` times.  Only
    ``HTTPError`` triggers a retry; any other exception, including a JSON
    decoding error, propagates to the caller.

    The returned dict holds the decoded JSON members plus bookkeeping keys:
    ``_retries`` (number of failed attempts), ``_code`` (status code of the
    last attempt) and, when the final URL differs from the requested one,
    ``_url``.  If every attempt fails, only the bookkeeping keys are present.

    ``retry_sleep_time`` is the pause in seconds between attempts; a falsy
    value disables it.
    """
    ret = {"_retries": 0}
    while ret["_retries"] <= max_retry:
        try:
            response = urlopen(url)
            try:
                # Collect everything needed before closing, and close even
                # if reading fails part-way through.
                data = response.read()
                final_url = response.geturl()
                code = response.getcode()
            finally:
                response.close()
        except HTTPError as e:
            print("HTTPError {0}".format(e), file=sys.stderr)
            ret["_code"] = e.code
            ret["_retries"] += 1
            # Pause only when another attempt will actually follow.
            if retry_sleep_time and ret["_retries"] <= max_retry:
                sleep(retry_sleep_time)
        else:
            ret.update(json.loads(data))
            ret["_code"] = code
            if final_url != url:
                ret["_url"] = final_url
            return ret
    return ret
def loadpadinfo(p):
    """Read the JSON document stored at path *p* and return the decoded value."""
    with open(p) as source:
        return json.loads(source.read())
def progressbar(i, num, label="", file=sys.stderr):
    """Write a one-line, 20-character text progress bar to *file*.

    The filled part of the bar is ``ceil(i / num * 20)`` asterisks, padded
    with dashes.  It is followed by the counter ``i + 1``/``num`` and
    *label*.  The line starts with a carriage return and has no newline, so
    successive calls overwrite one another on a terminal.

    A ``num`` of zero draws an empty bar instead of dividing by zero.
    Passing ``file=None`` falls back to ``sys.stderr``.
    """
    stream = sys.stderr if file is None else file
    fraction = float(i) / num if num else 0.0
    filled = int(ceil(fraction * 20))
    # A negative repeat count yields "", so an overfull bar simply has no dashes.
    bar = ("*" * filled) + ("-" * (20 - filled))
    msg = u"\r{0} {1}/{2} {3}... ".format(bar, i + 1, num, label)
    if sys.version_info[0] == 2:
        # Python 2 streams take bytes; encode explicitly so a non-ASCII label
        # does not go through an implicit ASCII encode.
        msg = msg.encode("utf-8")
    stream.write(msg)
    stream.flush()
# Python developer Fredrik Lundh (author of ElementTree, among other things) published an HTML-unescaping function on his website that works with decimal, hex and named entities:
import re

try:
    import htmlentitydefs  # Python 2
except ImportError:
    import html.entities as htmlentitydefs  # Python 3 home of name2codepoint

try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # Python 3: chr() returns text for any code point


def unescape(text):
    """Remove HTML or XML character references and entities from *text*.

    Decimal (``&#65;``) and hexadecimal (``&#x41;`` / ``&#X41;``) character
    references and named entities (``&amp;``) are replaced by the characters
    they stand for.  Anything that cannot be resolved -- an unknown entity
    name, malformed digits, or a code point that is out of range -- is left
    in the text unchanged.

    @param text The HTML (or XML) source text.
    @return The plain text, as a Unicode string, if necessary.
    """
    def fixup(m):
        ref = m.group(0)
        if ref[:2] == "&#":
            # Numeric character reference.
            try:
                if ref[:3].lower() == "&#x":
                    return _unichr(int(ref[3:-1], 16))
                return _unichr(int(ref[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: malformed digits or code point out of range.
                # OverflowError: number too large for the conversion at all.
                pass
        else:
            # Named entity.
            try:
                return _unichr(htmlentitydefs.name2codepoint[ref[1:-1]])
            except KeyError:
                pass
        return ref  # leave as is

    return re.sub(r"&#?\w+;", fixup, text)