2016-01-08 12:09:05 +01:00
from __future__ import print_function
2015-11-24 13:48:03 +01:00
import re , os , json , sys
from math import ceil , floor
2016-01-08 12:09:05 +01:00
from time import sleep
2018-01-12 14:42:55 +01:00
try :
# python2
from urlparse import urlparse , urlunparse
from urllib2 import urlopen , URLError , HTTPError
from urllib import urlencode
from urllib import quote_plus , unquote_plus
from htmlentitydefs import name2codepoint
input = raw_input
except ImportError :
# python3
from urllib . parse import urlparse , urlunparse , urlencode , quote_plus , unquote_plus
from urllib . request import urlopen , URLError , HTTPError
from html . entities import name2codepoint
2015-11-13 11:03:57 +01:00
# Matches the leading "g.<group>$" prefix of a group pad id; group 1 is <group>.
groupnamepat = re.compile(r"^g\.(\w+)\$")


def splitpadname(padid):
    """Split a pad id into a ``(group, name)`` tuple.

    Ids of the form ``g.<group>$<name>`` yield ``(<group>, <name>)``.
    Any other id yields ``(u"", padid)``.
    """
    match = groupnamepat.match(padid)
    if match is None:
        # No group prefix: the whole id is the pad name.
        return (u"", padid)
    group = match.group(1)
    name = padid[match.end():]
    return (group, name)
2015-12-04 17:17:32 +01:00
def padurl(padid):
    """Return the value used as the pad's URL; currently *padid* unchanged."""
    url = padid
    return url
2018-01-12 14:42:55 +01:00
def padpath(padid, pub_path=u"", group_path=u"", normalize=False):
    """Map a pad id to a relative file path.

    The id is split with ``splitpadname``. The pad name is URL-quoted with
    ``quote_plus`` and joined under ``group_path/<group>`` for group pads,
    or under ``pub_path`` otherwise.

    :param padid: pad id, optionally prefixed with ``g.<group>$``.
    :param pub_path: directory prefix for pads without a group.
    :param group_path: directory prefix for group pads.
    :param normalize: when true, replace spaces in the pad name with ``_``
        and drop ``(``, ``)``, ``?`` and ``'`` before quoting.
    :return: the joined path.
    """
    g, p = splitpadname(padid)
    if normalize:
        # This has to run BEFORE quote_plus(): quoting turns " " into "+"
        # and percent-encodes ( ) ? ', so replacing afterwards never matches
        # and normalize would silently do nothing.
        p = p.replace(" ", "_")
        for unwanted in "()?'":
            p = p.replace(unwanted, "")
    p = quote_plus(p)
    if g:
        return os.path.join(group_path, g, p)
    return os.path.join(pub_path, p)
2015-11-24 13:48:03 +01:00
def padpath2id(path):
    """Rebuild a pad id from a file path.

    The last path component is URL-unquoted with ``unquote_plus`` to give
    the pad name. If the path has a parent directory, its last component is
    used as the group name and the result is ``<group>$<name>``; otherwise
    the result is just ``<name>``.

    Works on both Python 2 and Python 3. The original used the ``unicode``
    builtin and ``str.decode``, which do not exist on Python 3.

    :param path: file path as text or UTF-8 encoded bytes.
    :return: the pad id as a text (unicode) string.
    """
    text_type = type(u"")
    if str is bytes:
        # Python 2: unquote on UTF-8 bytes, decode the result at the end.
        if isinstance(path, text_type):
            path = path.encode("utf-8")
    elif isinstance(path, bytes):
        # Python 3: unquote_plus() needs text and decodes escapes as UTF-8.
        path = path.decode("utf-8")
    dd, p = os.path.split(path)
    gname = dd.split("/")[-1]
    p = unquote_plus(p)
    if gname:
        padid = "{0}${1}".format(gname, p)
    else:
        padid = p
    if isinstance(padid, bytes):
        padid = padid.decode("utf-8")
    return padid
2016-01-08 12:09:05 +01:00
def getjson(url, max_retry=3, retry_sleep_time=0.5):
    """Fetch *url*, decode the body as UTF-8 JSON and return it as a dict.

    The decoded JSON object is merged into the result together with:

    * ``_retries`` -- number of failed attempts so far,
    * ``_code``    -- status code of the response (or of the last HTTPError),
    * ``_url``     -- the final URL, only when it differs from the one requested.

    A URL that ``urlopen`` rejects with ValueError (for example one without
    a scheme) is retried once with ``http://localhost`` prepended. HTTP
    errors and undecodable/invalid JSON payloads are reported on stderr and
    retried up to *max_retry* times, sleeping *retry_sleep_time* seconds
    between attempts. When every attempt fails, the partial result (without
    payload keys) is returned.

    :param url: URL to request.
    :param max_retry: number of retries after the first failed attempt.
    :param retry_sleep_time: seconds to sleep between attempts (falsy: none).
    :return: dict with the JSON payload plus the ``_``-prefixed metadata.
    """
    ret = {"_retries": 0}
    while ret["_retries"] <= max_retry:
        try:
            try:
                f = urlopen(url)
            except ValueError:
                # Only the open itself triggers the localhost fallback.
                # Previously a JSON decode error (also a ValueError) ended up
                # here too and corrupted the URL.
                url = "http://localhost" + url
                f = urlopen(url)
            try:
                data = f.read()
                rurl = f.geturl()
                code = f.getcode()
            finally:
                # Close the response even when reading fails.
                f.close()
            ret.update(json.loads(data.decode("utf-8")))
            ret["_code"] = code
            if rurl != url:
                ret["_url"] = rurl
            return ret
        except HTTPError as e:
            print("HTTPError {0}".format(e), file=sys.stderr)
            ret["_code"] = e.code
        except ValueError as e:
            # Invalid UTF-8 or JSON in the response body: count it as a
            # failed attempt so the loop always terminates.
            print("ValueError {0}".format(e), file=sys.stderr)
        ret["_retries"] += 1
        if retry_sleep_time:
            sleep(retry_sleep_time)
    return ret
2015-11-24 13:48:03 +01:00
def loadpadinfo(p):
    """Open the file at path *p* and return its parsed JSON content."""
    with open(p) as infofile:
        return json.load(infofile)
def progressbar(i, num, label="", file=sys.stderr):
    """Write a one-line, 20-character progress bar to *file*.

    The line starts with a carriage return so successive calls overwrite
    each other. It shows the bar, the 1-based position ``i + 1`` out of
    *num*, and *label*.

    :param i: zero-based index of the current item.
    :param num: total number of items; ``0`` draws a full bar.
    :param label: text shown after the counter.
    :param file: stream to write to. The original ignored this argument and
        always wrote to ``sys.stderr``; it is now honoured. ``None`` falls
        back to the current ``sys.stderr``.
    """
    stream = sys.stderr if file is None else file
    # Avoid ZeroDivisionError when there is nothing to process.
    p = float(i) / num if num else 1.0
    # Clamp so an out-of-range index can never produce a bar of the wrong width.
    bars = max(0, min(20, int(ceil(p * 20))))
    bar = ("*" * bars) + ("-" * (20 - bars))
    msg = u"\r{0} {1}/{2} {3}... ".format(bar, (i + 1), num, label)
    stream.write(msg)
    stream.flush()
2016-01-15 12:06:21 +01:00
# Adapted from a function published by Python developer Fredrik Lundh
# (author of elementtree, among other things); it handles decimal, hex and
# named entities.
##
# Removes HTML or XML character references and entities from a text string.
#
# @param text The HTML (or XML) source text.
# @return The plain text, as a Unicode string, if necessary.
def unescape(text):
    """Replace HTML/XML character references and named entities in *text*.

    Handles decimal (``&#65;``), hexadecimal (``&#x41;``) and named
    (``&amp;``) forms. References that cannot be resolved are left
    unchanged.
    """
    try:
        to_char = unichr  # python2
    except NameError:
        to_char = chr  # python3: unichr no longer exists

    def fixup(m):
        ref = m.group(0)
        try:
            if ref[:3] == "&#x":
                # hexadecimal character reference
                return to_char(int(ref[3:-1], 16))
            if ref[:2] == "&#":
                # decimal character reference
                return to_char(int(ref[2:-1]))
            # named entity
            return to_char(name2codepoint[ref[1:-1]])
        except (KeyError, ValueError, OverflowError):
            # Unknown name, malformed number or out-of-range code point.
            return ref  # leave as is

    return re.sub(r"&#?\w+;", fixup, text)