"""Shared helpers for pad id <-> file path conversion, JSON fetching with
retries, a console progress bar and HTML/XML entity unescaping.

Written to be importable on both Python 2 and Python 3.
"""
from __future__ import print_function

import io
import json
import os
import re
import sys
from math import ceil, floor
from time import sleep

try:
    # python2
    from urlparse import urlparse, urlunparse
    from urllib2 import urlopen, URLError, HTTPError
    from urllib import urlencode
    from urllib import quote_plus, unquote_plus
    from htmlentitydefs import name2codepoint

    input = raw_input
except ImportError:
    # python3
    from urllib.parse import urlparse, urlunparse, urlencode, quote_plus, unquote_plus
    from urllib.request import urlopen
    from urllib.error import URLError, HTTPError
    from html.entities import name2codepoint

try:
    # python2: distinct text type and code point -> text function
    _text_type = unicode
    _unichr = unichr
except NameError:
    # python3: str is the text type and chr covers all code points
    _text_type = str
    _unichr = chr


# Matches the "g.<group>$" prefix at the start of a pad id.
groupnamepat = re.compile(r"^g\.(\w+)\$")

# Matches decimal/hex character references and named entities (used by unescape).
_entitypat = re.compile(r"&#?\w+;")

# (character, replacement) pairs applied by padpath(..., normalize=True).
_NORMALIZE_REPLACEMENTS = (
    (" ", "_"),
    ("(", ""),
    (")", ""),
    ("?", ""),
    ("'", ""),
)


def splitpadname(padid):
    """Split a pad id into ``(group, name)``.

    A pad id of the form ``g.<group>$<name>`` yields ``(<group>, <name>)``;
    any other id yields ``(u"", padid)``.
    """
    match = groupnamepat.match(padid)
    if match is None:
        return (u"", padid)
    return (match.group(1), padid[match.end():])


def padurl(padid):
    """Return *padid* unchanged."""
    return padid


def padpath(padid, pub_path=u"", group_path=u"", normalize=False):
    """Return the file path for a pad id.

    The pad name is URL-quoted with ``quote_plus``. Pads with a group prefix
    are placed under ``group_path/<group>/``; all other pads under
    ``pub_path/``.

    When *normalize* is true, spaces become underscores and the characters
    ``( ) ? '`` are removed from the pad name. This is done *before* quoting:
    after ``quote_plus`` none of those characters remain in the string (space
    becomes ``+``, the others become ``%XX`` escapes), so replacing afterwards
    would never match anything.
    """
    group, name = splitpadname(padid)
    if normalize:
        for char, replacement in _NORMALIZE_REPLACEMENTS:
            name = name.replace(char, replacement)
    name = quote_plus(name)
    if group:
        return os.path.join(group_path, group, name)
    return os.path.join(pub_path, name)


def padpath2id(path):
    """Return the pad id (as text) for a path produced by ``padpath``.

    The last path component is URL-unquoted to give the pad name; if the path
    has a parent directory, its last component is used as the group and the
    result is ``<group>$<name>``.

    NOTE(review): the ``g.`` prefix that ``splitpadname`` strips is not added
    back here, so a grouped id does not round-trip exactly.
    """
    if not isinstance(path, str):
        # Bring the path to the native str type so os.path and unquote_plus
        # see consistent types: python2 unicode -> UTF-8 bytes,
        # python3 bytes -> text.
        if isinstance(path, _text_type):
            path = path.encode("utf-8")
        else:
            path = path.decode("utf-8")
    dirname, filename = os.path.split(path)
    groupname = os.path.basename(dirname)
    name = unquote_plus(filename)
    padid = "{0}${1}".format(groupname, name) if groupname else name
    if not isinstance(padid, _text_type):
        # python2 only: the native str is bytes, callers get unicode back.
        padid = padid.decode("utf-8")
    return padid


def getjson(url, max_retry=3, retry_sleep_time=3):
    """Fetch *url* and return its JSON object merged into a result dict.

    The returned dict contains the decoded JSON keys plus:

    * ``_retries`` -- number of failed attempts caused by ``HTTPError``
    * ``_code``    -- status code of the response (or of the last HTTPError)
    * ``_url``     -- final URL, only present when it differs from the
      requested one

    On ``HTTPError`` the request is retried up to *max_retry* times, sleeping
    *retry_sleep_time* seconds between attempts; once retries are exhausted
    the dict is returned without JSON data. If ``urlopen`` rejects the URL
    with ``ValueError`` (e.g. a URL without a scheme), the request is retried
    once with ``http://localhost`` prepended.

    Raises:
        ValueError: the response is not valid UTF-8 encoded JSON, or the URL
            is still rejected after the localhost fallback.
    """
    ret = {"_retries": 0}
    localhost_tried = False
    while ret["_retries"] <= max_retry:
        try:
            response = urlopen(url)
        except ValueError:
            # Only fall back once: prefixing an already prefixed URL can
            # never help and would otherwise loop on a mangled address.
            if localhost_tried:
                raise
            localhost_tried = True
            url = "http://localhost" + url
            continue
        except HTTPError as e:
            print("HTTPError {0}".format(e), file=sys.stderr)
            ret["_code"] = e.code
            ret["_retries"] += 1
            # No point in waiting when no further attempt will be made.
            if retry_sleep_time and ret["_retries"] <= max_retry:
                sleep(retry_sleep_time)
            continue

        try:
            data = response.read().decode("utf-8")
            final_url = response.geturl()
            code = response.getcode()
        finally:
            # Always release the connection, even if reading fails.
            response.close()

        ret.update(json.loads(data))
        ret["_code"] = code
        if final_url != url:
            ret["_url"] = final_url
        return ret
    return ret


def loadpadinfo(p):
    """Load and return the JSON document stored in the file at path *p*.

    The file is read as UTF-8 rather than with the locale's default encoding.
    """
    with io.open(p, encoding="utf-8") as f:
        return json.load(f)


def progressbar(i, num, label="", file=sys.stderr):
    """Write a 20-character progress bar for item *i* (0-based) of *num*.

    The line starts with a carriage return so successive calls overwrite each
    other, and is written to *file* (``sys.stderr`` by default), which is
    flushed afterwards.
    """
    # Guard against an empty job list instead of dividing by zero.
    fraction = float(i) / num if num else 0.0
    bars = int(ceil(fraction * 20))
    bar = ("*" * bars) + ("-" * (20 - bars))
    msg = u"\r{0} {1}/{2} {3}... ".format(bar, (i + 1), num, label)
    file.write(msg)
    file.flush()


def unescape(text):
    """Remove HTML or XML character references and entities from *text*.

    Handles decimal (``&#65;``), hexadecimal (``&#x41;``) and named
    (``&amp;``) references. References that cannot be resolved are left
    unchanged.

    Based on a function published by Fredrik Lundh (author of ElementTree,
    among other things) on his website.

    @param text The HTML (or XML) source text.
    @return The plain text, as a Unicode string, if necessary.
    """
    def fixup(m):
        ref = m.group(0)
        if ref[:2] == "&#":
            # character reference
            try:
                if ref[:3].lower() == "&#x":
                    return _unichr(int(ref[3:-1], 16))
                return _unichr(int(ref[2:-1]))
            except (ValueError, OverflowError):
                # not a number, or not a valid code point
                pass
        else:
            # named entity
            try:
                return _unichr(name2codepoint[ref[1:-1]])
            except KeyError:
                pass
        return ref  # leave as is

    return _entitypat.sub(fixup, text)