import json
import os
import re
import sys
from html.entities import name2codepoint
from math import ceil, floor
from time import sleep
from urllib.parse import (
    quote_plus,
    unquote_plus,
    urlencode,
    urlparse,
    urlunparse,
)
from urllib.request import HTTPError, URLError, urlopen

# Matches the "g.<group>$" prefix of a group pad id; group 1 is the group name.
groupnamepat = re.compile(r"^g\.(\w+)\$")

# Translation used by padpath(normalize=True): space -> "_", and the
# characters ( ) ? ' are dropped.
_NORMALIZE_TABLE = str.maketrans(" ", "_", "()?'")

# Width of the progress bar drawn by progressbar(), in characters.
_BAR_WIDTH = 20


def splitpadname(padid):
    """Split a pad id into ``(group, name)``.

    An id of the form ``g.<group>$<name>`` yields ``(<group>, <name>)``;
    any other id yields ``("", padid)``.
    """
    m = groupnamepat.match(padid)
    if m:
        return (m.group(1), padid[m.end():])
    return ("", padid)


def padurl(padid):
    """Return the URL component for *padid* (currently the id unchanged)."""
    return padid


def padpath(padid, pub_path="", group_path="", normalize=False):
    """Return the file path for *padid*.

    Group pads (``g.<group>$<name>``) are placed under
    ``group_path/<group>/``, all other pads under ``pub_path/``. The pad
    name is URL-quoted with ``quote_plus``.

    When *normalize* is true, spaces become underscores and the characters
    ``( ) ? '`` are removed from the name. This has to happen before
    quoting: ``quote_plus`` rewrites every one of those characters, so
    replacing them afterwards would never match anything.
    """
    group, name = splitpadname(padid)
    if normalize:
        name = name.translate(_NORMALIZE_TABLE)
    name = quote_plus(name)
    if group:
        return os.path.join(group_path, group, name)
    return os.path.join(pub_path, name)


def padpath2id(path):
    """Derive a pad id from a file path.

    The last path component is URL-unquoted to give the pad name. If the
    path has a parent directory, its name is used as the group and the
    result is ``"<group>$<name>"``; otherwise the bare name is returned.

    *path* may be ``str`` or UTF-8 encoded ``bytes``; the result is always
    ``str``.
    """
    if isinstance(path, bytes):
        path = path.decode("utf-8")
    parent, name = os.path.split(path)
    gname = os.path.basename(parent)
    name = unquote_plus(name)
    if gname:
        return "{0}${1}".format(gname, name)
    return name


def getjson(url, max_retry=3, retry_sleep_time=3):
    """Fetch *url* and return its JSON object merged into a result dict.

    The returned dict always contains ``"_retries"`` (number of failed
    attempts). On success it also holds the decoded JSON members,
    ``"_code"`` (the response status) and, when the final URL differs
    from the requested one, ``"_url"``. After an ``HTTPError``,
    ``"_code"`` holds the error status.

    A URL without a scheme (e.g. ``/api/...``) is retried once with
    ``http://localhost`` prepended. HTTP errors and undecodable responses
    count as a retry; at most ``max_retry + 1`` such attempts are made,
    sleeping *retry_sleep_time* seconds between them. Other exceptions
    raised by ``urlopen`` (e.g. ``URLError``) propagate to the caller.
    """
    ret = {"_retries": 0}
    while ret["_retries"] <= max_retry:
        try:
            # The context manager closes the response even if reading fails.
            with urlopen(url) as f:
                data = f.read().decode("utf-8")
                rurl = f.geturl()
                code = f.getcode()
            ret.update(json.loads(data))
            ret["_code"] = code
            if rurl != url:
                ret["_url"] = rurl
            return ret
        except HTTPError as e:
            print("HTTPError {0}".format(e), file=sys.stderr)
            ret["_code"] = e.code
        except ValueError as e:
            # urlopen raises ValueError for a URL with no scheme; assume a
            # local server. This fix-up is not counted as a retry.
            if not urlparse(url).scheme:
                url = "http://localhost" + url
                continue
            # Otherwise the body could not be decoded (bad UTF-8 or JSON).
            # Count it as a failed attempt so the loop always terminates.
            print("ValueError {0}".format(e), file=sys.stderr)
        ret["_retries"] += 1
        if retry_sleep_time:
            sleep(retry_sleep_time)
    return ret


def loadpadinfo(p):
    """Load the JSON pad-info file at path *p*.

    If the file has no ``localapiurl`` key, it is filled in from
    ``apiurl`` (``None`` when that is missing too).
    """
    with open(p, encoding="utf-8") as f:
        info = json.load(f)
    if "localapiurl" not in info:
        info["localapiurl"] = info.get("apiurl")
    return info


def progressbar(i, num, label="", file=sys.stderr):
    """Write a one-line text progress bar for item *i* (0-based) of *num*.

    The line starts with a carriage return so successive calls overwrite
    each other. Output goes to *file*, which is flushed after the write.
    A *num* of 0 is drawn as a full bar instead of raising
    ``ZeroDivisionError``.
    """
    fraction = float(i) / num if num else 1.0
    bars = int(ceil(fraction * _BAR_WIDTH))
    bar = ("*" * bars) + ("-" * (_BAR_WIDTH - bars))
    msg = "\r{0} {1}/{2} {3}... ".format(bar, (i + 1), num, label)
    file.write(msg)
    file.flush()


# Based on a function published by Fredrik Lundh (author of ElementTree,
# among other things), which handles decimal, hexadecimal and named
# entities.
def unescape(text):
    """Replace HTML/XML character references and entities in *text*.

    Decimal (``&#65;``) and hexadecimal (``&#x41;``) character references
    and named entities (``&amp;``) are replaced by the character they
    denote. References that cannot be resolved are left unchanged.
    """

    def fixup(m):
        ref = m.group(0)
        if ref[:2] == "&#":
            # Numeric character reference.
            try:
                if ref[:3] == "&#x":
                    return chr(int(ref[3:-1], 16))
                return chr(int(ref[2:-1]))
            except (ValueError, OverflowError):
                # Not a number, or outside the range chr() accepts.
                pass
        else:
            # Named entity.
            try:
                return chr(name2codepoint[ref[1:-1]])
            except KeyError:
                pass
        return ref  # leave as is

    return re.sub(r"&#?\w+;", fixup, text)