etherpump/etherdump/commands/common.py


								from __future__ import print_function

								import re, os, json, sys

								from math import ceil, floor

								from time import sleep


								try:

								    # python2

								    from urlparse import urlparse, urlunparse

								    from urllib2 import urlopen, URLError, HTTPError

								    from urllib import urlencode

								    from urllib import quote_plus, unquote_plus

								    from htmlentitydefs import name2codepoint


								    input = raw_input

								except ImportError:

								    # python3

								    from urllib.parse import urlparse, urlunparse, urlencode, quote_plus, unquote_plus

								    from urllib.request import urlopen, URLError, HTTPError

								    from html.entities import name2codepoint


								# Matches the "g.<groupname>$" prefix that marks a group pad id.
								groupnamepat = re.compile(r"^g\.(\w+)\$")


								def splitpadname(padid):
								    """Split a pad id into a ``(group name, pad name)`` tuple.

								    A pad id of the form ``g.<group>$<name>`` yields ``(<group>, <name>)``.
								    Any other pad id yields an empty group name and the id unchanged.
								    """
								    found = groupnamepat.match(padid)
								    if found is None:
								        # No group prefix: the whole id is the pad name.
								        return (u"", padid)
								    groupname = found.group(1)
								    padname = padid[found.end():]
								    return (groupname, padname)


								def padurl(padid):
								    """Return the URL component for *padid*; currently the id itself, unchanged."""
								    url = padid
								    return url


								def padpath(padid, pub_path=u"", group_path=u"", normalize=False):
								    """Map a pad id to a relative filesystem path.

								    The pad id is split with ``splitpadname``; the pad-name part is
								    URL-quoted with ``quote_plus`` so it is safe to use as a file name.

								    :param padid: pad id, optionally carrying a ``g.<group>$`` prefix.
								    :param pub_path: directory prefix used for pads without a group.
								    :param group_path: directory prefix used for group pads; the group
								        name is appended as a sub-directory.
								    :param normalize: when true, simplify the pad name before quoting:
								        spaces become underscores and ``(``, ``)``, ``?`` and ``'`` are
								        removed.
								    :return: ``group_path/<group>/<quoted name>`` for group pads,
								        ``pub_path/<quoted name>`` otherwise.
								    """
								    g, p = splitpadname(padid)
								    if normalize:
								        # This has to happen BEFORE quote_plus(): quoting rewrites " " to
								        # "+" and "(", ")", "?", "'" to %XX escapes, so replacing them on
								        # the quoted string never matches anything.
								        p = p.replace(" ", "_")
								        for unwanted in "()?'":
								            p = p.replace(unwanted, "")
								    p = quote_plus(p)
								    if g:
								        return os.path.join(group_path, g, p)
								    return os.path.join(pub_path, p)


								def padpath2id(path):
								    """Convert a path back into a pad id (text string).

								    The last path component is URL-unquoted with ``unquote_plus`` and used
								    as the pad name. If the path has a parent directory, the name of that
								    directory is used as the group and the result is ``<group>$<name>``;
								    otherwise the result is just ``<name>``.

								    Works on both Python 2 and Python 3, and accepts either a text or a
								    UTF-8 encoded byte string.

								    :param path: path of a pad file, using ``/`` between directories.
								    :return: the pad id as a unicode/text string.
								    """
								    # True only on Python 2, where the native ``str`` type is a byte string.
								    native_is_bytes = str is bytes
								    if native_is_bytes:
								        # Python 2's unquote mis-decodes %XX escapes in unicode input, so
								        # work on UTF-8 bytes and decode the result at the end.
								        if not isinstance(path, str):
								            path = path.encode("utf-8")
								    elif isinstance(path, bytes):
								        path = path.decode("utf-8")

								    dirname, fname = os.path.split(path)
								    gname = dirname.split("/")[-1]
								    fname = unquote_plus(fname)
								    padid = "{0}${1}".format(gname, fname) if gname else fname

								    if native_is_bytes:
								        padid = padid.decode("utf-8")
								    return padid


								def getjson(url, max_retry=3, retry_sleep_time=3):
								    """Fetch *url* and return its JSON object merged into a result dict.

								    The returned dict contains the decoded JSON members plus bookkeeping
								    keys: ``_retries`` (number of failed attempts), ``_code`` (status code
								    of the last response or HTTP error) and, when the final URL differs
								    from the requested one, ``_url``.

								    A URL that ``urlopen`` rejects with ``ValueError`` (e.g. one without a
								    scheme) is retried once with ``http://localhost`` prepended. Requests
								    failing with ``HTTPError`` are retried up to *max_retry* times; if all
								    attempts fail, the dict is returned with only the bookkeeping keys.

								    :param url: URL to fetch.
								    :param max_retry: number of retries allowed after an ``HTTPError``.
								    :param retry_sleep_time: seconds to wait between attempts (falsy
								        disables waiting).
								    :raises ValueError: if the URL is still rejected after the localhost
								        fallback, or if the response body is not valid UTF-8 JSON.
								    """
								    ret = {"_retries": 0}
								    localhost_tried = False
								    while ret["_retries"] <= max_retry:
								        try:
								            try:
								                f = urlopen(url)
								            except ValueError:
								                # Only urlopen's own complaint about the URL triggers the
								                # fallback, and only once, so a bad URL cannot loop forever
								                # and JSON/decoding errors no longer corrupt the URL.
								                if localhost_tried:
								                    raise
								                localhost_tried = True
								                url = "http://localhost" + url
								                continue
								            try:
								                data = f.read()
								                rurl = f.geturl()
								                code = f.getcode()
								            finally:
								                # Close the response even when reading fails.
								                f.close()
								        except HTTPError as e:
								            print ("HTTPError {0}".format(e), file=sys.stderr)
								            ret["_code"] = e.code
								            ret["_retries"] += 1
								            # No point in waiting when no further attempt will be made.
								            if retry_sleep_time and ret["_retries"] <= max_retry:
								                sleep(retry_sleep_time)
								            continue
								        ret.update(json.loads(data.decode("utf-8")))
								        ret["_code"] = code
								        if rurl != url:
								            ret["_url"] = rurl
								        return ret
								    return ret


								def loadpadinfo(p):
								    """Load the pad info JSON file at path *p* and return its contents.

								    When the loaded data has no ``localapiurl`` key, one is added whose
								    value is that of ``apiurl`` (``None`` if ``apiurl`` is absent too).
								    """
								    with open(p) as fp:
								        padinfo = json.load(fp)
								    if 'localapiurl' in padinfo:
								        return padinfo
								    # Fall back to the public API URL for local access.
								    padinfo['localapiurl'] = padinfo.get('apiurl')
								    return padinfo


								def progressbar(i, num, label="", file=sys.stderr):
								    """Write a one-line, 20-character text progress bar to *file*.

								    The line starts with a carriage return so that successive calls
								    overwrite each other on a terminal.

								    :param i: zero-based index of the current item; the bar is filled in
								        proportion to ``i / num`` while the counter shown is ``i + 1``.
								    :param num: total number of items; ``0`` draws an empty bar instead of
								        raising ``ZeroDivisionError``.
								    :param label: text appended after the counter.
								    :param file: stream to write to (defaults to standard error).
								    """
								    p = float(i) / num if num else 0.0
								    bars = int(ceil(p * 20))
								    bar = ("*" * bars) + ("-" * (20 - bars))
								    msg = u"\r{0} {1}/{2} {3}... ".format(bar, (i + 1), num, label)
								    # Honour the caller-supplied stream rather than always using stderr.
								    file.write(msg)
								    file.flush()


								# Python developer Fredrik Lundh (author of elementtree, among other things) has such a function on his website, which works with decimal, hex and named entities:

								##

								# Removes HTML or XML character references and entities from a text string.

								#

								# @param text The HTML (or XML) source text.

								# @return The plain text, as a Unicode string, if necessary.

								def unescape(text):
								    """Replace HTML/XML character references and entities in *text*.

								    Handles decimal (``&#65;``), hexadecimal (``&#x41;`` / ``&#X41;``) and
								    named (``&amp;``) references. References that cannot be resolved
								    (unknown name, malformed or out-of-range number) are left untouched.

								    :param text: the HTML (or XML) source text.
								    :return: the text with resolvable references replaced by characters.
								    """
								    try:
								        to_char = unichr  # Python 2
								    except NameError:
								        to_char = chr  # Python 3: chr() already yields text characters

								    def fixup(m):
								        ref = m.group(0)
								        if ref[:2] == "&#":
								            # numeric character reference
								            try:
								                if ref[2:3] in ("x", "X"):
								                    return to_char(int(ref[3:-1], 16))
								                return to_char(int(ref[2:-1]))
								            except (ValueError, OverflowError):
								                # malformed digits or code point out of range
								                pass
								        else:
								            # named entity
								            try:
								                return to_char(name2codepoint[ref[1:-1]])
								            except KeyError:
								                pass
								        return ref  # leave as is

								    return re.sub(r"&#?\w+;", fixup, text)