etherpump/etherdump/commands/common.py

from __future__ import print_function
import re, os, json, sys
from math import ceil, floor
from time import sleep

try:
    # python2
    from urlparse import urlparse, urlunparse
    from urllib2 import urlopen, URLError, HTTPError
    from urllib import urlencode
    from urllib import quote_plus, unquote_plus
    from htmlentitydefs import name2codepoint

    input = raw_input
except ImportError:
    # python3
    from urllib.parse import urlparse, urlunparse, urlencode, quote_plus, unquote_plus
    from urllib.request import urlopen, URLError, HTTPError
    from html.entities import name2codepoint

# Matches a leading "g.<word characters>$" prefix and captures the word part.
groupnamepat = re.compile(r"^g\.(\w+)\$")


def splitpadname(padid):
    """Split a pad id into a ``(group, name)`` tuple.

    When *padid* starts with ``g.<word>$``, ``<word>`` is returned as the
    group and everything after the ``$`` as the name.  Otherwise the group
    is the empty string and the name is *padid* unchanged.
    """
    found = groupnamepat.match(padid)
    if found is None:
        return (u"", padid)
    group = found.group(1)
    name = padid[found.end():]
    return (group, name)

def padurl(padid):
    """Return the URL form of *padid*; currently the pad id itself, unchanged."""
    url = padid
    return url

def padpath(padid, pub_path=u"", group_path=u"", normalize=False):
    """Map a pad id to a relative file path.

    The pad id is split with ``splitpadname``; the name part is URL-quoted
    with ``quote_plus`` so it is usable as a single path component.

    :param padid: pad id, optionally carrying a ``g.<group>$`` prefix.
    :param pub_path: directory prefix used for pads without a group.
    :param group_path: directory prefix used for group pads; the group id
        becomes a sub-directory below it.
    :param normalize: when true, spaces in the pad name become ``_`` and the
        characters ``( ) ? '`` are dropped before quoting.
    :return: ``os.path.join(group_path, group, name)`` for group pads,
        ``os.path.join(pub_path, name)`` otherwise.
    """
    g, p = splitpadname(padid)
    if normalize:
        # This has to happen BEFORE quoting: quote_plus() rewrites exactly
        # these characters (" " -> "+", "(" -> "%28", ...), so replacing
        # them afterwards can never match and the flag would do nothing.
        p = p.replace(" ", "_")
        for unwanted in "()?'":
            p = p.replace(unwanted, "")
    p = quote_plus(p)
    if g:
        return os.path.join(group_path, g, p)
    return os.path.join(pub_path, p)

def padpath2id(path):
    """Turn a file path back into a pad id (text string).

    The last path component is URL-unquoted with ``unquote_plus``.  If the
    path has a directory part, the final directory name is used as a prefix
    and the result is ``"<dirname>$<name>"``; otherwise just ``<name>``.

    Works on both Python 2 and Python 3: the previous implementation relied
    on ``unicode`` and ``str.decode``, which do not exist on Python 3.

    :param path: path as text or as UTF-8 encoded bytes.
    :return: the pad id as a text (unicode) string.
    """
    native_is_bytes = str is bytes  # True only on Python 2
    text_type = type(u"")

    if native_is_bytes:
        # Python 2: unquote on byte strings so %XX escapes are reassembled
        # into UTF-8 byte sequences, then decode once at the end.
        if isinstance(path, text_type):
            path = path.encode("utf-8")
    elif isinstance(path, bytes):
        # Python 3: unquote_plus() decodes %XX escapes as UTF-8 itself, so
        # work on text throughout.
        path = path.decode("utf-8")

    dd, p = os.path.split(path)
    gname = dd.split("/")[-1]
    p = unquote_plus(p)
    padid = "{0}${1}".format(gname, p) if gname else p

    if native_is_bytes:
        padid = padid.decode("utf-8")
    return padid

def getjson(url, max_retry=3, retry_sleep_time=3):
    """Fetch *url* and return the decoded JSON object as a dict.

    The returned dict holds the keys of the JSON response plus bookkeeping
    entries: ``_retries`` (number of failed attempts), ``_code`` (status
    reported by the response, or by the last ``HTTPError``) and, when the
    final URL differs from the requested one, ``_url``.

    A URL that ``urlopen`` rejects with ``ValueError`` (e.g. a partial URL
    without a scheme) is retried once with ``http://localhost`` prepended.

    :param url: absolute URL, or a partial one to be served from localhost.
    :param max_retry: number of retries after an ``HTTPError``.
    :param retry_sleep_time: seconds to sleep after each ``HTTPError``;
        a falsy value disables sleeping.
    :return: dict as described above.  If every attempt failed with an
        ``HTTPError`` it only contains ``_retries`` and ``_code``.
    :raises ValueError: if the response body is not valid UTF-8 / JSON, or
        if the URL is still rejected after the localhost prefix was added.
    """
    ret = {"_retries": 0}
    prefixed = False
    while ret["_retries"] <= max_retry:
        try:
            f = urlopen(url)
        except HTTPError as e:
            print("HTTPError {0}".format(e), file=sys.stderr)
            ret["_code"] = e.code
            ret["_retries"] += 1
            if retry_sleep_time:
                sleep(retry_sleep_time)
            continue
        except ValueError:
            # Only the URL itself is handled here.  Previously this handler
            # also caught JSON/UTF-8 decode errors (both ValueError
            # subclasses) and then mangled a perfectly good URL.  Apply the
            # prefix at most once so a bad URL cannot loop forever.
            if prefixed:
                raise
            url = "http://localhost" + url
            prefixed = True
            continue

        # Always release the response, even if reading fails.
        try:
            data = f.read().decode("utf-8")
            rurl = f.geturl()
            code = f.getcode()
        finally:
            f.close()

        ret.update(json.loads(data))
        ret["_code"] = code
        if rurl != url:
            ret["_url"] = rurl
        return ret
    return ret

def loadpadinfo(p):
    """Read the JSON document stored at path *p* and return the parsed value."""
    with open(p) as handle:
        return json.load(handle)

def progressbar(i, num, label="", file=sys.stderr):
    """Draw a 20-character text progress bar on a single line.

    The line starts with a carriage return so successive calls overwrite
    each other, e.g. ``**********---------- 3/4 label... ``.

    :param i: zero-based index of the current item (displayed as ``i + 1``).
    :param num: total number of items; ``0`` yields an empty bar instead of
        raising ``ZeroDivisionError``.
    :param label: text shown after the counter.
    :param file: stream to write to (previously accepted but ignored; the
        output always went to ``sys.stderr``).
    """
    width = 20
    fraction = float(i) / num if num else 0.0
    filled = int(ceil(fraction * width))
    bar = ("*" * filled) + ("-" * (width - filled))
    msg = u"\r{0} {1}/{2} {3}... ".format(bar, (i + 1), num, label)
    file.write(msg)
    file.flush()


# Python developer Fredrik Lundh (author of elementtree, among other things) has such a function on his website, which works with decimal, hex and named entities:
##
# Removes HTML or XML character references and entities from a text string.
#
# @param text The HTML (or XML) source text.
# @return The plain text, as a Unicode string, if necessary.
def unescape(text):
    """Replace HTML/XML character references and named entities in *text*.

    Handles decimal (``&#65;``), hexadecimal (``&#x41;`` / ``&#X41;``) and
    named (``&amp;``) forms.  Anything that cannot be resolved (unknown
    name, malformed or out-of-range number) is left in the text as is.

    :param text: the HTML (or XML) source text.
    :return: the text with all resolvable references replaced.
    """
    # ``unichr`` only exists on Python 2; on Python 3 ``chr`` does the same.
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    def fixup(m):
        entity = m.group(0)
        if entity[:2] == "&#":
            # numeric character reference
            try:
                if entity[:3].lower() == "&#x":
                    return to_char(int(entity[3:-1], 16))
                return to_char(int(entity[2:-1]))
            except (ValueError, OverflowError):
                # not a number, or not a valid code point
                pass
        else:
            # named entity
            try:
                return to_char(name2codepoint[entity[1:-1]])
            except KeyError:
                pass
        return entity  # leave as is

    return re.sub(r"&#?\w+;", fixup, text)
new pull, new meta style from live constant etherdumpÄ 9 years ago			`from __future__ import print_function`
changes 9 years ago			`import re, os, json, sys`
			`from math import ceil, floor`
new pull, new meta style from live constant etherdumpÄ 9 years ago			`from time import sleep`

python3 7 years ago			`try:`
			`# python2`
			`from urlparse import urlparse, urlunparse`
			`from urllib2 import urlopen, URLError, HTTPError`
			`from urllib import urlencode`
			`from urllib import quote_plus, unquote_plus`
			`from htmlentitydefs import name2codepoint`

			`input = raw_input`
			`except ImportError:`
			`# python3`
			`from urllib.parse import urlparse, urlunparse, urlencode, quote_plus, unquote_plus`
			`from urllib.request import urlopen, URLError, HTTPError`
			`from html.entities import name2codepoint`
make file friendliness 9 years ago
			`groupnamepat = re.compile(r"^g\.(\w+)\$")`
			`def splitpadname (padid):`
			`m = groupnamepat.match(padid)`
			`if m:`
			`return(m.group(1), padid[m.end():])`
			`else:`
			`return (u"", padid)`

new 9 years ago			`def padurl (padid, ):`
			`return padid`

python3 7 years ago			`def padpath (padid, pub_path=u"", group_path=u"", normalize=False):`
make file friendliness 9 years ago			`g, p = splitpadname(padid)`
python3 7 years ago			`# if type(g) == unicode:`
			`# g = g.encode("utf-8")`
			`# if type(p) == unicode:`
			`# p = p.encode("utf-8")`
make file friendliness 9 years ago			`p = quote_plus(p)`
python3 7 years ago			`if normalize:`
			`p = p.replace(" ", "_")`
			`p = p.replace("(", "")`
			`p = p.replace(")", "")`
			`p = p.replace("?", "")`
			`p = p.replace("'", "")`
make file friendliness 9 years ago			`if g:`
			`return os.path.join(group_path, g, p)`
			`else:`
			`return os.path.join(pub_path, p)`
changes 9 years ago
			`def padpath2id (path):`
			`if type(path) == unicode:`
			`path = path.encode("utf-8")`
			`dd, p = os.path.split(path)`
			`gname = dd.split("/")[-1]`
			`p = unquote_plus(p)`
			`if gname:`
			`return "{0}${1}".format(gname, p).decode("utf-8")`
			`else:`
			`return p.decode("utf-8")`

upped sleep time on 502 in getjson to 3 sec, trying to prevent cascading fail 7 years ago			`def getjson (url, max_retry=3, retry_sleep_time=3):`
new pull, new meta style from live constant etherdumpÄ 9 years ago			`ret = {}`
			`ret["_retries"] = 0`
			`while ret["_retries"] <= max_retry:`
			`try:`
			`f = urlopen(url)`
			`data = f.read()`
python3 7 years ago			`data = data.decode("utf-8")`
new pull, new meta style from live constant etherdumpÄ 9 years ago			`rurl = f.geturl()`
			`f.close()`
			`ret.update(json.loads(data))`
			`ret["_code"] = f.getcode()`
			`if rurl != url:`
			`ret["_url"] = rurl`
			`return ret`
support partial url 9 years ago			`except ValueError as e:`
			`url = "http://localhost" + url`
new pull, new meta style from live constant etherdumpÄ 9 years ago			`except HTTPError as e:`
			`print ("HTTPError {0}".format(e), file=sys.stderr)`
			`ret["_code"] = e.code`
			`ret["_retries"]+=1`
			`if retry_sleep_time:`
			`sleep(retry_sleep_time)`
			`return ret`
changes 9 years ago
			`def loadpadinfo(p):`
			`with open(p) as f:`
			`info = json.load(f)`
			`return info`

			`def progressbar (i, num, label="", file=sys.stderr):`
			`p = float(i) / num`
			`percentage = int(floor(p*100))`
			`bars = int(ceil(p*20))`
			`bar = ("*"*bars) + ("-"*(20-bars))`
			`msg = u"\r{0} {1}/{2} {3}... ".format(bar, (i+1), num, label)`
python3 7 years ago			`sys.stderr.write(msg)`
changes 9 years ago			`sys.stderr.flush()`
fixed vital encoding error in pad urls 9 years ago


			`# Python developer Fredrik Lundh (author of elementtree, among other things) has such a function on his website, which works with decimal, hex and named entities:`
			`##`
			`# Removes HTML or XML character references and entities from a text string.`
			`#`
			`# @param text The HTML (or XML) source text.`
			`# @return The plain text, as a Unicode string, if necessary.`
			`def unescape(text):`
			`def fixup(m):`
			`text = m.group(0)`
			`if text[:2] == "&#":`
			`# character reference`
			`try:`
			`if text[:3] == "&#x":`
			`return unichr(int(text[3:-1], 16))`
			`else:`
			`return unichr(int(text[2:-1]))`
			`except ValueError:`
			`pass`
			`else:`
			`# named entity`
			`try:`
python3 7 years ago			`text = unichr(name2codepoint[text[1:-1]])`
fixed vital encoding error in pad urls 9 years ago			`except KeyError:`
			`pass`
			`return text # leave as is`
			`return re.sub("&#?\w+;", fixup, text)`