Pumping pads as files into publishing frameworks!
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 

128 lines
3.7 KiB

from __future__ import print_function
import re, os, json, sys
from math import ceil, floor
from time import sleep
try:
# python2
from urlparse import urlparse, urlunparse
from urllib2 import urlopen, URLError, HTTPError
from urllib import urlencode
from urllib import quote_plus, unquote_plus
from htmlentitydefs import name2codepoint
input = raw_input
except ImportError:
# python3
from urllib.parse import urlparse, urlunparse, urlencode, quote_plus, unquote_plus
from urllib.request import urlopen, URLError, HTTPError
from html.entities import name2codepoint
# Matches a leading "g.<name>$" prefix and captures <name>.
groupnamepat = re.compile(r"^g\.(\w+)\$")


def splitpadname(padid):
    """Split a pad id into a ``(group, name)`` pair.

    When *padid* starts with a ``g.<group>$`` prefix, the captured group
    name and the remainder of the id after the ``$`` are returned.
    Otherwise the group part is the empty string and the id is returned
    unchanged as the name.
    """
    found = groupnamepat.match(padid)
    if found is None:
        return (u"", padid)
    prefix_len = found.end()
    return (found.group(1), padid[prefix_len:])
def padurl(padid):
    """Return the URL component for *padid*; currently the id itself, unchanged."""
    return padid
def padpath(padid, pub_path=u"", group_path=u"", normalize=False):
    """Build the relative file path for a pad id.

    The id is split with ``splitpadname``. The pad name is URL-quoted with
    ``quote_plus`` and joined onto ``group_path/<group>`` when the id has a
    group part, or onto ``pub_path`` otherwise.

    :param padid: pad id, optionally carrying a ``g.<group>$`` prefix.
    :param pub_path: directory prefix used for pads without a group.
    :param group_path: directory prefix used for group pads.
    :param normalize: when true, replace spaces with ``_`` and drop the
        characters ``( ) ? '`` from the pad name before quoting.
    :return: the joined path string.
    """
    g, p = splitpadname(padid)
    if normalize:
        # This has to run BEFORE quote_plus(): once quoted, a space is "+"
        # and "(", ")", "?" and "'" are %XX escapes, so replacing them
        # afterwards (as this function used to do) never matched anything.
        p = p.replace(" ", "_")
        for unwanted in "()?'":
            p = p.replace(unwanted, "")
    p = quote_plus(p)
    if g:
        return os.path.join(group_path, g, p)
    return os.path.join(pub_path, p)
def padpath2id(path):
    """Turn a pad file path back into a pad id (text string).

    The last path component is URL-unquoted with ``unquote_plus``. If the
    path has a parent directory, the name of the innermost directory
    (split on ``/``) is used as a prefix, giving ``<dir>$<name>``; otherwise
    only the unquoted name is returned.

    :param path: path as text or UTF-8 encoded bytes.
    :return: the pad id as a text (unicode) string.
    """
    # Work on the interpreter's native ``str`` so unquote_plus() behaves:
    # on Python 2 that means UTF-8 bytes, on Python 3 it means text. The
    # previous code referenced ``unicode`` and ``str.decode``, which do not
    # exist on Python 3.
    if not isinstance(path, str):
        if str is bytes:  # Python 2: got unicode
            path = path.encode("utf-8")
        else:  # Python 3: got bytes
            path = path.decode("utf-8")
    dirname, filename = os.path.split(path)
    gname = dirname.split("/")[-1]
    name = unquote_plus(filename)
    padid = "{0}${1}".format(gname, name) if gname else name
    # Only Python 2 native strings are bytes and still need decoding.
    if isinstance(padid, bytes):
        padid = padid.decode("utf-8")
    return padid
def getjson(url, max_retry=3, retry_sleep_time=3):
    """Fetch *url* and return its JSON object merged into a result dict.

    The returned dict contains the decoded JSON members plus bookkeeping
    keys: ``_retries`` (number of failed attempts), ``_code`` (status code
    of the response, or of the last ``HTTPError``) and, when the final URL
    differs from the requested one, ``_url``.

    A URL that ``urlopen`` rejects with ``ValueError`` (e.g. one without a
    scheme) is retried once with ``http://localhost`` prepended. An
    ``HTTPError`` is reported on stderr and retried up to *max_retry*
    times, sleeping *retry_sleep_time* seconds between attempts; when all
    attempts fail, the dict is returned without any JSON members.

    :param url: absolute URL, or a path to be served from localhost.
    :param max_retry: number of retries after the first failed attempt.
    :param retry_sleep_time: seconds to wait between attempts (0 = none).
    :raises ValueError: if the response body is not valid UTF-8 JSON, or
        the URL is still rejected after the localhost prefix was added.
    """
    ret = {"_retries": 0}
    localhost_tried = False
    while ret["_retries"] <= max_retry:
        # Keep the try body down to urlopen() so that a JSON/UTF-8 decoding
        # ValueError is never mistaken for "URL has no scheme" (which used
        # to mangle an already absolute URL).
        try:
            f = urlopen(url)
        except ValueError:
            if localhost_tried:
                raise
            url = "http://localhost" + url
            localhost_tried = True
            continue
        except HTTPError as e:
            print("HTTPError {0}".format(e), file=sys.stderr)
            ret["_code"] = e.code
            ret["_retries"] += 1
            # No point in waiting when there is no attempt left.
            if retry_sleep_time and ret["_retries"] <= max_retry:
                sleep(retry_sleep_time)
            continue
        # try/finally rather than ``with``: Python 2 responses are not
        # context managers.
        try:
            data = f.read().decode("utf-8")
            rurl = f.geturl()
            code = f.getcode()
        finally:
            f.close()
        ret.update(json.loads(data))
        ret["_code"] = code
        if rurl != url:
            ret["_url"] = rurl
        return ret
    return ret
def loadpadinfo(p):
    """Load and return the JSON document stored in the file at path *p*.

    The file is always decoded as UTF-8 instead of the locale's default
    encoding, so non-ASCII content loads the same way on every platform.

    :param p: path of the JSON file.
    :return: the decoded JSON value.
    """
    # io.open accepts ``encoding`` on both Python 2 and Python 3.
    import io

    with io.open(p, encoding="utf-8") as f:
        return json.load(f)
def progressbar(i, num, label="", file=sys.stderr):
    """Write a 20-character text progress bar for item *i* of *num*.

    The line starts with a carriage return so successive calls overwrite
    each other. The counter is shown one-based (``i + 1``), while the bar
    itself is filled according to ``i / num``.

    :param i: zero-based index of the current item.
    :param num: total number of items; 0 draws an empty bar.
    :param label: text shown after the counter.
    :param file: stream to write to (previously ignored in favour of
        ``sys.stderr``).
    """
    width = 20
    # Guard against num == 0, which used to raise ZeroDivisionError.
    fraction = float(i) / num if num else 0.0
    # Clamp so an out-of-range index cannot change the bar's width.
    bars = min(width, max(0, int(ceil(fraction * width))))
    bar = ("*" * bars) + ("-" * (width - bars))
    msg = u"\r{0} {1}/{2} {3}... ".format(bar, (i + 1), num, label)
    file.write(msg)
    file.flush()
# Python developer Fredrik Lundh (author of elementtree, among other things)
# has such a function on his website, which works with decimal, hex and named
# entities.

# unichr() only exists on Python 2; on Python 3 chr() already returns text.
try:
    _unichr = unichr
except NameError:
    _unichr = chr


def unescape(text):
    """Remove HTML or XML character references and entities from *text*.

    Decimal (``&#65;``) and hexadecimal (``&#x41;`` / ``&#X41;``) character
    references and named entities listed in ``name2codepoint`` are replaced
    by the character they stand for. Anything that cannot be resolved is
    left in the text as is.

    :param text: the HTML (or XML) source text.
    :return: the plain text, as a Unicode string where necessary.
    """
    def fixup(m):
        ref = m.group(0)
        if ref[:2] == "&#":
            # character reference
            try:
                if ref[2:3] in ("x", "X"):
                    return _unichr(int(ref[3:-1], 16))
                return _unichr(int(ref[2:-1]))
            except (ValueError, OverflowError):
                # not a number, or not a valid code point
                pass
        else:
            # named entity
            try:
                return _unichr(name2codepoint[ref[1:-1]])
            except KeyError:
                pass
        return ref  # leave as is

    return re.sub(r"&#?\w+;", fixup, text)