etherdump post-pipelines version

parent c4e3009285
commit d89c5dbd3c

Changed file: etherdump
@@ -10,13 +10,87 @@ from datetime import datetime
 from xml.etree import cElementTree as ET
 from urllib import urlencode
 from urllib2 import urlopen, HTTPError, URLError
-# local mods
-from et_helpers import trim_removed_spans, contents, set_text_contents, text_contents
-from linkify import linkify, urlify, filename_to_padid
 # external dependencies (use pip to install these)
 import html5lib, jinja2
+
+
+def filename_to_padid (t):
+    t = t.replace("_", " ")
+    t = re.sub(r"\.html$", "", t)
+    return t
+
+def normalize_pad_name (n):
+    if '?' in n:
+        n = n.split('?', 1)[0]
+    if '/' in n:
+        n = n.split('/', 1)[0]
+    return n
+
+def urlify (t, ext=".html"):
+    return t.replace(" ", "_") + ext
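Note: urlify and filename_to_padid act as near-inverses, mapping a pad name to
a web-safe filename and back. A minimal round-trip sketch, assuming the two
functions above are in scope (filename_to_padid also needs re, imported earlier
in the file):

    assert urlify("my pad") == "my_pad.html"            # spaces -> underscores, extension appended
    assert filename_to_padid("my_pad.html") == "my pad" # underscores -> spaces, ".html" stripped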
+
+def linkify (src, urlify=urlify):
+
+    collect = []
+
+    def s (m):
+        contents = strip_tags(m.group(1))
+        contents = normalize_pad_name(contents)
+        collect.append(contents)
+        link = urlify(contents)
+        # link = link.split("?", 1)[0]
+        return "[[<a class=\"wikilink\" href=\"{0}\">{1}</a>]]".format(link, contents)
+
+    # src = re.sub(r"\[\[([\w_\- ,]+?)\]\]", s, src)
+    ## question marks are ignored by etherpad, so split/strip it
+    ## strip slashes as well!! (/timeslider)
+    src = re.sub(r"\[\[(.+?)\]\]", s, src)
+    return (src, collect)
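Note: linkify rewrites wiki-style [[...]] references as wikilink anchors and
collects the normalized pad names so the caller can queue them for dumping.
A small usage sketch, assuming the definitions above:

    src, pads = linkify("see [[notes/timeslider]] and [[plans?rev=2]]")
    # slash and query-string suffixes are stripped by normalize_pad_name,
    # so pads == ["notes", "plans"] and the links point at notes.html / plans.html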
+
+def strip_tags (text):
+    return re.sub(r"<.*?>", "", text)
+
+def set_text_contents (element, text):
+    """ ok this isn't really general, but works for singly wrapped elements """
+    while len(element) == 1:
+        element = element[0]
+    element.text = text
+
+def text_contents (element):
+    return (element.text or '') + ''.join([text_contents(c) for c in element]) + (element.tail or '')
+
+def contents (element, method="html"):
+    return (element.text or '') + ''.join([ET.tostring(c, method=method) for c in element])
+
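Note: text_contents flattens an element to plain text (recursively, tail text
included), while contents re-serializes the child markup. A quick sketch,
assuming these helpers and the ET import above:

    el = ET.fromstring("<p>a<b>b</b>c</p>")
    text_contents(el)  # -> "abc"
    contents(el)       # -> "a<b>b</b>c"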
+def get_parent(tree, elt):
+    for parent in tree.iter():
+        for child in parent:
+            if child == elt:
+                return parent
+
+def remove_recursive (tree, elt):
+    """ Remove element and (any resulting) empty containing elements """
+    p = get_parent(tree, elt)
+    if p:
+        p.remove(elt)
+        if len(p) == 0 and (p.text == None or p.text.strip() == ""):
+            # print ("empty parent", p, file=sys.stderr)
+            remove_recursive(tree, p)
+
+
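Note: remove_recursive prunes upward, deleting any container the removal has
left empty. A small sketch, assuming the two functions above:

    t = ET.fromstring("<body><div><span>x</span></div><p>kept</p></body>")
    remove_recursive(t, t.find(".//span"))
    # the span goes, and so does its now-empty <div>; <p>kept</p> survives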
+def trim_removed_spans (t):
+    # remove <span class="removed"> and empty parents
+    for n in t.findall(".//span[@class='removed']"):
+        remove_recursive(t, n)
+    # then strip any leading br's from body
+    while True:
+        tag = t.find("./body")[0]
+        if tag.tag == "br":
+            remove_recursive(t, tag)
+        else:
+            break
+
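Note: the while loop indexes t.find("./body")[0] unconditionally, so it would
raise IndexError on an empty body (or TypeError if the body itself got pruned).
A more defensive variant with the same intent might look like:

    while True:
        body = t.find("./body")
        if body is None or len(body) == 0:
            break
        if body[0].tag == "br":
            remove_recursive(t, body[0])
        else:
            break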
 def get_template_env (tpath=None):
     paths = []
     if tpath and os.path.isdir(tpath):
@@ -114,7 +188,13 @@ while len(todo) > 0:
 # | | | | | |  __/ || (_| |
 # |_| |_| |_|\___|\__\__,_|
 
-    meta_out = "{0}/{1}".format(args.path, urlify(padid, ext=".json"))
+    meta_url = urlify(padid, ext=".json")
+    meta_out = "{0}/{1}".format(args.path, meta_url.encode("utf-8"))
+    raw_url = urlify(padid, ext=".txt")
+    raw_out = "{0}/{1}".format(args.path, raw_url.encode("utf-8"))
+    colors_url = urlify(padid, ext=".html")
+    colors_out = "{0}/{1}".format(args.path, colors_url.encode("utf-8"))
+
     if not args.hidepaths:
         print (meta_out, file=sys.stderr)
     if not args.pretend:
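Note: each artifact is now split into a *_url (the bare, web-relative name that
goes into the meta JSON and the templates) and a *_out (the on-disk path under
args.path); the .encode("utf-8") matters under Python 2, where padid may be a
unicode string. Illustratively (values hypothetical):

    # padid = u"my pad", args.path = "site"
    # meta_url == "my_pad.json"       -> recorded in meta / passed to templates
    # meta_out == "site/my_pad.json"  -> the file actually written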
@@ -137,7 +217,9 @@ while len(todo) > 0:
     if args.showurls:
         print (authors_url, file=sys.stderr)
     meta['author_ids'] = json.load(urlopen(authors_url))['data']['authorIDs']
+    meta['colors'] = colors_url
+    meta['raw'] = raw_url
+    meta['meta'] = meta_url
     with open(meta_out, "w") as f:
         json.dump(meta, f)
 
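Note: recording colors, raw and meta alongside author_ids makes each pad's
JSON self-describing: it points at its own sibling artifacts. For a pad named
"notes" (hypothetical), the dumped meta would contain, among its other keys:

    {"colors": "notes.html", "raw": "notes.txt", "meta": "notes.json", ...}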
@@ -146,7 +228,6 @@ while len(todo) > 0:
 # | |  | (_| |\ V  V /
 # |_|   \__,_| \_/\_/
 
-    raw_out = "{0}/{1}".format(args.path, urlify(padid, ext=".txt"))
     if not args.hidepaths:
         print (raw_out, file=sys.stderr)
     text_url = apiurl+"getText?"+urlencode(data)
@@ -171,7 +252,6 @@ while len(todo) > 0:
 # | (_| (_) | | (_) | |  \__ \
 #  \___\___/|_|\___/|_|  |___/
 
-    colors_out = "{0}/{1}".format(args.path, urlify(padid, ext=".html"))
     if not args.hidepaths:
         print (colors_out, file=sys.stderr)
     data['startRev'] = "0"
@@ -272,7 +352,11 @@ while len(todo) > 0:
         style = style,
         revision = meta['total_revisions'],
         padid = padid,
-        timestamp = datetime.now()
+        timestamp = datetime.now(),
+        meta_url = meta_url,
+        raw_url = raw_url,
+        colors_url = colors_url,
+        lastedited = meta['lastedited']
     ).encode("utf-8"))
 
 # _
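Note: the render call now also receives meta_url, raw_url, colors_url and
lastedited, so templates can cross-link a pad's artifacts. A hypothetical
jinja2 sketch of how a template might use them (values made up):

    import jinja2
    tpl = jinja2.Template(u'<a href="{{ raw_url }}">raw</a> / '
                          u'<a href="{{ colors_url }}">colors</a> / '
                          u'<a href="{{ meta_url }}">meta</a> '
                          u'(rev {{ revision }}, last edited {{ lastedited }})')
    print (tpl.render(raw_url="notes.txt", colors_url="notes.html",
                      meta_url="notes.json", revision=23, lastedited="2015-01-01"))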