etherdump post-pipelines version

Michael Murtaugh 2015-09-17 11:34:34 +02:00
parent c4e3009285
commit d89c5dbd3c

etherdump

@@ -10,13 +10,87 @@ from datetime import datetime
 from xml.etree import cElementTree as ET
 from urllib import urlencode
 from urllib2 import urlopen, HTTPError, URLError
-# local mods
-from et_helpers import trim_removed_spans, contents, set_text_contents, text_contents
-from linkify import linkify, urlify, filename_to_padid
 # external dependencies (use pip to install these)
 import html5lib, jinja2
+
+def filename_to_padid (t):
+    t = t.replace("_", " ")
+    t = re.sub(r"\.html$", "", t)
+    return t
+
+def normalize_pad_name (n):
+    if '?' in n:
+        n = n.split('?', 1)[0]
+    if '/' in n:
+        n = n.split('/', 1)[0]
+    return n
+
+def urlify (t, ext=".html"):
+    return t.replace(" ", "_") + ext
+
+def linkify (src, urlify=urlify):
+    collect = []
+    def s (m):
+        contents = strip_tags(m.group(1))
+        contents = normalize_pad_name(contents)
+        collect.append(contents)
+        link = urlify(contents)
+        # link = link.split("?", 1)[0]
+        return "[[<a class=\"wikilink\" href=\"{0}\">{1}</a>]]".format(link, contents)
+    # src = re.sub(r"\[\[([\w_\- ,]+?)\]\]", s, src)
+    ## question marks are ignored by etherpad, so split/strip it
+    ## strip slashes as well!! (/timeslider)
+    src = re.sub(r"\[\[(.+?)\]\]", s, src)
+    return (src, collect)
+
+def strip_tags (text):
+    return re.sub(r"<.*?>", "", text)
+
+def set_text_contents (element, text):
+    """ ok this isn't really general, but works for singly wrapped elements """
+    while len(element) == 1:
+        element = element[0]
+    element.text = text
+
+def text_contents (element):
+    return (element.text or '') + ''.join([text_contents(c) for c in element]) + (element.tail or '')
+
+def contents (element, method="html"):
+    return (element.text or '') + ''.join([ET.tostring(c, method=method) for c in element])
+
+def get_parent (tree, elt):
+    for parent in tree.iter():
+        for child in parent:
+            if child == elt:
+                return parent
+
+def remove_recursive (tree, elt):
+    """ Remove element and (any resulting) empty containing elements """
+    p = get_parent(tree, elt)
+    if p:
+        p.remove(elt)
+        if len(p) == 0 and (p.text == None or p.text.strip() == ""):
+            # print ("empty parent", p, file=sys.stderr)
+            remove_recursive(tree, p)
+
+def trim_removed_spans (t):
+    # remove <span class="removed"> and empty parents
+    for n in t.findall(".//span[@class='removed']"):
+        remove_recursive(t, n)
+    # then strip any leading br's from body
+    while True:
+        tag = t.find("./body")[0]
+        if tag.tag == "br":
+            remove_recursive(t, tag)
+        else:
+            break
+
 def get_template_env (tpath=None):
     paths = []
     if tpath and os.path.isdir(tpath):
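A quick sketch (not from the commit) of what the inlined wikilink helpers do, using hypothetical pad names:

src = "see [[plenary notes?]] and [[meeting/timeslider]]"
html, linked = linkify(src)
# each [[name]] becomes [[<a class="wikilink" href="plenary_notes.html">plenary notes</a>]]
# linked == ["plenary notes", "meeting"]  (the '?' suffix and /subpath are stripped)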
@@ -114,7 +188,13 @@ while len(todo) > 0:
     # | | | | | | __/ || (_| |
     # |_| |_| |_|\___|\__\__,_|
-    meta_out = "{0}/{1}".format(args.path, urlify(padid, ext=".json"))
+    meta_url = urlify(padid, ext=".json")
+    meta_out = "{0}/{1}".format(args.path, meta_url.encode("utf-8"))
+    raw_url = urlify(padid, ext=".txt")
+    raw_out = "{0}/{1}".format(args.path, raw_url.encode("utf-8"))
+    colors_url = urlify(padid, ext=".html")
+    colors_out = "{0}/{1}".format(args.path, colors_url.encode("utf-8"))
     if not args.hidepaths:
         print (meta_out, file=sys.stderr)
     if not args.pretend:
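Sketch (assumption: padid as used in the loop above) of the shared naming scheme the three outputs now follow, for a hypothetical pad "kitchen notes":

padid = u"kitchen notes"
print (urlify(padid, ext=".json"))  # kitchen_notes.json, written under args.path
print (urlify(padid, ext=".txt"))   # kitchen_notes.txt
print (urlify(padid, ext=".html"))  # kitchen_notes.html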
@@ -137,7 +217,9 @@ while len(todo) > 0:
     if args.showurls:
         print (authors_url, file=sys.stderr)
     meta['author_ids'] = json.load(urlopen(authors_url))['data']['authorIDs']
+    meta['colors'] = colors_url
+    meta['raw'] = raw_url
+    meta['meta'] = meta_url
     with open(meta_out, "w") as f:
         json.dump(meta, f)
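Sketch, not part of the commit: since the metadata now records its sibling outputs, a downstream step can find a pad's text and colors files from the .json alone (file name hypothetical):

import json
with open("kitchen_notes.json") as f:
    meta = json.load(f)
print (meta["raw"], meta["colors"], meta["meta"])
# kitchen_notes.txt kitchen_notes.html kitchen_notes.json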
@@ -146,7 +228,6 @@ while len(todo) > 0:
     # | | | (_| |\ V V /
     # |_| \__,_| \_/\_/
-    raw_out = "{0}/{1}".format(args.path, urlify(padid, ext=".txt"))
     if not args.hidepaths:
         print (raw_out, file=sys.stderr)
     text_url = apiurl+"getText?"+urlencode(data)
@@ -171,7 +252,6 @@ while len(todo) > 0:
     # | (_| (_) | | (_) | | \__ \
     # \___\___/|_|\___/|_| |___/
-    colors_out = "{0}/{1}".format(args.path, urlify(padid, ext=".html"))
     if not args.hidepaths:
         print (colors_out, file=sys.stderr)
     data['startRev'] = "0"
@ -272,7 +352,11 @@ while len(todo) > 0:
style = style, style = style,
revision = meta['total_revisions'], revision = meta['total_revisions'],
padid = padid, padid = padid,
timestamp = datetime.now() timestamp = datetime.now(),
meta_url = meta_url,
raw_url = raw_url,
colors_url = colors_url,
lastedited = meta['lastedited']
).encode("utf-8")) ).encode("utf-8"))
# _ # _
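Sketch, not part of the commit: the extra render variables let a template link a pad's variants directly. The template text here is hypothetical; only the variable names match those passed above.

import jinja2
tpl = jinja2.Template(u'<a href="{{ raw_url }}">text</a> <a href="{{ colors_url }}">colors</a> '
                      u'<a href="{{ meta_url }}">meta</a> (rev {{ revision }}, last edited {{ lastedited }})')
print (tpl.render(raw_url="kitchen_notes.txt", colors_url="kitchen_notes.html",
                  meta_url="kitchen_notes.json", revision=5, lastedited="2015-09-17"))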