etherdump post-pipelines version

Michael Murtaugh 2015-09-17 11:34:34 +02:00
parent c4e3009285
commit d89c5dbd3c

etherdump

@@ -10,13 +10,87 @@ from datetime import datetime
 from xml.etree import cElementTree as ET
 from urllib import urlencode
 from urllib2 import urlopen, HTTPError, URLError
-# local mods
-from et_helpers import trim_removed_spans, contents, set_text_contents, text_contents
-from linkify import linkify, urlify, filename_to_padid
 # external dependencies (use pip to install these)
 import html5lib, jinja2
+
+def filename_to_padid (t):
+    t = t.replace("_", " ")
+    t = re.sub(r"\.html$", "", t)
+    return t
+
+def normalize_pad_name (n):
+    if '?' in n:
+        n = n.split('?', 1)[0]
+    if '/' in n:
+        n = n.split('/', 1)[0]
+    return n
+
+def urlify (t, ext=".html"):
+    return t.replace(" ", "_") + ext
+
+def linkify (src, urlify=urlify):
+    collect = []
+    def s (m):
+        contents = strip_tags(m.group(1))
+        contents = normalize_pad_name(contents)
+        collect.append(contents)
+        link = urlify(contents)
+        # link = link.split("?", 1)[0]
+        return "[[<a class=\"wikilink\" href=\"{0}\">{1}</a>]]".format(link, contents)
+    # src = re.sub(r"\[\[([\w_\- ,]+?)\]\]", s, src)
+    ## question marks are ignored by etherpad, so split/strip it
+    ## strip slashes as well!! (/timeslider)
+    src = re.sub(r"\[\[(.+?)\]\]", s, src)
+    return (src, collect)
+
+def strip_tags (text):
+    return re.sub(r"<.*?>", "", text)
+
+def set_text_contents (element, text):
+    """ ok this isn't really general, but works for singly wrapped elements """
+    while len(element) == 1:
+        element = element[0]
+    element.text = text
+
+def text_contents (element):
+    return (element.text or '') + ''.join([text_contents(c) for c in element]) + (element.tail or '')
+
+def contents (element, method="html"):
+    return (element.text or '') + ''.join([ET.tostring(c, method=method) for c in element])
+
+def get_parent (tree, elt):
+    for parent in tree.iter():
+        for child in parent:
+            if child == elt:
+                return parent
+
+def remove_recursive (tree, elt):
+    """ Remove element and (any resulting) empty containing elements """
+    p = get_parent(tree, elt)
+    if p:
+        p.remove(elt)
+        if len(p) == 0 and (p.text == None or p.text.strip() == ""):
+            # print ("empty parent", p, file=sys.stderr)
+            remove_recursive(tree, p)
+
+def trim_removed_spans (t):
+    # remove <span class="removed"> and empty parents
+    for n in t.findall(".//span[@class='removed']"):
+        remove_recursive(t, n)
+    # then strip any leading br's from body
+    while True:
+        tag = t.find("./body")[0]
+        if tag.tag == "br":
+            remove_recursive(t, tag)
+        else:
+            break
+
 def get_template_env (tpath=None):
     paths = []
     if tpath and os.path.isdir(tpath):
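A quick sketch (not from the commit) of what the inlined wikilink helpers do, using hypothetical pad names:

src = "see [[plenary notes?]] and [[meeting/timeslider]]"
html, linked = linkify(src)
# each [[name]] becomes [[<a class="wikilink" href="plenary_notes.html">plenary notes</a>]]
# linked == ["plenary notes", "meeting"]  (the '?' suffix and /subpath are stripped)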
@@ -114,7 +188,13 @@ while len(todo) > 0:
     # | | | | | | __/ || (_| |
     # |_| |_| |_|\___|\__\__,_|
-    meta_out = "{0}/{1}".format(args.path, urlify(padid, ext=".json"))
+    meta_url = urlify(padid, ext=".json")
+    meta_out = "{0}/{1}".format(args.path, meta_url.encode("utf-8"))
+    raw_url = urlify(padid, ext=".txt")
+    raw_out = "{0}/{1}".format(args.path, raw_url.encode("utf-8"))
+    colors_url = urlify(padid, ext=".html")
+    colors_out = "{0}/{1}".format(args.path, colors_url.encode("utf-8"))
     if not args.hidepaths:
         print (meta_out, file=sys.stderr)
     if not args.pretend:
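Sketch (assumption: padid as used in the loop above) of the shared naming scheme the three outputs now follow, for a hypothetical pad "kitchen notes":

padid = u"kitchen notes"
print (urlify(padid, ext=".json"))  # kitchen_notes.json, written under args.path
print (urlify(padid, ext=".txt"))   # kitchen_notes.txt
print (urlify(padid, ext=".html"))  # kitchen_notes.html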
@@ -137,7 +217,9 @@ while len(todo) > 0:
     if args.showurls:
         print (authors_url, file=sys.stderr)
     meta['author_ids'] = json.load(urlopen(authors_url))['data']['authorIDs']
+    meta['colors'] = colors_url
+    meta['raw'] = raw_url
+    meta['meta'] = meta_url
     with open(meta_out, "w") as f:
         json.dump(meta, f)
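Sketch, not part of the commit: since the metadata now records its sibling outputs, a downstream step can find a pad's text and colors files from the .json alone (file name hypothetical):

import json
with open("kitchen_notes.json") as f:
    meta = json.load(f)
print (meta["raw"], meta["colors"], meta["meta"])
# kitchen_notes.txt kitchen_notes.html kitchen_notes.json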
@@ -146,7 +228,6 @@ while len(todo) > 0:
     # | | | (_| |\ V V /
     # |_| \__,_| \_/\_/
-    raw_out = "{0}/{1}".format(args.path, urlify(padid, ext=".txt"))
     if not args.hidepaths:
         print (raw_out, file=sys.stderr)
     text_url = apiurl+"getText?"+urlencode(data)
@@ -171,7 +252,6 @@ while len(todo) > 0:
     # | (_| (_) | | (_) | | \__ \
     # \___\___/|_|\___/|_| |___/
-    colors_out = "{0}/{1}".format(args.path, urlify(padid, ext=".html"))
     if not args.hidepaths:
         print (colors_out, file=sys.stderr)
     data['startRev'] = "0"
@ -272,7 +352,11 @@ while len(todo) > 0:
style = style, style = style,
revision = meta['total_revisions'], revision = meta['total_revisions'],
padid = padid, padid = padid,
timestamp = datetime.now() timestamp = datetime.now(),
meta_url = meta_url,
raw_url = raw_url,
colors_url = colors_url,
lastedited = meta['lastedited']
).encode("utf-8")) ).encode("utf-8"))
# _ # _
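Sketch, not part of the commit: the extra render variables let a template link a pad's variants directly. The template text here is hypothetical; only the variable names match those passed above.

import jinja2
tpl = jinja2.Template(u'<a href="{{ raw_url }}">text</a> <a href="{{ colors_url }}">colors</a> '
                      u'<a href="{{ meta_url }}">meta</a> (rev {{ revision }}, last edited {{ lastedited }})')
print (tpl.render(raw_url="kitchen_notes.txt", colors_url="kitchen_notes.html",
                  meta_url="kitchen_notes.json", revision=5, lastedited="2015-09-17"))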