@@ -10,13 +10,87 @@ from datetime import datetime
 from xml.etree import cElementTree as ET
 
 from urllib import urlencode
 
 from urllib2 import urlopen, HTTPError, URLError
 
-# local mods
-from et_helpers import trim_removed_spans, contents, set_text_contents, text_contents
-from linkify import linkify, urlify, filename_to_padid
-
 # external dependencies (use pip to install these)
 import html5lib, jinja2
 
+def filename_to_padid (t):
+    t = t.replace("_", " ")
+    t = re.sub(r"\.html$", "", t)
+    return t
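+# e.g. filename_to_padid("my_pad.html") -> "my pad"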
+
+def normalize_pad_name (n):
+    if '?' in n:
+        n = n.split('?', 1)[0]
+    if '/' in n:
+        n = n.split('/', 1)[0]
+    return n
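+# e.g. normalize_pad_name("pad/timeslider?rev=0") -> "pad"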
+
+def urlify (t, ext=".html"):
+    return t.replace(" ", "_") + ext
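+# e.g. urlify("pad name") -> "pad_name.html"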
+
+def linkify (src, urlify=urlify):
+    collect = []
+
+    def s (m):
+        contents = strip_tags(m.group(1))
+        contents = normalize_pad_name(contents)
+        collect.append(contents)
+        link = urlify(contents)
+        # link = link.split("?", 1)[0]
+        return "[[<a class=\"wikilink\" href=\"{0}\">{1}</a>]]".format(link, contents)
+
+    # src = re.sub(r"\[\[([\w_\- ,]+?)\]\]", s, src)
+    ## question marks are ignored by etherpad, so split/strip it
+    ## strip slashes as well!! (/timeslider)
+    src = re.sub(r"\[\[(.+?)\]\]", s, src)
+    return (src, collect)
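+# e.g. linkify("see [[pad name]]") ->
+#   ('see [[<a class="wikilink" href="pad_name.html">pad name</a>]]', ['pad name'])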
+
+def strip_tags (text):
+    return re.sub(r"<.*?>", "", text)
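+# e.g. strip_tags("<b>pad</b> name") -> "pad name"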
+
+def set_text_contents (element, text):
+    """ ok this isn't really general, but works for singly wrapped elements """
+    while len(element) == 1:
+        element = element[0]
+    element.text = text
+
+def text_contents (element):
+    # recursively collect text, including children's text and tails
+    return (element.text or '') + ''.join([text_contents(c) for c in element]) + (element.tail or '')
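+# e.g. on <p>a<b>b</b>c</p> this returns "abc"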
+
+def contents (element, method="html"):
+    # inner markup: leading text plus each child serialized (tails included)
+    return (element.text or '') + ''.join([ET.tostring(c, method=method) for c in element])
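+# e.g. on <p>a<b>b</b>c</p> this returns "a<b>b</b>c"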
+
+def get_parent(tree, elt):
+    for parent in tree.iter():
+        for child in parent:
+            if child == elt:
+                return parent
+
+def remove_recursive (tree, elt):
+    """ Remove element and (any resulting) empty containing elements """
+    p = get_parent(tree, elt)
+    if p is not None:
+        p.remove(elt)
+        if len(p) == 0 and (p.text is None or p.text.strip() == ""):
+            # print ("empty parent", p, file=sys.stderr)
+            remove_recursive(tree, p)
+
+def trim_removed_spans (t):
+    # remove <span class="removed"> and empty parents
+    for n in t.findall(".//span[@class='removed']"):
+        remove_recursive(t, n)
+    # then strip any leading br's from body
+    while True:
+        body = t.find("./body")
+        if body is None or len(body) == 0:
+            break
+        tag = body[0]
+        if tag.tag == "br":
+            remove_recursive(t, tag)
+        else:
+            break
+
+def get_template_env (tpath=None):
+    paths = []
+    if tpath and os.path.isdir(tpath):
@@ -114,7 +188,13 @@ while len(todo) > 0:
     # | | | | | |  __/ || (_| |
     # |_| |_| |_|\___|\__\__,_|
 
-    meta_out = "{0}/{1}".format(args.path, urlify(padid, ext=".json"))
+    meta_url = urlify(padid, ext=".json")
+    meta_out = "{0}/{1}".format(args.path, meta_url.encode("utf-8"))
+    raw_url = urlify(padid, ext=".txt")
+    raw_out = "{0}/{1}".format(args.path, raw_url.encode("utf-8"))
+    colors_url = urlify(padid, ext=".html")
+    colors_out = "{0}/{1}".format(args.path, colors_url.encode("utf-8"))
+
     if not args.hidepaths:
         print (meta_out, file=sys.stderr)
     if not args.pretend:
@@ -137,7 +217,9 @@ while len(todo) > 0:
         if args.showurls:
             print (authors_url, file=sys.stderr)
         meta['author_ids'] = json.load(urlopen(authors_url))['data']['authorIDs']
 
+        meta['colors'] = colors_url
+        meta['raw'] = raw_url
+        meta['meta'] = meta_url
         with open(meta_out, "w") as f:
             json.dump(meta, f)
@@ -146,7 +228,6 @@ while len(todo) > 0:
     # | | | (_| |\ V  V /
     # |_|  \__,_| \_/\_/
 
-    raw_out = "{0}/{1}".format(args.path, urlify(padid, ext=".txt"))
     if not args.hidepaths:
         print (raw_out, file=sys.stderr)
     text_url = apiurl+"getText?"+urlencode(data)
@@ -171,7 +252,6 @@ while len(todo) > 0:
     # | (_| (_) | | (_) | |  \__ \
     #  \___\___/|_|\___/|_|  |___/
 
-    colors_out = "{0}/{1}".format(args.path, urlify(padid, ext=".html"))
     if not args.hidepaths:
         print (colors_out, file=sys.stderr)
     data['startRev'] = "0"
@@ -272,7 +352,11 @@ while len(todo) > 0:
             style = style,
             revision = meta['total_revisions'],
             padid = padid,
-            timestamp = datetime.now()
+            timestamp = datetime.now(),
+            meta_url = meta_url,
+            raw_url = raw_url,
+            colors_url = colors_url,
             lastedited = meta['lastedited']
         ).encode("utf-8"))
 
     # _