From 33258cdcb1298c1a221232b8908ef9c19e588a14 Mon Sep 17 00:00:00 2001 From: Michael Murtaugh Date: Thu, 17 Sep 2015 11:35:02 +0200 Subject: [PATCH] removed external routines, now part of etherdump script itself --- et_helpers.py | 63 --------------------------------------------------- linkify.py | 49 --------------------------------------- 2 files changed, 112 deletions(-) delete mode 100644 et_helpers.py delete mode 100644 linkify.py diff --git a/et_helpers.py b/et_helpers.py deleted file mode 100644 index 912fe1b..0000000 --- a/et_helpers.py +++ /dev/null @@ -1,63 +0,0 @@ -from __future__ import print_function -import html5lib, sys, re -from xml.etree import cElementTree as ET - - -def contents (element, method="html"): - return (element.text or '') + ''.join([ET.tostring(c, method=method) for c in element]) - -def text_contents (element): - return (element.text or '') + ''.join([text_contents(c) for c in element]) + (element.tail or '') - -def set_text_contents (element, text): - """ ok this isn't really general, but works for singly wrapped elements """ - while len(element) == 1: - element = element[0] - element.text = text - -def iterparent(tree): - for parent in tree.iter(): - for child in parent: - yield parent, child - -def get_parent(tree, elt): - for parent in tree.iter(): - for child in parent: - if child == elt: - return parent - -def remove_recursive (tree, elt): - """ Remove element and (any resulting) empty containing elements """ - p = get_parent(tree, elt) - if p: - p.remove(elt) - if len(p) == 0 and (p.text == None or p.text.strip() == ""): - # print ("empty parent", p, file=sys.stderr) - remove_recursive(tree, p) - - -def trim_removed_spans (t): - # remove and empty parents - for n in t.findall(".//span[@class='removed']"): - remove_recursive(t, n) - # then strip any leading br's from body - while True: - tag = t.find("./body")[0] - if tag.tag == "br": - remove_recursive(t, tag) - else: - break - -def trim_removed_spans_src (src): - t = html5lib.parse(src, namespaceHTMLElements=False) - trim_removed_spans(t) - return contents(t.find("./body")) - - -if __name__ == "__main__": - src = sys.stdin.read() - # t = html5lib.parse(src, namespaceHTMLElements=False) - # trim_rems_tree(t) - # print (ET.tostring(t)) - print (trim_removed_spans_src(src).encode("utf-8")) - diff --git a/linkify.py b/linkify.py deleted file mode 100644 index 981af11..0000000 --- a/linkify.py +++ /dev/null @@ -1,49 +0,0 @@ -from __future__ import print_function -import re, sys - - -def strip_tags (text): - return re.sub(r"<.*?>", "", text) - -def urlify (t, ext=".html"): - return t.replace(" ", "_") + ext - -def filename_to_padid (t): - t = t.replace("_", " ") - t = re.sub(r"\.html$", "", t) - return t - -def normalize_pad_name (n): - if '?' in n: - n = n.split('?', 1)[0] - if '/' in n: - n = n.split('/', 1)[0] - return n - -def linkify (src, urlify=urlify): - - collect = [] - - def s (m): - contents = strip_tags(m.group(1)) - contents = normalize_pad_name(contents) - collect.append(contents) - link = urlify(contents) - # link = link.split("?", 1)[0] - return "[[{1}]]".format(link, contents) - - # src = re.sub(r"\[\[([\w_\- ,]+?)\]\]", s, src) - ## question marks are ignored by etherpad, so split/strip it - ## strip slashes as well!! (/timeslider) - src = re.sub(r"\[\[(.+?)\]\]", s, src) - return (src, collect) - - -if __name__ == "__main__": - src = sys.stdin.read() - src, links = linkify(src) - - for l in links: - print (l) - - print (src)