removed external routines, now part of etherdump script itself

This commit is contained in:
Michael Murtaugh 2015-09-17 11:35:02 +02:00
parent d89c5dbd3c
commit 33258cdcb1
2 changed files with 0 additions and 112 deletions

View File

@ -1,63 +0,0 @@
from __future__ import print_function
import html5lib, sys, re
from xml.etree import cElementTree as ET
def contents(element, method="html"):
    """Return the inner markup of *element* as one text string.

    The result is the element's own leading text followed by the
    serialization of each child; ``ET.tostring`` includes a child's tail
    text, so text between and after children is preserved.

    Args:
        element: an ElementTree element.
        method: serialization method handed to ``ET.tostring``.

    Returns:
        The concatenated text and child markup ('' for an empty element).
    """
    def _serialize(child):
        markup = ET.tostring(child, method=method)
        # tostring() yields bytes on Python 3 (and a byte str on Python 2);
        # decode so the pieces can be joined with the element's text.
        if isinstance(markup, bytes):
            markup = markup.decode("utf-8")
        return markup

    return (element.text or '') + ''.join(_serialize(c) for c in element)
def text_contents(element):
    """Return all text found under *element*, in document order.

    The element's own tail text is appended as well, so calling this on a
    child also yields the text that follows it inside its parent.
    """
    pieces = [element.text or '']
    for child in element:
        pieces.append(text_contents(child))
    pieces.append(element.tail or '')
    return ''.join(pieces)
def set_text_contents(element, text):
    """Set the text of the innermost element of a single-child chain.

    Not a general solution: it descends only while the current element has
    exactly one child, then assigns *text* to wherever it stopped.
    """
    innermost = element
    while len(innermost) == 1:
        # Exactly one child here, so unpacking steps down one level.
        (innermost,) = innermost
    innermost.text = text
def iterparent(tree):
    """Yield a ``(parent, child)`` pair for every element below *tree*.

    Parents are visited in ``tree.iter()`` order and, within each parent,
    children are yielded in their stored order.
    """
    for node in tree.iter():
        for kid in node:
            yield node, kid
def get_parent(tree, elt):
    """Return the element in *tree* that has *elt* as a child.

    The tree is searched in ``tree.iter()`` order; ``None`` is returned when
    no element contains *elt* (for example when *elt* is the root).
    """
    candidates = (node for node in tree.iter() for kid in node if kid == elt)
    return next(candidates, None)
def remove_recursive(tree, elt):
    """Remove *elt* from *tree*, then prune containers left empty by that.

    After *elt* is detached, its former parent is removed as well when it
    has no children left and no non-whitespace text; the same rule is then
    applied to that parent's parent, and so on up the tree.

    Args:
        tree: root element to search for *elt*'s parent.
        elt: the element to remove. Nothing happens when no parent is found
            (e.g. *elt* is the root or is not part of *tree*).
    """
    parent = get_parent(tree, elt)
    # Compare against None explicitly: the truth value of an Element
    # reflects its child count, not whether a parent was found.
    if parent is None:
        return
    # NOTE(review): the tail text is stored on the element, so it is
    # dropped together with the element - confirm that is intended.
    parent.remove(elt)
    if len(parent) == 0 and not (parent.text or "").strip():
        remove_recursive(tree, parent)
def trim_removed_spans(t):
    """Strip removed spans and leading line breaks from a parsed document.

    First every ``<span class="removed">`` is removed via
    ``remove_recursive`` (which also prunes parents left empty). Then any
    ``<br>`` elements at the start of ``<body>`` are dropped.

    Args:
        t: root element of the document; modified in place.
    """
    for span in t.findall(".//span[@class='removed']"):
        remove_recursive(t, span)

    body = t.find("./body")
    if body is None:
        # No body (or it was pruned above as empty): nothing left to strip.
        return
    # Stop at the first child that is not a <br>, or when body runs out of
    # children. The breaks are detached from body directly so that body
    # itself stays in the tree even if it ends up empty.
    # NOTE(review): tail text following a removed <br> is dropped with it.
    while len(body) and body[0].tag == "br":
        body.remove(body[0])
def trim_removed_spans_src(src):
    """Parse HTML source, trim removed spans, and return the body markup.

    Args:
        src: HTML source to parse with html5lib.

    Returns:
        The inner markup of ``<body>`` after trimming, or '' when the
        document has no body element left.
    """
    t = html5lib.parse(src, namespaceHTMLElements=False)
    # Without this call the function would only round-trip the markup.
    trim_removed_spans(t)
    body = t.find("./body")
    if body is None:
        return ''
    return contents(body)
if __name__ == "__main__":
    # Command-line filter: HTML on stdin, trimmed body markup on stdout.
    src = sys.stdin.read()
    out = trim_removed_spans_src(src)
    # On Python 2 a unicode result is encoded for the byte-oriented stdout;
    # a native str (Python 3 text, Python 2 bytes) is printed unchanged.
    if not isinstance(out, str):
        out = out.encode("utf-8")
    print(out)

View File

@ -1,49 +0,0 @@
from __future__ import print_function
import re, sys
def strip_tags(text):
    """Return *text* with every ``<...>`` run deleted.

    Matching is non-greedy and ``.`` does not cross newlines, so a tag that
    spans several lines is left in place.
    """
    tag_pattern = re.compile(r"<.*?>")
    return tag_pattern.sub("", text)
def urlify(t, ext=".html"):
    """Turn a pad name into a file name: spaces become underscores, then
    *ext* is appended."""
    underscored = "_".join(t.split(" "))
    return underscored + ext
def filename_to_padid(t):
    """Turn a file name back into a pad id: underscores become spaces and a
    trailing ``.html`` is dropped."""
    spaced = " ".join(t.split("_"))
    return re.sub(r"\.html$", "", spaced)
def normalize_pad_name(n):
    """Cut a pad name at its first ``?`` and then at its first ``/``.

    Whatever precedes those separators is returned; a name containing
    neither comes back unchanged.
    """
    before_query = n.partition('?')[0]
    return before_query.partition('/')[0]
def linkify(src, urlify=urlify):
    """Replace ``[[name]]`` wikilinks in *src* with HTML anchors.

    For each ``[[...]]`` match the inner text is stripped of tags and
    normalized with ``normalize_pad_name``; the result is used as the link
    text and, passed through *urlify*, as the ``href``.

    Args:
        src: text containing ``[[...]]`` links.
        urlify: callable mapping a pad name to its link target.

    Returns:
        A ``(new_src, names)`` tuple: the rewritten text and the list of
        normalized pad names, in order of appearance.
    """
    collect = []

    def _replace(m):
        pad_name = strip_tags(m.group(1))
        # Question marks are ignored by etherpad, so split/strip them;
        # strip slashes as well (e.g. "/timeslider").
        pad_name = normalize_pad_name(pad_name)
        # Record each linked name so the caller receives it in the result.
        collect.append(pad_name)
        link = urlify(pad_name)
        return "[[<a class=\"wikilink\" href=\"{0}\">{1}</a>]]".format(link, pad_name)

    src = re.sub(r"\[\[(.+?)\]\]", _replace, src)
    return (src, collect)
if __name__ == "__main__":
    # Command-line filter: text on stdin; prints each collected link name,
    # then the linkified text.
    src = sys.stdin.read()
    src, links = linkify(src)
    for link in links:
        print(link)
    print(src)