"""Strip ``<span class="removed">`` elements (and emptied containers) from HTML.

Run as a script, this reads HTML from stdin and writes the trimmed contents
of ``<body>`` to stdout as UTF-8.
"""
from __future__ import print_function

import re
import sys

try:
    from xml.etree import cElementTree as ET
except ImportError:
    # cElementTree was removed in Python 3.9; ElementTree exposes the same API.
    from xml.etree import ElementTree as ET

try:
    import html5lib
except ImportError:
    # Only trim_removed_spans_src needs html5lib; the tree helpers below work
    # on plain ElementTree elements and stay importable without it.
    html5lib = None


def contents(element, method="html"):
    """Return the inner markup of *element* as text.

    The result is the element's leading text followed by each child
    serialized with ``ET.tostring`` using *method* (each child's
    serialization includes its tail text).
    """
    parts = [element.text or ""]
    for child in element:
        chunk = ET.tostring(child, method=method)
        # With the default encoding tostring returns ASCII bytes on
        # Python 3; decode so the pieces can be joined as text.
        if isinstance(chunk, bytes):
            chunk = chunk.decode("ascii")
        parts.append(chunk)
    return "".join(parts)


def text_contents(element):
    """Return the text of *element* and its descendants, plus its own tail."""
    inner = "".join([text_contents(child) for child in element])
    return (element.text or "") + inner + (element.tail or "")


def set_text_contents(element, text):
    """Set *text* on the innermost element of a chain of single children.

    This is not fully general, but works for singly wrapped elements: it
    descends while the current element has exactly one child, then assigns
    ``.text`` on the element where it stops.
    """
    while len(element) == 1:
        element = element[0]
    element.text = text


def iterparent(tree):
    """Yield ``(parent, child)`` pairs for every element under *tree*."""
    for parent in tree.iter():
        for child in parent:
            yield parent, child


def get_parent(tree, elt):
    """Return the parent of *elt* within *tree*, or None if it has none."""
    for parent, child in iterparent(tree):
        if child is elt:
            return parent
    return None


def remove_recursive(tree, elt):
    """Remove *elt* from *tree*, then any containers left empty by that.

    The tail text that follows *elt* is not part of the element being
    dropped, so it is moved onto the previous sibling's tail (or the
    parent's text when *elt* is the first child) before removal.  A parent
    counts as empty when it has no children and only whitespace text; this
    can cascade up to, and include, a direct child of the root.  Nothing
    happens when *elt* has no parent in *tree*.
    """
    parent = get_parent(tree, elt)
    # Compare against None explicitly: an Element's truth value reflects
    # whether it has children, not whether it exists.
    if parent is None:
        return
    if elt.tail:
        siblings = list(parent)
        position = siblings.index(elt)
        if position > 0:
            previous = siblings[position - 1]
            previous.tail = (previous.tail or "") + elt.tail
        else:
            parent.text = (parent.text or "") + elt.tail
        # The tail now lives in the tree; don't leave a copy on the
        # detached element.
        elt.tail = None
    parent.remove(elt)
    if len(parent) == 0 and not (parent.text or "").strip():
        remove_recursive(tree, parent)


def trim_removed_spans(t):
    """Drop ``span.removed`` elements from *t*, then leading ``<br>`` tags.

    *t* is the root ``html`` element and is modified in place.  A ``<br>``
    is treated as leading only while no non-blank text precedes it in
    ``<body>``.  If everything in ``<body>`` ends up removed, ``<body>``
    itself may be removed from the tree.
    """
    for span in t.findall(".//span[@class='removed']"):
        remove_recursive(t, span)
    while True:
        body = t.find("./body")
        # body is gone or has no children left: nothing more to strip.
        if body is None or len(body) == 0:
            break
        # Text before the first child means a <br> there is not leading.
        if (body.text or "").strip():
            break
        first = body[0]
        if first.tag != "br":
            break
        remove_recursive(t, first)


def trim_removed_spans_src(src):
    """Parse HTML *src*, trim removed spans, and return the body's markup.

    Returns an empty string when nothing is left of ``<body>``.

    Raises:
        ImportError: if html5lib is not installed.
    """
    if html5lib is None:
        raise ImportError("html5lib is required to parse HTML source")
    t = html5lib.parse(src, namespaceHTMLElements=False)
    trim_removed_spans(t)
    body = t.find("./body")
    if body is None:
        return ""
    return contents(body)


def main():
    """Filter HTML from stdin to stdout, writing UTF-8 bytes."""
    src = sys.stdin.read()
    data = trim_removed_spans_src(src).encode("utf-8")
    # Python 3 exposes the byte stream as sys.stdout.buffer; on Python 2
    # sys.stdout accepts bytes directly.
    stream = getattr(sys.stdout, "buffer", sys.stdout)
    stream.write(data + b"\n")


if __name__ == "__main__":
    main()