55 lines
1.5 KiB
Python
55 lines
1.5 KiB
Python
from __future__ import print_function
|
|
import html5lib, sys, re
|
|
from xml.etree import cElementTree as ET
|
|
|
|
|
|
def contents(element, method="html"):
    """Return the inner markup of *element* as one string.

    The result is the element's leading text followed by the serialization
    of each child element in order.  ``ET.tostring`` emits a child's tail
    text together with that child, so text between children is kept.  The
    start and end tags of *element* itself are not included.

    element -- the ElementTree element whose inner content is wanted
    method  -- serialization method passed through to ``ET.tostring``
               (defaults to "html")
    """
    pieces = [element.text or '']
    for child in element:
        markup = ET.tostring(child, method=method)
        # Python 3's tostring() returns ASCII bytes (non-ASCII characters
        # become character references) and bytes cannot be joined with
        # str.  On Python 2 the result is already a str and is left alone.
        if not isinstance(markup, str):
            markup = markup.decode("utf-8")
        pieces.append(markup)
    return ''.join(pieces)
|
|
|
|
def iterparent(tree):
    """Generate a (parent, child) tuple for every parent/child link in *tree*.

    Parents are visited in the order produced by ``tree.iter()``; for each
    one, its direct children are reported in their stored order.
    """
    for node in tree.iter():
        for kid in node:
            link = (node, kid)
            yield link
|
|
|
|
def get_parent(tree, elt):
    """Return the element of *tree* that has *elt* as a direct child.

    Candidates are examined in ``tree.iter()`` order and the first match
    wins.  Returns None when no element of *tree* contains *elt*.
    """
    for candidate in tree.iter():
        if any(kid == elt for kid in candidate):
            return candidate
    return None
|
|
|
|
def remove_recursive(tree, elt):
    """Remove *elt* from *tree*, then prune containers left empty by that.

    After *elt* is detached, its parent is removed in turn when it has no
    child elements left and its text is missing or only whitespace; the
    same rule is then applied further up the tree.  Nothing happens when
    *elt* has no parent inside *tree* (for example the root element).

    Text that follows *elt* (its ``tail``) belongs to the surrounding
    content rather than to the element, so it is kept: it is appended to
    the previous sibling's tail, or to the parent's text when *elt* is the
    first child.

    tree -- element whose subtree is searched for the parent of *elt*
    elt  -- the element to remove
    """
    parent = get_parent(tree, elt)
    # Compare with None explicitly: the truth value of an Element reflects
    # whether it has children (and testing it is deprecated), not whether
    # a parent was found.
    if parent is None:
        return

    if elt.tail:
        # Element.remove() drops the tail along with the element, so move
        # the tail onto whatever precedes the element first.
        position = list(parent).index(elt)
        if position == 0:
            parent.text = (parent.text or '') + elt.tail
        else:
            previous = parent[position - 1]
            previous.tail = (previous.tail or '') + elt.tail

    parent.remove(elt)

    if len(parent) == 0 and not (parent.text or '').strip():
        remove_recursive(tree, parent)
|
|
|
|
|
|
def trim_removed_spans(t):
    """Strip removed-content markup from the parsed document *t*, in place.

    Two passes are made:

    1. every ``<span class="removed">`` is removed with
       ``remove_recursive`` (which also prunes containers left empty);
    2. ``<br>`` elements at the very start of ``<body>`` are removed.

    t -- root element of the document; ``<body>`` is looked up as a direct
         child of it
    """
    for span in t.findall(".//span[@class='removed']"):
        remove_recursive(t, span)

    while True:
        # Look body up again on every pass: remove_recursive() may have
        # pruned it from the tree once it became empty.
        body = t.find("./body")
        if body is None or len(body) == 0:
            break
        # A <br> that comes after real text is not a leading one.
        if (body.text or '').strip():
            break
        first = body[0]
        if first.tag != "br":
            break
        remove_recursive(t, first)
|
|
|
|
def trim_removed_spans_src(src):
    """Parse HTML source, trim removed spans, and return the body markup.

    *src* is parsed with html5lib (element names without a namespace
    prefix), cleaned with ``trim_removed_spans``, and the inner content of
    the resulting ``<body>`` is serialized with ``contents``.

    Returns an empty string when trimming left no ``<body>`` element.
    """
    tree = html5lib.parse(src, namespaceHTMLElements=False)
    trim_removed_spans(tree)
    body = tree.find("./body")
    # The body itself is pruned when everything inside it was removed.
    if body is None:
        return ''
    return contents(body)
|
|
|
|
|
|
if __name__ == "__main__":
    # Filter mode: read HTML from stdin and write the trimmed body markup
    # to stdout as UTF-8, followed by a newline.
    src = sys.stdin.read()
    encoded = trim_removed_spans_src(src).encode("utf-8")
    # On Python 3, print() would show the b'...' repr of the bytes, so
    # write them to the binary buffer behind stdout.  Python 2's stdout
    # has no such attribute and accepts byte strings directly.
    binary_stdout = getattr(sys.stdout, "buffer", sys.stdout)
    binary_stdout.write(encoded + b"\n")
|
|
|