Michael Murtaugh
9 years ago
2 changed files with 0 additions and 112 deletions
@ -1,63 +0,0 @@ |
|||
from __future__ import print_function |
|||
import html5lib, sys, re |
|||
from xml.etree import cElementTree as ET |
|||
|
|||
|
|||
def contents(element, method="html"):
    """Return the inner markup of *element* as one text string.

    The result is the element's leading text followed by every child
    serialized with ``ET.tostring`` (which also emits each child's tail).
    The element's own start and end tags are not included.

    *method* is passed straight through to ``ET.tostring``.
    """
    pieces = [element.text or '']
    for child in element:
        markup = ET.tostring(child, method=method)
        # On Python 3 tostring() returns ASCII bytes by default, which cannot
        # be joined with text; on Python 2 the result is already a str.
        if not isinstance(markup, str):
            markup = markup.decode("ascii")
        pieces.append(markup)
    return ''.join(pieces)
|||
|
|||
def text_contents(element):
    """Return the text of *element* and all of its descendants.

    Pieces are concatenated in document order: the element's own text, the
    recursive text of each child, and finally the element's own tail.
    """
    parts = [element.text or '']
    for child in element:
        parts.append(text_contents(child))
    parts.append(element.tail or '')
    return ''.join(parts)
|||
|
|||
def set_text_contents(element, text):
    """Set *text* on the innermost element of a single-child chain.

    Starting at *element*, descend for as long as the current element has
    exactly one child, then assign *text* to the element reached.  This is
    not fully general; it is meant for singly wrapped elements.
    """
    target = element
    while True:
        if len(target) != 1:
            break
        target = target[0]
    target.text = text
|||
|
|||
def iterparent(tree):
    """Yield a ``(parent, child)`` pair for every parent/child link in *tree*.

    Parents are visited in the order produced by ``tree.iter()``; for each
    parent, its children are yielded in sequence.
    """
    for node in tree.iter():
        for kid in node:
            yield node, kid
|||
|
|||
def get_parent(tree, elt):
    """Return the element in *tree* that has *elt* as a direct child.

    The tree is searched in ``tree.iter()`` order and the first matching
    parent wins.  Returns ``None`` when no element contains *elt* (for
    example when *elt* is the root).
    """
    matches = (
        candidate
        for candidate in tree.iter()
        for child in candidate
        if child == elt
    )
    return next(matches, None)
|||
|
|||
def remove_recursive(tree, elt):
    """Remove *elt* from *tree*, then prune any ancestors left empty.

    An ancestor counts as empty when it has no children and its text is
    missing or whitespace only.  The root of *tree* is never removed, and
    nothing happens when *elt* is not found below the root.

    Text that follows a removed element (its ``tail``) is not part of that
    element, so it is moved onto the preceding sibling's tail, or onto the
    parent's text when there is no preceding sibling, instead of being
    dropped along with the element.
    """
    # Build the child -> parent lookup in one pass rather than re-walking the
    # whole tree for every level that gets pruned.
    parent_of = {child: parent for parent in tree.iter() for child in parent}
    node = elt
    while True:
        parent = parent_of.get(node)
        # Compare against None explicitly: the truth value of an Element
        # reflects its child count, not whether a parent was found.
        if parent is None:
            return
        if node.tail:
            position = list(parent).index(node)
            if position:
                previous = parent[position - 1]
                previous.tail = (previous.tail or "") + node.tail
            else:
                parent.text = (parent.text or "") + node.tail
        parent.remove(node)
        # Stop as soon as the parent still holds children or real text.
        if len(parent) or (parent.text or "").strip():
            return
        node = parent
|||
|
|||
|
|||
def trim_removed_spans(t):
    """Strip removed spans and leading ``<br>`` elements from tree *t* in place.

    First every ``<span class="removed">`` is removed via
    ``remove_recursive`` (which also prunes containers left empty).  Then
    ``<br>`` elements at the very start of ``<body>`` are dropped.

    A ``<br>`` is only treated as leading while no non-whitespace text
    precedes it, and any text stored in its tail is kept by moving it onto
    the body's text.  A missing or childless ``<body>`` is left alone
    rather than raising.
    """
    for span in t.findall(".//span[@class='removed']"):
        remove_recursive(t, span)

    body = t.find("./body")
    if body is None:
        # The body can be absent, e.g. if pruning above removed it.
        return
    while len(body) and body[0].tag == "br" and not (body.text or "").strip():
        leading = body[0]
        if leading.tail:
            # Element.remove() would discard the tail text with the element.
            body.text = (body.text or "") + leading.tail
        body.remove(leading)
|||
|
|||
def trim_removed_spans_src(src):
    """Parse HTML *src*, trim removed spans, and return the body's inner markup.

    The source is parsed with ``html5lib`` (without namespaced element
    names), cleaned with ``trim_removed_spans``, and the contents of
    ``<body>`` are serialized with ``contents``.  An empty string is
    returned when no ``<body>`` is left after trimming.
    """
    t = html5lib.parse(src, namespaceHTMLElements=False)
    trim_removed_spans(t)
    body = t.find("./body")
    if body is None:
        # Pruning empty containers can remove the body element itself.
        return ''
    return contents(body)
|||
|
|||
|
|||
if __name__ == "__main__":
    # Filter: read HTML from stdin, write the trimmed body markup to stdout
    # as UTF-8.
    src = sys.stdin.read()
    encoded = trim_removed_spans_src(src).encode("utf-8")
    # Write the bytes to the binary stream where one exists: print() of a
    # bytes object on Python 3 would emit its b'...' repr instead of the text.
    out = getattr(sys.stdout, "buffer", sys.stdout)
    out.write(encoded + b"\n")
|||
|
@ -1,49 +0,0 @@ |
|||
from __future__ import print_function |
|||
import re, sys |
|||
|
|||
|
|||
def strip_tags(text):
    """Return *text* with every ``<...>`` tag removed.

    A tag is anything from ``<`` up to the next ``>``.  The character class
    ``[^>]`` also matches newlines, so tags whose attributes are wrapped
    over several lines are stripped too.
    """
    return re.sub(r"<[^>]*>", "", text)
|||
|
|||
def urlify(t, ext=".html"):
    """Return *t* with spaces turned into underscores and *ext* appended."""
    underscored = "_".join(t.split(" "))
    return underscored + ext
|||
|
|||
def filename_to_padid(t):
    """Return *t* with underscores turned into spaces and a trailing ``.html`` removed."""
    spaced = t.replace("_", " ")
    return re.sub(r"\.html$", "", spaced)
|||
|
|||
def normalize_pad_name(n):
    """Return *n* cut at its first ``?`` and then at its first ``/``."""
    for separator in ('?', '/'):
        # split() returns the whole string when the separator is absent.
        n = n.split(separator, 1)[0]
    return n
|||
|
|||
def linkify(src, urlify=urlify):
    """Turn every ``[[name]]`` in *src* into a wiki link.

    For each match the inner text has its tags stripped and is normalized
    with ``normalize_pad_name``.  The link target comes from calling
    *urlify* on that name.

    Returns a ``(new_src, names)`` tuple, where *names* lists the normalized
    name of every link in order of appearance.
    """
    names = []

    def replace_link(match):
        # Question marks are ignored by etherpad and slashes introduce
        # sub-pages such as /timeslider, so normalization drops both.
        name = normalize_pad_name(strip_tags(match.group(1)))
        names.append(name)
        return "[[<a class=\"wikilink\" href=\"{0}\">{1}</a>]]".format(urlify(name), name)

    pattern = re.compile(r"\[\[(.+?)\]\]")
    return (pattern.sub(replace_link, src), names)
|||
|
|||
|
|||
if __name__ == "__main__":
    # Filter: read text from stdin, print each linked name on its own line,
    # then print the linkified text.
    text, pad_names = linkify(sys.stdin.read())

    for pad_name in pad_names:
        print(pad_name)

    print(text)
Loading…
Reference in new issue