Michael Murtaugh
9 years ago
2 changed files with 0 additions and 112 deletions
@ -1,63 +0,0 @@ |
|||||
from __future__ import print_function |
|
||||
import html5lib, sys, re |
|
||||
from xml.etree import cElementTree as ET |
|
||||
|
|
||||
|
|
||||
def contents(element, method="html"):
    """Return the inner markup of *element* as text.

    The result is the element's leading text followed by the serialization
    of every child (each child's tail text is part of its serialization).
    The element's own tag is not included.

    element -- an ElementTree element
    method  -- serialization method passed through to ``ET.tostring``
    """
    parts = [element.text or ""]
    for child in element:
        markup = ET.tostring(child, method=method)
        # On Python 3 tostring() returns bytes by default; joining those with
        # str raised TypeError. The default encoding is us-ascii, so decoding
        # as UTF-8 (a superset) is always safe.
        if isinstance(markup, bytes):
            markup = markup.decode("utf-8")
        parts.append(markup)
    return "".join(parts)
|
||||
|
|
||||
def text_contents(element):
    """Return the concatenated text of *element* and all its descendants.

    Markup is dropped. Note that the tail text following *element* itself is
    included as well, because the same function is applied recursively to
    every child.
    """
    leading = element.text or ""
    nested = "".join(text_contents(child) for child in element)
    trailing = element.tail or ""
    return leading + nested + trailing
|
||||
|
|
||||
def set_text_contents(element, text):
    """Set the text of the innermost singly-wrapped element.

    Descends through *element* for as long as the current element has
    exactly one child, then assigns *text* to the element it stopped at.
    This is not a general solution: it only does the expected thing for
    singly wrapped elements such as ``<p><b><i>..</i></b></p>``.
    """
    target = element
    while len(target) == 1:
        target = target[0]
    target.text = text
|
||||
|
|
||||
def iterparent(tree):
    """Yield ``(parent, child)`` pairs for every element below *tree*.

    Parents are visited in the order produced by ``tree.iter()``; for each
    parent its direct children are yielded in order.
    """
    for parent in tree.iter():
        for child in parent:
            yield parent, child
|
||||
|
|
||||
def get_parent(tree, elt):
    """Return the parent of *elt* within *tree*, or None if there is none.

    ElementTree elements keep no reference to their parent, so the tree is
    scanned for the element that has *elt* as a direct child. None is
    returned when *elt* is the root or is not part of *tree*.
    """
    for parent in tree.iter():
        for child in parent:
            # Identity, not equality: we want this exact node.
            if child is elt:
                return parent
    return None
|
||||
|
|
||||
def remove_recursive(tree, elt):
    """Remove *elt* from *tree*, then any ancestors left empty by that.

    An ancestor counts as empty when it has no children and its text is
    missing or whitespace-only. Removal walks upward until a non-empty
    ancestor (or the root) is reached. Nothing happens when *elt* is the
    root or is not part of *tree*.

    The tail text that follows a removed element is kept: it is appended to
    the previous sibling's tail or, for a first child, to the parent's text.
    (``Element.remove`` on its own discards the tail together with the
    element, which silently lost document text.)
    """
    # Elements do not know their parent; build the lookup once instead of
    # rescanning the whole tree for every level we climb.
    parent_of = {child: parent for parent in tree.iter() for child in parent}

    node = elt
    while node in parent_of:
        parent = parent_of[node]

        if node.tail:
            position = list(parent).index(node)
            if position:
                previous = parent[position - 1]
                previous.tail = (previous.tail or "") + node.tail
            else:
                parent.text = (parent.text or "") + node.tail
        parent.remove(node)

        # Stop as soon as the parent still holds children or real text.
        if len(parent) or (parent.text or "").strip():
            break
        node = parent
|
||||
|
|
||||
|
|
||||
def trim_removed_spans(t):
    """Strip ``<span class="removed">`` elements and leading ``<br>`` tags.

    Works in place on the parsed document *t*:

    1. every span whose class attribute is exactly ``removed`` is removed
       via ``remove_recursive`` (which also removes containers left empty);
    2. ``<br>`` elements at the very start of ``<body>`` are dropped.

    A ``<br>`` only counts as leading while no non-blank text precedes it,
    and the text following a dropped ``<br>`` is kept as body text. An empty
    or missing ``<body>`` is left alone instead of raising.
    """
    for span in t.findall(".//span[@class='removed']"):
        remove_recursive(t, span)

    body = t.find("./body")
    if body is None:
        # Step 1 may have removed body itself when nothing was left in it.
        return

    while len(body) and body[0].tag == "br" and not (body.text or "").strip():
        leading = body[0]
        if leading.tail:
            # Keep the text that followed the <br>; remove() would drop it.
            body.text = (body.text or "") + leading.tail
        body.remove(leading)
|
||||
|
|
||||
def trim_removed_spans_src(src):
    """Return the trimmed body markup for the HTML source *src*.

    Parses *src* with html5lib (without HTML namespaces, so plain tag names
    can be used in paths), applies ``trim_removed_spans`` and serializes
    what is left inside ``<body>`` with ``contents``.

    Returns an empty string when no ``<body>`` remains after trimming.
    """
    tree = html5lib.parse(src, namespaceHTMLElements=False)
    trim_removed_spans(tree)
    body = tree.find("./body")
    if body is None:
        # Trimming can remove an emptied body; contents(None) would raise.
        return ""
    return contents(body)
|
||||
|
|
||||
|
|
||||
if __name__ == "__main__":
    # Filter: read HTML from stdin, write the trimmed body markup to stdout.
    trimmed = trim_removed_spans_src(sys.stdin.read())
    # Write UTF-8 bytes to the binary stream. print()-ing the encoded value
    # shows a b'...' repr on Python 3; Python 2's stdout has no .buffer and
    # accepts the byte string directly.
    stream = getattr(sys.stdout, "buffer", sys.stdout)
    stream.write(trimmed.encode("utf-8") + b"\n")
|
||||
|
|
@ -1,49 +0,0 @@ |
|||||
from __future__ import print_function |
|
||||
import re, sys |
|
||||
|
|
||||
|
|
||||
def strip_tags(text):
    """Return *text* with everything of the form ``<...>`` removed.

    This is a plain regular-expression strip, not an HTML parser. Each match
    runs from a ``<`` to the nearest following ``>``, and may span line
    breaks so that tags whose attributes wrap onto another line are removed
    too.
    """
    return re.sub(r"<[^>]*>", "", text)
|
||||
|
|
||||
def urlify(t, ext=".html"):
    """Return the link target for the name *t*.

    Spaces are replaced by underscores and *ext* is appended. No other
    characters are altered or escaped.
    """
    underscored = t.replace(" ", "_")
    return underscored + ext
|
||||
|
|
||||
def filename_to_padid(t):
    """Return the pad id for the file name *t*.

    Underscores become spaces and one trailing ``.html`` is dropped.
    """
    spaced = t.replace("_", " ")
    return re.sub(r"\.html$", "", spaced)
|
||||
|
|
||||
def normalize_pad_name(n):
    """Return *n* cut off at the first ``?`` and then at the first ``/``.

    Used to reduce a link such as ``name/timeslider`` or ``name?x=1`` to the
    bare name.
    """
    # Order matters only for readability: splitting on an absent separator
    # returns the string unchanged.
    for separator in ("?", "/"):
        n = n.split(separator, 1)[0]
    return n
|
||||
|
|
||||
def linkify(src, urlify=urlify):
    """Turn ``[[name]]`` markers in *src* into wiki links.

    For each ``[[...]]`` on a single line, the inner text has its tags
    stripped and is normalized with ``normalize_pad_name`` (question marks
    are ignored by etherpad, and slashes such as ``/timeslider`` are cut off
    as well). The marker is then replaced by an ``<a class="wikilink">``
    element, still wrapped in the double brackets.

    src    -- text to process
    urlify -- callable mapping a normalized name to its href

    Returns a ``(new_src, names)`` tuple, where *names* lists the normalized
    names in order of appearance (duplicates included).
    """
    collect = []

    def _replace(match):
        name = normalize_pad_name(strip_tags(match.group(1)))
        collect.append(name)
        href = urlify(name)
        # NOTE(review): name and href are inserted without HTML escaping;
        # presumably src is already-escaped HTML -- confirm with callers.
        return "[[<a class=\"wikilink\" href=\"{0}\">{1}</a>]]".format(href, name)

    src = re.sub(r"\[\[(.+?)\]\]", _replace, src)
    return (src, collect)
|
||||
|
|
||||
|
|
||||
if __name__ == "__main__":
    # Filter: read text from stdin, print each collected link name on its
    # own line, then print the linkified text.
    src, links = linkify(sys.stdin.read())

    for name in links:
        print(name)

    print(src)
|
Loading…
Reference in new issue