removed external routines, now part of etherdump script itself
This commit is contained in:
parent
d89c5dbd3c
commit
33258cdcb1
@ -1,63 +0,0 @@
|
|||||||
from __future__ import print_function
|
|
||||||
import html5lib, sys, re
|
|
||||||
from xml.etree import cElementTree as ET
|
|
||||||
|
|
||||||
|
|
||||||
def contents(element, method="html"):
    """Return the inner markup of *element* as one text string.

    The result is the element's own leading text followed by the
    serialization of each direct child (a child's serialization includes
    the tail text that follows it).  The element's own start and end tags
    are not part of the result.

    element -- the ElementTree element whose inside is wanted
    method  -- serialization method handed to ET.tostring ("html" by default)
    """
    def serialize(child):
        # ET.tostring returns a byte string by default; decode it so it can
        # be joined with element.text, which is text.  Mixing the two fails
        # outright on Python 3.
        markup = ET.tostring(child, method=method)
        if isinstance(markup, bytes):
            markup = markup.decode("utf-8")
        return markup

    return (element.text or '') + ''.join(serialize(c) for c in element)
|
|
||||||
|
|
||||||
def text_contents(element):
    """Return the text found in *element* with all markup dropped.

    Concatenates the element's leading text, the text_contents of every
    child (recursively) and finally the element's own tail text.
    """
    pieces = [element.text or '']
    for child in element:
        pieces.append(text_contents(child))
    pieces.append(element.tail or '')
    return ''.join(pieces)
|
|
||||||
|
|
||||||
def set_text_contents(element, text):
    """Set *text* on the innermost element of a chain of single children.

    Not a general solution: it only follows the nesting while an element
    has exactly one child, then assigns .text on the element where that
    chain stops (children of that element, if any, are left alone).
    """
    if len(element) == 1:
        # exactly one child: keep descending into the wrapper
        set_text_contents(element[0], text)
    else:
        element.text = text
|
|
||||||
|
|
||||||
def iterparent(tree):
    """Generate a (parent, child) pair for every parent/child link in *tree*.

    Parents are visited in the order produced by tree.iter(); for each one
    its direct children are yielded in order.
    """
    for node in tree.iter():
        for kid in node:
            yield node, kid
|
|
||||||
|
|
||||||
def get_parent(tree, elt):
    """Return the element of *tree* that has *elt* as a direct child.

    Candidates are checked in tree.iter() order; None is returned when no
    element of the tree contains *elt*.
    """
    for candidate in tree.iter():
        if any(child == elt for child in candidate):
            return candidate
    return None
|
|
||||||
|
|
||||||
def remove_recursive(tree, elt):
    """Remove *elt* from *tree*, then prune ancestors this leaves empty.

    An ancestor counts as empty when it has no children left and its text
    is missing or whitespace only.  Nothing happens when *elt* has no
    parent inside *tree* (get_parent returns None).

    Text that follows *elt* (its tail) is kept: it is moved to the
    preceding sibling's tail or, for a first child, to the parent's text.
    """
    parent = get_parent(tree, elt)
    # Compare with None explicitly: the truth value of an Element depends
    # on its child count, and testing it is deprecated.
    if parent is None:
        return
    if elt.tail:
        # ElementTree stores the text after an element on the element
        # itself, so a plain remove() would silently drop that text.
        position = list(parent).index(elt)
        if position > 0:
            previous = parent[position - 1]
            previous.tail = (previous.tail or '') + elt.tail
        else:
            parent.text = (parent.text or '') + elt.tail
        elt.tail = None
    parent.remove(elt)
    if len(parent) == 0 and not (parent.text or '').strip():
        remove_recursive(tree, parent)
|
|
||||||
|
|
||||||
|
|
||||||
def trim_removed_spans(t):
    """Strip <span class="removed"> elements and leading <br>s from *t*, in place.

    t -- parsed HTML tree; its body is looked up with the path "./body".

    Removal goes through remove_recursive, so containers left empty by a
    removal are removed as well.
    """
    # remove <span class="removed"> and empty parents
    for span in t.findall(".//span[@class='removed']"):
        remove_recursive(t, span)
    # then strip any leading br's from body
    while True:
        # Look the body up again every round: remove_recursive detaches it
        # once it has become empty, and an empty or missing body has no
        # first child to inspect.
        body = t.find("./body")
        if body is None or len(body) == 0:
            break
        # A br that comes after real text is not a leading one.
        if (body.text or '').strip():
            break
        first = body[0]
        if first.tag != "br":
            break
        remove_recursive(t, first)
|
|
||||||
|
|
||||||
def trim_removed_spans_src(src):
    """Parse HTML *src*, trim removed spans, and return the body's inner markup.

    Returns an empty string when the trimmed tree has no body element.
    """
    tree = html5lib.parse(src, namespaceHTMLElements=False)
    trim_removed_spans(tree)
    body = tree.find("./body")
    # find() returns None when there is no <body> child (pruning of emptied
    # elements can detach it); contents() cannot take None.
    if body is None:
        return ''
    return contents(body)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Filter mode: HTML on stdin -> trimmed body markup on stdout, UTF-8 encoded.
    src = sys.stdin.read()
    encoded = trim_removed_spans_src(src).encode("utf-8")
    # Write the bytes themselves: print() of a bytes object on Python 3
    # shows its b'...' repr.  Python 2's sys.stdout has no .buffer and
    # accepts byte strings directly.
    stream = getattr(sys.stdout, "buffer", sys.stdout)
    stream.write(encoded + b"\n")
|
|
||||||
|
|
49
linkify.py
49
linkify.py
@ -1,49 +0,0 @@
|
|||||||
from __future__ import print_function
|
|
||||||
import re, sys
|
|
||||||
|
|
||||||
|
|
||||||
def strip_tags(text):
    """Return *text* with every "<...>" run deleted.

    The match is non-greedy and does not cross line breaks.
    """
    tag_pattern = re.compile(r"<.*?>")
    return tag_pattern.sub("", text)
|
|
||||||
|
|
||||||
def urlify(t, ext=".html"):
    """Return *t* with each space turned into an underscore, plus *ext*."""
    return "_".join(t.split(" ")) + ext
|
|
||||||
|
|
||||||
def filename_to_padid(t):
    """Return *t* without a trailing ".html" and with underscores as spaces."""
    stem = re.sub(r"\.html$", "", t)
    return stem.replace("_", " ")
|
|
||||||
|
|
||||||
def normalize_pad_name(n):
    """Return *n* cut at its first '?', and the result cut at its first '/'."""
    for separator in ('?', '/'):
        n = n.partition(separator)[0]
    return n
|
|
||||||
|
|
||||||
def linkify(src, urlify=urlify):
    """Turn every [[name]] in *src* into a wikilink anchor.

    src    -- text to scan for [[...]] markers
    urlify -- callable mapping a pad name to the href to use

    Returns a tuple (new_src, names), where names lists the pad name of
    each link in the order the links were found.
    """
    names = []

    def anchor(match):
        # question marks are ignored by etherpad, so everything from a "?"
        # on is cut off; slashes are stripped as well (/timeslider)
        name = normalize_pad_name(strip_tags(match.group(1)))
        names.append(name)
        href = urlify(name)
        return "[[<a class=\"wikilink\" href=\"{0}\">{1}</a>]]".format(href, name)

    linked = re.sub(r"\[\[(.+?)\]\]", anchor, src)
    return (linked, names)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Filter mode: read text from stdin, print each collected pad name on
    # its own line, then print the linkified text.
    linked, names = linkify(sys.stdin.read())

    for name in names:
        print(name)

    print(linked)
|
|
Loading…
Reference in New Issue
Block a user