removed external routines, now part of etherdump script itself
This commit is contained in:
parent
d89c5dbd3c
commit
33258cdcb1
@ -1,63 +0,0 @@
|
||||
from __future__ import print_function
|
||||
import html5lib, sys, re
|
||||
from xml.etree import cElementTree as ET
|
||||
|
||||
|
||||
def contents (element, method="html"):
|
||||
return (element.text or '') + ''.join([ET.tostring(c, method=method) for c in element])
|
||||
|
||||
def text_contents (element):
    """Recursively collect all text of *element*: its text, every child's
    text (including the children's tails), and its own tail."""
    pieces = [element.text or '']
    for child in element:
        pieces.append(text_contents(child))
    pieces.append(element.tail or '')
    return ''.join(pieces)
|
||||
|
||||
def set_text_contents (element, text):
    """Set the text of the innermost singly-wrapped descendant.

    Descends while the current element has exactly one child, then assigns
    *text* to that element's .text. As the original author noted, this is
    not fully general — it only handles chains of single wrappers.
    """
    target = element
    while len(target) == 1:
        target = target[0]
    target.text = text
|
||||
|
||||
def iterparent(tree):
    """Yield (parent, child) for every parent/child edge in *tree*,
    in document order of the parents."""
    return ((parent, child) for parent in tree.iter() for child in parent)
|
||||
|
||||
def get_parent(tree, elt):
    """Return the direct parent of *elt* within *tree*, or None if *elt*
    is the root or not present.

    Linear scan: ElementTree keeps no parent pointers, so we walk every
    element and look for *elt* among its children.
    """
    for candidate in tree.iter():
        if any(child == elt for child in candidate):
            return candidate
    return None
|
||||
|
||||
def remove_recursive (tree, elt):
    """Remove *elt* from *tree*, then recursively remove any ancestor that
    the removal left empty (no children and no non-whitespace text).

    No-op if *elt* has no parent in *tree* (root or detached).
    """
    p = get_parent(tree, elt)
    # `is not None` rather than bare truthiness: an Element with no children
    # is falsy, and Element.__bool__ is deprecated — testing identity against
    # None is the correct check for "parent found".
    if p is not None:
        p.remove(elt)
        if len(p) == 0 and (p.text is None or p.text.strip() == ""):
            # print ("empty parent", p, file=sys.stderr)
            # the parent is now empty: prune it too
            remove_recursive(tree, p)
|
||||
|
||||
|
||||
def trim_removed_spans (t):
    """Strip every <span class="removed"> (and any container left empty by
    that) from parsed HTML tree *t*, then drop leading <br> tags from body.

    Mutates *t* in place.
    """
    # remove <span class="removed"> and empty parents
    for n in t.findall(".//span[@class='removed']"):
        remove_recursive(t, n)
    # then strip any leading br's from body; guard against the body being
    # absent, emptied, or pruned away entirely by remove_recursive — the
    # unguarded t.find("./body")[0] would raise IndexError/TypeError there
    body = t.find("./body")
    while body is not None and len(body) and body[0].tag == "br":
        remove_recursive(t, body[0])
|
||||
|
||||
def trim_removed_spans_src (src):
    """Parse HTML source *src*, strip <span class="removed"> elements (and
    resulting empty containers) plus leading body <br>s, and return the
    body's inner markup as a string."""
    tree = html5lib.parse(src, namespaceHTMLElements=False)
    trim_removed_spans(tree)
    body = tree.find("./body")
    return contents(body)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
src = sys.stdin.read()
|
||||
# t = html5lib.parse(src, namespaceHTMLElements=False)
|
||||
# trim_rems_tree(t)
|
||||
# print (ET.tostring(t))
|
||||
print (trim_removed_spans_src(src).encode("utf-8"))
|
||||
|
49
linkify.py
49
linkify.py
@ -1,49 +0,0 @@
|
||||
from __future__ import print_function
|
||||
import re, sys
|
||||
|
||||
|
||||
# Compiled once at module import; DOTALL so tags whose attributes wrap
# across a newline are still matched (plain `.` stops at line ends).
_TAG_RE = re.compile(r"<.*?>", re.DOTALL)

def strip_tags (text):
    """Return *text* with anything that looks like an HTML/XML tag removed.

    Non-greedy, regex-based stripping — fine for the simple markup this
    script handles, not a full HTML parser.
    """
    return _TAG_RE.sub("", text)
|
||||
|
||||
def urlify (t, ext=".html"):
|
||||
return t.replace(" ", "_") + ext
|
||||
|
||||
def filename_to_padid (t):
    """Inverse of urlify for .html filenames: underscores back to spaces,
    then a single trailing ".html" (if any) is dropped."""
    name = t.replace("_", " ")
    if name.endswith(".html"):
        name = name[:-len(".html")]
    return name
|
||||
|
||||
def normalize_pad_name (n):
    """Canonicalize a pad name: drop any query string ('?...') and then any
    path suffix ('/...', e.g. '/timeslider') — etherpad ignores both.

    str.partition returns the whole string when the separator is absent,
    so no membership check is needed.
    """
    base = n.partition('?')[0]
    base = base.partition('/')[0]
    return base
|
||||
|
||||
def linkify (src, urlify=urlify):
    """Rewrite every [[wiki link]] in *src* as an inline <a class="wikilink">.

    The link text is tag-stripped and normalized (question-mark query
    strings and slash suffixes are ignored by etherpad, so they are removed
    by normalize_pad_name). Returns (new_src, names): the rewritten source
    and the list of normalized pad names in order of appearance.
    """
    names = []

    def replace (match):
        label = normalize_pad_name(strip_tags(match.group(1)))
        names.append(label)
        href = urlify(label)
        return "[[<a class=\"wikilink\" href=\"{0}\">{1}</a>]]".format(href, label)

    rewritten = re.sub(r"\[\[(.+?)\]\]", replace, src)
    return (rewritten, names)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
src = sys.stdin.read()
|
||||
src, links = linkify(src)
|
||||
|
||||
for l in links:
|
||||
print (l)
|
||||
|
||||
print (src)
|
Loading…
Reference in New Issue
Block a user