
etherdump post-pipelines version

Branch: add-quote-import
Michael Murtaugh, 9 years ago
Parent commit: d89c5dbd3c
1 changed file: etherdump (100 changed lines)
@@ -10,13 +10,87 @@ from datetime import datetime
 from xml.etree import cElementTree as ET
 from urllib import urlencode
 from urllib2 import urlopen, HTTPError, URLError
-# local mods
-from et_helpers import trim_removed_spans, contents, set_text_contents, text_contents
-from linkify import linkify, urlify, filename_to_padid
 # external dependencies (use pip to install these)
 import html5lib, jinja2

+def filename_to_padid (t):
+    t = t.replace("_", " ")
+    t = re.sub(r"\.html$", "", t)
+    return t
+
+def normalize_pad_name (n):
+    if '?' in n:
+        n = n.split('?', 1)[0]
+    if '/' in n:
+        n = n.split('/', 1)[0]
+    return n
+
+def urlify (t, ext=".html"):
+    return t.replace(" ", "_") + ext
+
+def linkify (src, urlify=urlify):
+    collect = []
+    def s (m):
+        contents = strip_tags(m.group(1))
+        contents = normalize_pad_name(contents)
+        collect.append(contents)
+        link = urlify(contents)
+        # link = link.split("?", 1)[0]
+        return "[[<a class=\"wikilink\" href=\"{0}\">{1}</a>]]".format(link, contents)
+    # src = re.sub(r"\[\[([\w_\- ,]+?)\]\]", s, src)
+    ## question marks are ignored by etherpad, so split/strip it
+    ## strip slashes as well!! (/timeslider)
+    src = re.sub(r"\[\[(.+?)\]\]", s, src)
+    return (src, collect)
+
+def strip_tags (text):
+    return re.sub(r"<.*?>", "", text)
+
+def set_text_contents (element, text):
+    """ ok this isn't really general, but works for singly wrapped elements """
+    while len(element) == 1:
+        element = element[0]
+    element.text = text
+
+def text_contents (element):
+    return (element.text or '') + ''.join([text_contents(c) for c in element]) + (element.tail or '')
+
+def contents (element, method="html"):
+    return (element.text or '') + ''.join([ET.tostring(c, method=method) for c in element])
+
+def get_parent(tree, elt):
+    for parent in tree.iter():
+        for child in parent:
+            if child == elt:
+                return parent
+
+def remove_recursive (tree, elt):
+    """ Remove element and (any resulting) empty containing elements """
+    p = get_parent(tree, elt)
+    if p:
+        p.remove(elt)
+        if len(p) == 0 and (p.text == None or p.text.strip() == ""):
+            # print ("empty parent", p, file=sys.stderr)
+            remove_recursive(tree, p)
+
+def trim_removed_spans (t):
+    # remove <span class="removed"> and empty parents
+    for n in t.findall(".//span[@class='removed']"):
+        remove_recursive(t, n)
+    # then strip any leading br's from body
+    while True:
+        tag = t.find("./body")[0]
+        if tag.tag == "br":
+            remove_recursive(t, tag)
+        else:
+            break
+
 def get_template_env (tpath=None):
     paths = []
     if tpath and os.path.isdir(tpath):
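This hunk inlines the helpers that previously lived in the et_helpers and linkify modules. A minimal sketch of how the wikilink helpers compose, run against the definitions added above (the pad names are made up for illustration):

    src = "See [[Another Pad?]] and [[notes/timeslider]]."
    html, linked = linkify(src)
    # linked == ['Another Pad', 'notes']  -- the '?' and '/...' tails are stripped
    # html embeds: [[<a class="wikilink" href="Another_Pad.html">Another Pad</a>]]
    # filename_to_padid inverts urlify:
    assert filename_to_padid(urlify("Another Pad")) == "Another Pad"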
@@ -114,7 +188,13 @@ while len(todo) > 0:
 # | | | | | |  __/ || (_| |
 # |_| |_| |_|\___|\__\__,_|

-    meta_out = "{0}/{1}".format(args.path, urlify(padid, ext=".json"))
+    meta_url = urlify(padid, ext=".json")
+    meta_out = "{0}/{1}".format(args.path, meta_url.encode("utf-8"))
+    raw_url = urlify(padid, ext=".txt")
+    raw_out = "{0}/{1}".format(args.path, raw_url.encode("utf-8"))
+    colors_url = urlify(padid, ext=".html")
+    colors_out = "{0}/{1}".format(args.path, colors_url.encode("utf-8"))
     if not args.hidepaths:
         print (meta_out, file=sys.stderr)
     if not args.pretend:
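The *_url values are computed once and reused both for the on-disk *_out paths here and, below, for cross-references in the metadata and template. A sketch of the mapping, with an illustrative pad name and output directory:

    # Illustrative values: the real padid and args.path come from the crawl loop.
    padid = u"my pad"
    path = "./dump"
    for ext in (".json", ".txt", ".html"):
        url = urlify(padid, ext=ext)   # my_pad.json / my_pad.txt / my_pad.html
        print("{0}/{1}".format(path, url.encode("utf-8")))  # ./dump/my_pad.json ...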
@@ -137,7 +217,9 @@ while len(todo) > 0:
     if args.showurls:
         print (authors_url, file=sys.stderr)
     meta['author_ids'] = json.load(urlopen(authors_url))['data']['authorIDs']
+    meta['colors'] = colors_url
+    meta['raw'] = raw_url
+    meta['meta'] = meta_url
     with open(meta_out, "w") as f:
         json.dump(meta, f)
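Because the metadata now records its own filename along with its raw and colors siblings, a consumer can start from any pad's .json and reach the other renderings. What a dumped file might look like when read back (path and values are illustrative):

    import json
    with open("dump/my_pad.json") as f:   # hypothetical output path
        meta = json.load(f)
    # e.g. {"meta": "my_pad.json", "raw": "my_pad.txt", "colors": "my_pad.html",
    #       "author_ids": [...], "total_revisions": ..., ...}
    print(meta["raw"])   # -> "my_pad.txt"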
@@ -146,7 +228,6 @@ while len(todo) > 0:
 # | | | (_| |\ V  V /
 # |_|  \__,_| \_/\_/

-    raw_out = "{0}/{1}".format(args.path, urlify(padid, ext=".txt"))
     if not args.hidepaths:
         print (raw_out, file=sys.stderr)
     text_url = apiurl+"getText?"+urlencode(data)
@@ -171,7 +252,6 @@ while len(todo) > 0:
 # | (_| (_) | | (_) | |  \__ \
 #  \___\___/|_|\___/|_|  |___/

-    colors_out = "{0}/{1}".format(args.path, urlify(padid, ext=".html"))
     if not args.hidepaths:
         print (colors_out, file=sys.stderr)
     data['startRev'] = "0"
@@ -272,7 +352,11 @@ while len(todo) > 0:
             style = style,
             revision = meta['total_revisions'],
             padid = padid,
-            timestamp = datetime.now()
+            timestamp = datetime.now(),
+            meta_url = meta_url,
+            raw_url = raw_url,
+            colors_url = colors_url,
+            lastedited = meta['lastedited']
         ).encode("utf-8"))
 # _
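With the render call now receiving the sibling URLs and last-edit time, the pad template can link a pad's views together. A minimal jinja2 fragment exercising the new variables (the markup is an assumption, not part of this commit; the real template ships separately):

    import jinja2
    # Assumed template snippet consuming the new context variables.
    tpl = jinja2.Template(u'last edited {{ lastedited }} | '
                          u'<a href="{{ colors_url }}">colors</a> '
                          u'<a href="{{ raw_url }}">raw</a> '
                          u'<a href="{{ meta_url }}">meta</a>')
    print(tpl.render(meta_url="my_pad.json", raw_url="my_pad.txt",
                     colors_url="my_pad.html", lastedited="2016-01-01T12:00:00"))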
