@@ -10,13 +10,87 @@ from datetime import datetime
 from xml.etree import cElementTree as ET
 
 from urllib import urlencode
 
 from urllib2 import urlopen, HTTPError, URLError
 
-# local mods
-from et_helpers import trim_removed_spans, contents, set_text_contents, text_contents
-from linkify import linkify, urlify, filename_to_padid
-
 # external dependencies (use pip to install these)
 import html5lib, jinja2
 
+def filename_to_padid (t):
+    t = t.replace("_", " ")
+    t = re.sub(r"\.html$", "", t)
+    return t
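+# e.g. filename_to_padid("my_pad.html") -> "my pad"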
+
+def normalize_pad_name (n):
+    if '?' in n:
+        n = n.split('?', 1)[0]
+    if '/' in n:
+        n = n.split('/', 1)[0]
+    return n
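+# e.g. normalize_pad_name("pad/timeslider?rev=0") -> "pad"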
+
+def urlify (t, ext=".html"):
+    return t.replace(" ", "_") + ext
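+# e.g. urlify("pad name") -> "pad_name.html"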
+
+def linkify (src, urlify=urlify):
+    collect = []
+
+    def s (m):
+        contents = strip_tags(m.group(1))
+        contents = normalize_pad_name(contents)
+        collect.append(contents)
+        link = urlify(contents)
+        # link = link.split("?", 1)[0]
+        return "[[<a class=\"wikilink\" href=\"{0}\">{1}</a>]]".format(link, contents)
+
+    # src = re.sub(r"\[\[([\w_\- ,]+?)\]\]", s, src)
+    ## question marks are ignored by etherpad, so split/strip it
+    ## strip slashes as well!! (/timeslider)
+    src = re.sub(r"\[\[(.+?)\]\]", s, src)
+    return (src, collect)
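+# e.g. linkify("see [[pad name]]") ->
+#   ('see [[<a class="wikilink" href="pad_name.html">pad name</a>]]', ['pad name'])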
+
+def strip_tags (text):
+    return re.sub(r"<.*?>", "", text)
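+# e.g. strip_tags("<b>pad</b> name") -> "pad name"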
+
+def set_text_contents (element, text):
+    """ ok this isn't really general, but works for singly wrapped elements """
+    while len(element) == 1:
+        element = element[0]
+    element.text = text
+
+def text_contents (element):
+    # recursively collect text, including children's text and tails
+    return (element.text or '') + ''.join([text_contents(c) for c in element]) + (element.tail or '')
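+# e.g. on <p>a<b>b</b>c</p> this returns "abc"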
+
+def contents (element, method="html"):
+    # inner markup: leading text plus each child serialized (tails included)
+    return (element.text or '') + ''.join([ET.tostring(c, method=method) for c in element])
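+# e.g. on <p>a<b>b</b>c</p> this returns "a<b>b</b>c"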
+
+def get_parent(tree, elt):
+    for parent in tree.iter():
+        for child in parent:
+            if child == elt:
+                return parent
+
+def remove_recursive (tree, elt):
+    """ Remove element and (any resulting) empty containing elements """
+    p = get_parent(tree, elt)
+    if p is not None:
+        p.remove(elt)
+        if len(p) == 0 and (p.text is None or p.text.strip() == ""):
+            # print ("empty parent", p, file=sys.stderr)
+            remove_recursive(tree, p)
+
+def trim_removed_spans (t):
+    # remove <span class="removed"> and empty parents
+    for n in t.findall(".//span[@class='removed']"):
+        remove_recursive(t, n)
+    # then strip any leading br's from body
+    while True:
+        body = t.find("./body")
+        if body is None or len(body) == 0:
+            break
+        tag = body[0]
+        if tag.tag == "br":
+            remove_recursive(t, tag)
+        else:
+            break
+
+def get_template_env (tpath=None):
+    paths = []
+    if tpath and os.path.isdir(tpath):
@@ -114,7 +188,13 @@ while len(todo) > 0:
     # | | | | | |  __/ || (_| |
     # |_| |_| |_|\___|\__\__,_|
 
-    meta_out = "{0}/{1}".format(args.path, urlify(padid, ext=".json"))
+    meta_url = urlify(padid, ext=".json")
+    meta_out = "{0}/{1}".format(args.path, meta_url.encode("utf-8"))
+    raw_url = urlify(padid, ext=".txt")
+    raw_out = "{0}/{1}".format(args.path, raw_url.encode("utf-8"))
+    colors_url = urlify(padid, ext=".html")
+    colors_out = "{0}/{1}".format(args.path, colors_url.encode("utf-8"))
+
     if not args.hidepaths:
         print (meta_out, file=sys.stderr)
     if not args.pretend:
@@ -137,7 +217,9 @@ while len(todo) > 0:
         if args.showurls:
             print (authors_url, file=sys.stderr)
         meta['author_ids'] = json.load(urlopen(authors_url))['data']['authorIDs']
 
+        meta['colors'] = colors_url
+        meta['raw'] = raw_url
+        meta['meta'] = meta_url
         with open(meta_out, "w") as f:
             json.dump(meta, f)
@@ -146,7 +228,6 @@ while len(todo) > 0:
     # | | | (_| |\ V  V /
     # |_|  \__,_| \_/\_/
 
-    raw_out = "{0}/{1}".format(args.path, urlify(padid, ext=".txt"))
     if not args.hidepaths:
         print (raw_out, file=sys.stderr)
     text_url = apiurl+"getText?"+urlencode(data)
@@ -171,7 +252,6 @@ while len(todo) > 0:
     # | (_| (_) | | (_) | |  \__ \
     #  \___\___/|_|\___/|_|  |___/
 
-    colors_out = "{0}/{1}".format(args.path, urlify(padid, ext=".html"))
     if not args.hidepaths:
         print (colors_out, file=sys.stderr)
     data['startRev'] = "0"
@@ -272,7 +352,11 @@ while len(todo) > 0:
             style = style,
             revision = meta['total_revisions'],
             padid = padid,
-            timestamp = datetime.now()
+            timestamp = datetime.now(),
+            meta_url = meta_url,
+            raw_url = raw_url,
+            colors_url = colors_url,
             lastedited = meta['lastedited']
         ).encode("utf-8"))
 
     # _