diff --git a/dump_html.py b/dump_html.py index 87b20ff..7432968 100755 --- a/dump_html.py +++ b/dump_html.py @@ -1,27 +1,17 @@ #!/usr/bin/env python from __future__ import print_function from argparse import ArgumentParser -import json, sys, os +import json, sys, os, re from datetime import datetime import html5lib from urllib import urlencode from urllib2 import urlopen, HTTPError, URLError from xml.etree import cElementTree as ET -from trim import trim_removed_spans, contents -from linkify import linkify, urlify +from trim import trim_removed_spans, contents, set_text_contents, text_contents +from linkify import linkify, urlify, filename_to_padid import jinja2 -p = ArgumentParser("") -p.add_argument("padid", help="the padid") -p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json") -p.add_argument("--path", default="output", help="path to save files, default: output") -p.add_argument("--verbose", default=False, action="store_true") -p.add_argument("--limit", type=int, default=None) -p.add_argument("--templates", default="templates") -p.add_argument("--template", default="pad_html.html") -args = p.parse_args() - def get_template_env (tpath=None): paths = [] if tpath and os.path.isdir(tpath): @@ -31,10 +21,40 @@ def get_template_env (tpath=None): env = jinja2.Environment(loader=loader) return env + +p = ArgumentParser("") +p.add_argument("padid", help="the padid") +p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json") +p.add_argument("--output", default="output", help="path to save files, default: output") +p.add_argument("--verbose", default=False, action="store_true") +p.add_argument("--limit", type=int, default=None) +p.add_argument("--templates", default="templates") +p.add_argument("--template", default="pad_html.html") + +p.add_argument("--padlink", default=[], action="append", help="give a pad link pattern, example: 'http\:\/\/10\.1\.10\.1/p/(.*)'") +p.add_argument("--linksearch", default=[], action="append", help="specify a link pattern to search for") +p.add_argument("--linkreplace", default=[], action="append", help="specify a replacement pattern to replace preceding linksearch") + +args = p.parse_args() with open(args.padinfo) as f: info = json.load(f) + apiurl = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info) +padlinkpats = [] +if "padlink" in info: + if type(info['padlink']) == list: + padlinkpats.extend(info['padlink']) + else: + padlinkpats.append(info['padlink']) +padlinkpats.extend(args.padlink) + +linkpats = [] # [(pat, "\\1.html") for pat in padlinkpats] +linkpats.extend(zip(args.linksearch, args.linkreplace)) + +if args.verbose: + print ("using padlinkpats", padlinkpats) + todo = [args.padid] done = set() count = 0 @@ -51,7 +71,9 @@ while len(todo) > 0: data['apikey'] = info['apikey'] data['padID'] = padid.encode("utf-8") - out = "{0}/{1}".format(args.path, urlify(padid)) + if args.verbose: + print ("PADID \"{0}\"".format(padid).encode("utf-8"), file=sys.stderr) + out = "{0}/{1}".format(args.output, urlify(padid)) print ("{0}".format(out), file=sys.stderr) total_revisions = apiurl+'getRevisionsCount?'+urlencode(data) @@ -66,6 +88,9 @@ while len(todo) > 0: trim_removed_spans(t) html = ET.tostring(t, method="html") + # Stage 1: Process as text + # Process [[wikilink]] style links + # and add linked page names to spider todo list html, links = linkify(html) for l in links: if l not in todo and l not in done: @@ -73,20 +98,58 @@ while len(todo) > 0: print (" link: {0}".format(l), file=sys.stderr) todo.append(l) + # Stage 2: Process as ElementTree + # + t = html5lib.parse(html, namespaceHTMLElements=False) + # apply linkpats + for a in t.findall(".//a"): + href = a.attrib.get("href") + original_href = href + if href: + # if args.verbose: + # print ("searching for PADLINK: {0}".format(href)) + for pat in padlinkpats: + if re.search(pat, href) != None: + # if args.verbose: + # print (" found PADLINK: {0}".format(href)) + href = re.sub(pat, "\\1.html", href) + padid = filename_to_padid(href) + set_text_contents(a, "[[{0}]]".format(padid)) + if padid not in todo and padid not in done: + if args.verbose: + print (" link: {0}".format(padid), file=sys.stderr) + todo.append(padid) + # apply linkpats + for s, r in linkpats: + href = re.sub(s, r, href) + if href != original_href: + old_contents = text_contents(a) + # print ("OLD_CONTENTS {0}".format(old_contents)) + if old_contents == original_href: + if args.verbose: + print (" Updating href IN TEXT", file=sys.stderr) + set_text_contents(a, href) + + if original_href != href: + if args.verbose: + print (" Changed href from {0} to {1}".format(original_href, href), file=sys.stderr) + a.attrib['href'] = href + + # extract the style tag (with authorship colors) + style = t.find(".//style") + if style != None: + style = ET.tostring(style, method="html") + else: + style = "" + # and extract the contents of the body + html = contents(t.find(".//body")) + + try: - os.makedirs(args.path) + os.makedirs(args.output) except OSError: pass with open(out, "w") as f: - t = html5lib.parse(html, namespaceHTMLElements=False) - style = t.find(".//style") - if style != None: - style = ET.tostring(style, method="html") - else: - style = "" - body = t.find(".//body") - html = contents(body) - # f.write(html.encode("utf-8")) f.write(template.render( html = html, diff --git a/linkify.py b/linkify.py index 359a0dd..852435d 100644 --- a/linkify.py +++ b/linkify.py @@ -2,20 +2,29 @@ from __future__ import print_function import re, sys +def strip_tags (text): + return re.sub(r"<.*?>", "", text) + def urlify (t): return t.replace(" ", "_") + ".html" +def filename_to_padid (t): + t = t.replace("_", " ") + t = re.sub(r"\.html$", "", t) + return t + def linkify (src, urlify=urlify): collect = [] def s (m): - contents = m.group(1) + contents = strip_tags(m.group(1)) collect.append(contents) link = urlify(contents) return "[[{1}]]".format(link, contents) - src = re.sub(r"\[\[([\w_\- ]+?)\]\]", s, src) + # src = re.sub(r"\[\[([\w_\- ,]+?)\]\]", s, src) + src = re.sub(r"\[\[(.+?)\]\]", s, src) return (src, collect) diff --git a/trim.py b/trim.py index 085cc96..912fe1b 100644 --- a/trim.py +++ b/trim.py @@ -6,6 +6,15 @@ from xml.etree import cElementTree as ET def contents (element, method="html"): return (element.text or '') + ''.join([ET.tostring(c, method=method) for c in element]) +def text_contents (element): + return (element.text or '') + ''.join([text_contents(c) for c in element]) + (element.tail or '') + +def set_text_contents (element, text): + """ ok this isn't really general, but works for singly wrapped elements """ + while len(element) == 1: + element = element[0] + element.text = text + def iterparent(tree): for parent in tree.iter(): for child in parent: