updated dump_html to fix links

parent fb53c16ca6
commit d125f809fc

dump_html.py (111 changed lines)
@@ -1,27 +1,17 @@
 #!/usr/bin/env python
 from __future__ import print_function
 from argparse import ArgumentParser
-import json, sys, os
+import json, sys, os, re
 from datetime import datetime
 import html5lib
 from urllib import urlencode
 from urllib2 import urlopen, HTTPError, URLError
 from xml.etree import cElementTree as ET
-from trim import trim_removed_spans, contents
-from linkify import linkify, urlify
+from trim import trim_removed_spans, contents, set_text_contents, text_contents
+from linkify import linkify, urlify, filename_to_padid
 import jinja2
 
-p = ArgumentParser("")
-p.add_argument("padid", help="the padid")
-p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json")
-p.add_argument("--path", default="output", help="path to save files, default: output")
-p.add_argument("--verbose", default=False, action="store_true")
-p.add_argument("--limit", type=int, default=None)
-p.add_argument("--templates", default="templates")
-p.add_argument("--template", default="pad_html.html")
-args = p.parse_args()
-
 
 def get_template_env (tpath=None):
     paths = []
     if tpath and os.path.isdir(tpath):
@@ -31,10 +21,40 @@ def get_template_env (tpath=None):
     env = jinja2.Environment(loader=loader)
     return env
 
+
+p = ArgumentParser("")
+p.add_argument("padid", help="the padid")
+p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json")
+p.add_argument("--output", default="output", help="path to save files, default: output")
+p.add_argument("--verbose", default=False, action="store_true")
+p.add_argument("--limit", type=int, default=None)
+p.add_argument("--templates", default="templates")
+p.add_argument("--template", default="pad_html.html")
+
+p.add_argument("--padlink", default=[], action="append", help="give a pad link pattern, example: 'http\:\/\/10\.1\.10\.1/p/(.*)'")
+p.add_argument("--linksearch", default=[], action="append", help="specify a link pattern to search for")
+p.add_argument("--linkreplace", default=[], action="append", help="specify a replacement pattern to replace preceding linksearch")
+
+args = p.parse_args()
 with open(args.padinfo) as f:
     info = json.load(f)
 
 apiurl = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info)
+
+padlinkpats = []
+if "padlink" in info:
+    if type(info['padlink']) == list:
+        padlinkpats.extend(info['padlink'])
+    else:
+        padlinkpats.append(info['padlink'])
+padlinkpats.extend(args.padlink)
+
+linkpats = [] # [(pat, "\\1.html") for pat in padlinkpats]
+linkpats.extend(zip(args.linksearch, args.linkreplace))
+
+if args.verbose:
+    print ("using padlinkpats", padlinkpats)
+
 todo = [args.padid]
 done = set()
 count = 0
@@ -51,7 +71,9 @@ while len(todo) > 0:
     data['apikey'] = info['apikey']
     data['padID'] = padid.encode("utf-8")
 
-    out = "{0}/{1}".format(args.path, urlify(padid))
+    if args.verbose:
+        print ("PADID \"{0}\"".format(padid).encode("utf-8"), file=sys.stderr)
+    out = "{0}/{1}".format(args.output, urlify(padid))
     print ("{0}".format(out), file=sys.stderr)
 
     total_revisions = apiurl+'getRevisionsCount?'+urlencode(data)
@@ -66,6 +88,9 @@ while len(todo) > 0:
     trim_removed_spans(t)
     html = ET.tostring(t, method="html")
 
+    # Stage 1: Process as text
+    # Process [[wikilink]] style links
+    # and add linked page names to spider todo list
     html, links = linkify(html)
     for l in links:
         if l not in todo and l not in done:
@@ -73,20 +98,58 @@ while len(todo) > 0:
                 print ("  link: {0}".format(l), file=sys.stderr)
             todo.append(l)
 
+    # Stage 2: Process as ElementTree
+    #
+    t = html5lib.parse(html, namespaceHTMLElements=False)
+    # apply padlinkpats
+    for a in t.findall(".//a"):
+        href = a.attrib.get("href")
+        original_href = href
+        if href:
+            # if args.verbose:
+            #     print ("searching for PADLINK: {0}".format(href))
+            for pat in padlinkpats:
+                if re.search(pat, href) != None:
+                    # if args.verbose:
+                    #     print ("  found PADLINK: {0}".format(href))
+                    href = re.sub(pat, "\\1.html", href)
+                    padid = filename_to_padid(href)
+                    set_text_contents(a, "[[{0}]]".format(padid))
+                    if padid not in todo and padid not in done:
+                        if args.verbose:
+                            print ("  link: {0}".format(padid), file=sys.stderr)
+                        todo.append(padid)
+            # apply linkpats
+            for s, r in linkpats:
+                href = re.sub(s, r, href)
+            if href != original_href:
+                old_contents = text_contents(a)
+                # print ("OLD_CONTENTS {0}".format(old_contents))
+                if old_contents == original_href:
+                    if args.verbose:
+                        print ("  Updating href IN TEXT", file=sys.stderr)
+                    set_text_contents(a, href)
+
+        if original_href != href:
+            if args.verbose:
+                print ("  Changed href from {0} to {1}".format(original_href, href), file=sys.stderr)
+            a.attrib['href'] = href
+
+    # extract the style tag (with authorship colors)
+    style = t.find(".//style")
+    if style != None:
+        style = ET.tostring(style, method="html")
+    else:
+        style = ""
+    # and extract the contents of the body
+    html = contents(t.find(".//body"))
+
     try:
-        os.makedirs(args.path)
+        os.makedirs(args.output)
     except OSError:
         pass
     with open(out, "w") as f:
-        t = html5lib.parse(html, namespaceHTMLElements=False)
-        style = t.find(".//style")
-        if style != None:
-            style = ET.tostring(style, method="html")
-        else:
-            style = ""
-        body = t.find(".//body")
-        html = contents(body)
 
         # f.write(html.encode("utf-8"))
         f.write(template.render(
             html = html,
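The new options make the spider's link rewriting configurable from the command line. A hypothetical invocation as a sketch (the pad name, host, and linksearch/linkreplace values are invented for illustration; the --padlink pattern is the example from the option's own help text):

    python dump_html.py "start" --padinfo padinfo.json --output output \
        --padlink 'http\:\/\/10\.1\.10\.1/p/(.*)' \
        --linksearch 'http://example.org/pad/(.*)' \
        --linkreplace '\1.html'

Per the diff, each --padlink pattern is a regex whose first capture group names a pad: matching hrefs are rewritten to '\1.html', the link text is relabeled as [[padid]], and that pad is queued for spidering. Each --linksearch/--linkreplace pair is a plain re.sub rewrite applied to the remaining hrefs.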
linkify.py (13 changed lines)
@@ -2,20 +2,29 @@ from __future__ import print_function
 import re, sys
 
 
+def strip_tags (text):
+    return re.sub(r"<.*?>", "", text)
+
 def urlify (t):
     return t.replace(" ", "_") + ".html"
 
+def filename_to_padid (t):
+    t = t.replace("_", " ")
+    t = re.sub(r"\.html$", "", t)
+    return t
+
 def linkify (src, urlify=urlify):
 
     collect = []
 
     def s (m):
-        contents = m.group(1)
+        contents = strip_tags(m.group(1))
         collect.append(contents)
         link = urlify(contents)
         return "[[<a class=\"wikilink\" href=\"{0}\">{1}</a>]]".format(link, contents)
 
-    src = re.sub(r"\[\[([\w_\- ]+?)\]\]", s, src)
+    # src = re.sub(r"\[\[([\w_\- ,]+?)\]\]", s, src)
+    src = re.sub(r"\[\[(.+?)\]\]", s, src)
     return (src, collect)
 
 
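A quick sketch of how the new linkify.py helpers fit together: filename_to_padid undoes urlify, and strip_tags cleans markup out of the captured wikilink text, presumably so links whose text is split across inline tags (e.g. authorship spans) still resolve, now that the looser [[(.+?)]] pattern is used. Interpreter session for illustration:

    >>> from linkify import urlify, filename_to_padid, linkify
    >>> urlify("my pad")
    'my_pad.html'
    >>> filename_to_padid("my_pad.html")
    'my pad'
    >>> linkify('see [[my <em>pad</em>]]')
    ('see [[<a class="wikilink" href="my_pad.html">my pad</a>]]', ['my pad'])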
trim.py (9 changed lines)
@@ -6,6 +6,15 @@ from xml.etree import cElementTree as ET
 def contents (element, method="html"):
     return (element.text or '') + ''.join([ET.tostring(c, method=method) for c in element])
 
+def text_contents (element):
+    return (element.text or '') + ''.join([text_contents(c) for c in element]) + (element.tail or '')
+
+def set_text_contents (element, text):
+    """ ok this isn't really general, but works for singly wrapped elements """
+    while len(element) == 1:
+        element = element[0]
+    element.text = text
+
 def iterparent(tree):
     for parent in tree.iter():
         for child in parent:
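The two new trim.py helpers are what dump_html.py uses to relabel link text in place. A small illustration (the markup is invented; note that set_text_contents only descends chains of single-child wrappers, as its docstring admits):

    >>> from xml.etree import cElementTree as ET
    >>> from trim import text_contents, set_text_contents
    >>> a = ET.fromstring('<a href="old.html"><span>old label</span></a>')
    >>> set_text_contents(a, "[[new label]]")
    >>> text_contents(a)
    '[[new label]]'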