diff --git a/dump_html.py b/dump_html.py
index 87b20ff..7432968 100755
--- a/dump_html.py
+++ b/dump_html.py
@@ -1,27 +1,17 @@
#!/usr/bin/env python
from __future__ import print_function
from argparse import ArgumentParser
-import json, sys, os
+import json, sys, os, re
from datetime import datetime
import html5lib
from urllib import urlencode
from urllib2 import urlopen, HTTPError, URLError
from xml.etree import cElementTree as ET
-from trim import trim_removed_spans, contents
-from linkify import linkify, urlify
+from trim import trim_removed_spans, contents, set_text_contents, text_contents
+from linkify import linkify, urlify, filename_to_padid
import jinja2
-p = ArgumentParser("")
-p.add_argument("padid", help="the padid")
-p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json")
-p.add_argument("--path", default="output", help="path to save files, default: output")
-p.add_argument("--verbose", default=False, action="store_true")
-p.add_argument("--limit", type=int, default=None)
-p.add_argument("--templates", default="templates")
-p.add_argument("--template", default="pad_html.html")
-args = p.parse_args()
-
def get_template_env (tpath=None):
paths = []
if tpath and os.path.isdir(tpath):
@@ -31,10 +21,40 @@ def get_template_env (tpath=None):
env = jinja2.Environment(loader=loader)
return env
+
+p = ArgumentParser("")
+p.add_argument("padid", help="the padid")
+p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json")
+p.add_argument("--output", default="output", help="path to save files, default: output")
+p.add_argument("--verbose", default=False, action="store_true")
+p.add_argument("--limit", type=int, default=None)
+p.add_argument("--templates", default="templates")
+p.add_argument("--template", default="pad_html.html")
+
+p.add_argument("--padlink", default=[], action="append", help="give a pad link pattern, example: 'http\:\/\/10\.1\.10\.1/p/(.*)'")
+p.add_argument("--linksearch", default=[], action="append", help="specify a link pattern to search for")
+p.add_argument("--linkreplace", default=[], action="append", help="specify a replacement pattern to replace preceding linksearch")
+
+args = p.parse_args()
with open(args.padinfo) as f:
info = json.load(f)
+
apiurl = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info)
+padlinkpats = []
+if "padlink" in info:
+ if type(info['padlink']) == list:
+ padlinkpats.extend(info['padlink'])
+ else:
+ padlinkpats.append(info['padlink'])
+padlinkpats.extend(args.padlink)
+
+linkpats = [] # [(pat, "\\1.html") for pat in padlinkpats]
+linkpats.extend(zip(args.linksearch, args.linkreplace))
+
+if args.verbose:
+ print ("using padlinkpats", padlinkpats)
+
todo = [args.padid]
done = set()
count = 0
@@ -51,7 +71,9 @@ while len(todo) > 0:
data['apikey'] = info['apikey']
data['padID'] = padid.encode("utf-8")
- out = "{0}/{1}".format(args.path, urlify(padid))
+ if args.verbose:
+ print ("PADID \"{0}\"".format(padid).encode("utf-8"), file=sys.stderr)
+ out = "{0}/{1}".format(args.output, urlify(padid))
print ("{0}".format(out), file=sys.stderr)
total_revisions = apiurl+'getRevisionsCount?'+urlencode(data)
@@ -66,6 +88,9 @@ while len(todo) > 0:
trim_removed_spans(t)
html = ET.tostring(t, method="html")
+ # Stage 1: Process as text
+ # Process [[wikilink]] style links
+ # and add linked page names to spider todo list
html, links = linkify(html)
for l in links:
if l not in todo and l not in done:
@@ -73,20 +98,58 @@ while len(todo) > 0:
print (" link: {0}".format(l), file=sys.stderr)
todo.append(l)
+ # Stage 2: Process as ElementTree
+ #
+ t = html5lib.parse(html, namespaceHTMLElements=False)
+    # apply padlinkpats: rewrite recognized pad URLs to local .html filenames
+ for a in t.findall(".//a"):
+ href = a.attrib.get("href")
+ original_href = href
+ if href:
+ # if args.verbose:
+ # print ("searching for PADLINK: {0}".format(href))
+ for pat in padlinkpats:
+ if re.search(pat, href) != None:
+ # if args.verbose:
+ # print (" found PADLINK: {0}".format(href))
+ href = re.sub(pat, "\\1.html", href)
+ padid = filename_to_padid(href)
+ set_text_contents(a, "[[{0}]]".format(padid))
+ if padid not in todo and padid not in done:
+ if args.verbose:
+ print (" link: {0}".format(padid), file=sys.stderr)
+ todo.append(padid)
+ # apply linkpats
+ for s, r in linkpats:
+ href = re.sub(s, r, href)
+ if href != original_href:
+ old_contents = text_contents(a)
+ # print ("OLD_CONTENTS {0}".format(old_contents))
+ if old_contents == original_href:
+ if args.verbose:
+ print (" Updating href IN TEXT", file=sys.stderr)
+ set_text_contents(a, href)
+
+ if original_href != href:
+ if args.verbose:
+ print (" Changed href from {0} to {1}".format(original_href, href), file=sys.stderr)
+ a.attrib['href'] = href
+
+ # extract the style tag (with authorship colors)
+ style = t.find(".//style")
+ if style != None:
+ style = ET.tostring(style, method="html")
+ else:
+ style = ""
+ # and extract the contents of the body
+ html = contents(t.find(".//body"))
+
+
try:
- os.makedirs(args.path)
+ os.makedirs(args.output)
except OSError:
pass
with open(out, "w") as f:
- t = html5lib.parse(html, namespaceHTMLElements=False)
- style = t.find(".//style")
- if style != None:
- style = ET.tostring(style, method="html")
- else:
- style = ""
- body = t.find(".//body")
- html = contents(body)
-
# f.write(html.encode("utf-8"))
f.write(template.render(
html = html,
diff --git a/linkify.py b/linkify.py
index 359a0dd..852435d 100644
--- a/linkify.py
+++ b/linkify.py
@@ -2,20 +2,29 @@ from __future__ import print_function
import re, sys
+def strip_tags (text):
+ return re.sub(r"<.*?>", "", text)
+
def urlify (t):
return t.replace(" ", "_") + ".html"
+def filename_to_padid (t):
+ t = t.replace("_", " ")
+ t = re.sub(r"\.html$", "", t)
+ return t
+
def linkify (src, urlify=urlify):
collect = []
def s (m):
- contents = m.group(1)
+ contents = strip_tags(m.group(1))
collect.append(contents)
link = urlify(contents)
return "[[{1}]]".format(link, contents)
- src = re.sub(r"\[\[([\w_\- ]+?)\]\]", s, src)
+ # src = re.sub(r"\[\[([\w_\- ,]+?)\]\]", s, src)
+ src = re.sub(r"\[\[(.+?)\]\]", s, src)
return (src, collect)
diff --git a/trim.py b/trim.py
index 085cc96..912fe1b 100644
--- a/trim.py
+++ b/trim.py
@@ -6,6 +6,15 @@ from xml.etree import cElementTree as ET
def contents (element, method="html"):
return (element.text or '') + ''.join([ET.tostring(c, method=method) for c in element])
+def text_contents (element):
+ return (element.text or '') + ''.join([text_contents(c) for c in element]) + (element.tail or '')
+
+def set_text_contents (element, text):
+    """ Replace the text of the innermost element; only handles singly wrapped elements, not fully general. """
+ while len(element) == 1:
+ element = element[0]
+ element.text = text
+
def iterparent(tree):
for parent in tree.iter():
for child in parent: