updated dump_html to fix links

This commit is contained in:
Michael Murtaugh 2015-07-30 13:33:39 +02:00
parent fb53c16ca6
commit d125f809fc
3 changed files with 107 additions and 26 deletions

View File

@ -1,27 +1,17 @@
#!/usr/bin/env python
from __future__ import print_function
from argparse import ArgumentParser
import json, sys, os
import json, sys, os, re
from datetime import datetime
import html5lib
from urllib import urlencode
from urllib2 import urlopen, HTTPError, URLError
from xml.etree import cElementTree as ET
from trim import trim_removed_spans, contents
from linkify import linkify, urlify
from trim import trim_removed_spans, contents, set_text_contents, text_contents
from linkify import linkify, urlify, filename_to_padid
import jinja2
p = ArgumentParser("")
p.add_argument("padid", help="the padid")
p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json")
p.add_argument("--path", default="output", help="path to save files, default: output")
p.add_argument("--verbose", default=False, action="store_true")
p.add_argument("--limit", type=int, default=None)
p.add_argument("--templates", default="templates")
p.add_argument("--template", default="pad_html.html")
args = p.parse_args()
def get_template_env (tpath=None):
paths = []
if tpath and os.path.isdir(tpath):
@ -31,10 +21,40 @@ def get_template_env (tpath=None):
env = jinja2.Environment(loader=loader)
return env
p = ArgumentParser("")
p.add_argument("padid", help="the padid")
p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json")
p.add_argument("--output", default="output", help="path to save files, default: output")
p.add_argument("--verbose", default=False, action="store_true")
p.add_argument("--limit", type=int, default=None)
p.add_argument("--templates", default="templates")
p.add_argument("--template", default="pad_html.html")
p.add_argument("--padlink", default=[], action="append", help="give a pad link pattern, example: 'http\:\/\/10\.1\.10\.1/p/(.*)'")
p.add_argument("--linksearch", default=[], action="append", help="specify a link pattern to search for")
p.add_argument("--linkreplace", default=[], action="append", help="specify a replacement pattern to replace preceding linksearch")
args = p.parse_args()
with open(args.padinfo) as f:
info = json.load(f)
apiurl = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info)
padlinkpats = []
if "padlink" in info:
if type(info['padlink']) == list:
padlinkpats.extend(info['padlink'])
else:
padlinkpats.append(info['padlink'])
padlinkpats.extend(args.padlink)
linkpats = [] # [(pat, "\\1.html") for pat in padlinkpats]
linkpats.extend(zip(args.linksearch, args.linkreplace))
if args.verbose:
print ("using padlinkpats", padlinkpats)
todo = [args.padid]
done = set()
count = 0
@ -51,7 +71,9 @@ while len(todo) > 0:
data['apikey'] = info['apikey']
data['padID'] = padid.encode("utf-8")
out = "{0}/{1}".format(args.path, urlify(padid))
if args.verbose:
print ("PADID \"{0}\"".format(padid).encode("utf-8"), file=sys.stderr)
out = "{0}/{1}".format(args.output, urlify(padid))
print ("{0}".format(out), file=sys.stderr)
total_revisions = apiurl+'getRevisionsCount?'+urlencode(data)
@ -66,6 +88,9 @@ while len(todo) > 0:
trim_removed_spans(t)
html = ET.tostring(t, method="html")
# Stage 1: Process as text
# Process [[wikilink]] style links
# and add linked page names to spider todo list
html, links = linkify(html)
for l in links:
if l not in todo and l not in done:
@ -73,20 +98,58 @@ while len(todo) > 0:
print (" link: {0}".format(l), file=sys.stderr)
todo.append(l)
# Stage 2: Process as ElementTree
#
t = html5lib.parse(html, namespaceHTMLElements=False)
# apply linkpats
for a in t.findall(".//a"):
href = a.attrib.get("href")
original_href = href
if href:
# if args.verbose:
# print ("searching for PADLINK: {0}".format(href))
for pat in padlinkpats:
if re.search(pat, href) != None:
# if args.verbose:
# print (" found PADLINK: {0}".format(href))
href = re.sub(pat, "\\1.html", href)
padid = filename_to_padid(href)
set_text_contents(a, "[[{0}]]".format(padid))
if padid not in todo and padid not in done:
if args.verbose:
print (" link: {0}".format(padid), file=sys.stderr)
todo.append(padid)
# apply linkpats
for s, r in linkpats:
href = re.sub(s, r, href)
if href != original_href:
old_contents = text_contents(a)
# print ("OLD_CONTENTS {0}".format(old_contents))
if old_contents == original_href:
if args.verbose:
print (" Updating href IN TEXT", file=sys.stderr)
set_text_contents(a, href)
if original_href != href:
if args.verbose:
print (" Changed href from {0} to {1}".format(original_href, href), file=sys.stderr)
a.attrib['href'] = href
# extract the style tag (with authorship colors)
style = t.find(".//style")
if style != None:
style = ET.tostring(style, method="html")
else:
style = ""
# and extract the contents of the body
html = contents(t.find(".//body"))
try:
os.makedirs(args.path)
os.makedirs(args.output)
except OSError:
pass
with open(out, "w") as f:
t = html5lib.parse(html, namespaceHTMLElements=False)
style = t.find(".//style")
if style != None:
style = ET.tostring(style, method="html")
else:
style = ""
body = t.find(".//body")
html = contents(body)
# f.write(html.encode("utf-8"))
f.write(template.render(
html = html,

View File

@ -2,20 +2,29 @@ from __future__ import print_function
import re, sys
def strip_tags (text):
return re.sub(r"<.*?>", "", text)
def urlify (t):
return t.replace(" ", "_") + ".html"
def filename_to_padid (t):
t = t.replace("_", " ")
t = re.sub(r"\.html$", "", t)
return t
def linkify (src, urlify=urlify):
collect = []
def s (m):
contents = m.group(1)
contents = strip_tags(m.group(1))
collect.append(contents)
link = urlify(contents)
return "[[<a class=\"wikilink\" href=\"{0}\">{1}</a>]]".format(link, contents)
src = re.sub(r"\[\[([\w_\- ]+?)\]\]", s, src)
# src = re.sub(r"\[\[([\w_\- ,]+?)\]\]", s, src)
src = re.sub(r"\[\[(.+?)\]\]", s, src)
return (src, collect)

View File

@ -6,6 +6,15 @@ from xml.etree import cElementTree as ET
def contents (element, method="html"):
return (element.text or '') + ''.join([ET.tostring(c, method=method) for c in element])
def text_contents (element):
return (element.text or '') + ''.join([text_contents(c) for c in element]) + (element.tail or '')
def set_text_contents (element, text):
""" ok this isn't really general, but works for singly wrapped elements """
while len(element) == 1:
element = element[0]
element.text = text
def iterparent(tree):
for parent in tree.iter():
for child in parent: