updated dump_html to fix links

This commit is contained in:
Michael Murtaugh 2015-07-30 13:33:39 +02:00
parent fb53c16ca6
commit d125f809fc
3 changed files with 107 additions and 26 deletions

View File

@ -1,27 +1,17 @@
#!/usr/bin/env python #!/usr/bin/env python
from __future__ import print_function from __future__ import print_function
from argparse import ArgumentParser from argparse import ArgumentParser
import json, sys, os import json, sys, os, re
from datetime import datetime from datetime import datetime
import html5lib import html5lib
from urllib import urlencode from urllib import urlencode
from urllib2 import urlopen, HTTPError, URLError from urllib2 import urlopen, HTTPError, URLError
from xml.etree import cElementTree as ET from xml.etree import cElementTree as ET
from trim import trim_removed_spans, contents from trim import trim_removed_spans, contents, set_text_contents, text_contents
from linkify import linkify, urlify from linkify import linkify, urlify, filename_to_padid
import jinja2 import jinja2
p = ArgumentParser("")
p.add_argument("padid", help="the padid")
p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json")
p.add_argument("--path", default="output", help="path to save files, default: output")
p.add_argument("--verbose", default=False, action="store_true")
p.add_argument("--limit", type=int, default=None)
p.add_argument("--templates", default="templates")
p.add_argument("--template", default="pad_html.html")
args = p.parse_args()
def get_template_env (tpath=None): def get_template_env (tpath=None):
paths = [] paths = []
if tpath and os.path.isdir(tpath): if tpath and os.path.isdir(tpath):
@ -31,10 +21,40 @@ def get_template_env (tpath=None):
env = jinja2.Environment(loader=loader) env = jinja2.Environment(loader=loader)
return env return env
p = ArgumentParser("")
p.add_argument("padid", help="the padid")
p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json")
p.add_argument("--output", default="output", help="path to save files, default: output")
p.add_argument("--verbose", default=False, action="store_true")
p.add_argument("--limit", type=int, default=None)
p.add_argument("--templates", default="templates")
p.add_argument("--template", default="pad_html.html")
p.add_argument("--padlink", default=[], action="append", help="give a pad link pattern, example: 'http\:\/\/10\.1\.10\.1/p/(.*)'")
p.add_argument("--linksearch", default=[], action="append", help="specify a link pattern to search for")
p.add_argument("--linkreplace", default=[], action="append", help="specify a replacement pattern to replace preceding linksearch")
args = p.parse_args()
with open(args.padinfo) as f: with open(args.padinfo) as f:
info = json.load(f) info = json.load(f)
apiurl = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info) apiurl = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info)
padlinkpats = []
if "padlink" in info:
if type(info['padlink']) == list:
padlinkpats.extend(info['padlink'])
else:
padlinkpats.append(info['padlink'])
padlinkpats.extend(args.padlink)
linkpats = [] # [(pat, "\\1.html") for pat in padlinkpats]
linkpats.extend(zip(args.linksearch, args.linkreplace))
if args.verbose:
print ("using padlinkpats", padlinkpats)
todo = [args.padid] todo = [args.padid]
done = set() done = set()
count = 0 count = 0
@ -51,7 +71,9 @@ while len(todo) > 0:
data['apikey'] = info['apikey'] data['apikey'] = info['apikey']
data['padID'] = padid.encode("utf-8") data['padID'] = padid.encode("utf-8")
out = "{0}/{1}".format(args.path, urlify(padid)) if args.verbose:
print ("PADID \"{0}\"".format(padid).encode("utf-8"), file=sys.stderr)
out = "{0}/{1}".format(args.output, urlify(padid))
print ("{0}".format(out), file=sys.stderr) print ("{0}".format(out), file=sys.stderr)
total_revisions = apiurl+'getRevisionsCount?'+urlencode(data) total_revisions = apiurl+'getRevisionsCount?'+urlencode(data)
@ -66,6 +88,9 @@ while len(todo) > 0:
trim_removed_spans(t) trim_removed_spans(t)
html = ET.tostring(t, method="html") html = ET.tostring(t, method="html")
# Stage 1: Process as text
# Process [[wikilink]] style links
# and add linked page names to spider todo list
html, links = linkify(html) html, links = linkify(html)
for l in links: for l in links:
if l not in todo and l not in done: if l not in todo and l not in done:
@ -73,20 +98,58 @@ while len(todo) > 0:
print (" link: {0}".format(l), file=sys.stderr) print (" link: {0}".format(l), file=sys.stderr)
todo.append(l) todo.append(l)
# Stage 2: Process as ElementTree
#
t = html5lib.parse(html, namespaceHTMLElements=False)
# apply linkpats
for a in t.findall(".//a"):
href = a.attrib.get("href")
original_href = href
if href:
# if args.verbose:
# print ("searching for PADLINK: {0}".format(href))
for pat in padlinkpats:
if re.search(pat, href) != None:
# if args.verbose:
# print (" found PADLINK: {0}".format(href))
href = re.sub(pat, "\\1.html", href)
padid = filename_to_padid(href)
set_text_contents(a, "[[{0}]]".format(padid))
if padid not in todo and padid not in done:
if args.verbose:
print (" link: {0}".format(padid), file=sys.stderr)
todo.append(padid)
# apply linkpats
for s, r in linkpats:
href = re.sub(s, r, href)
if href != original_href:
old_contents = text_contents(a)
# print ("OLD_CONTENTS {0}".format(old_contents))
if old_contents == original_href:
if args.verbose:
print (" Updating href IN TEXT", file=sys.stderr)
set_text_contents(a, href)
if original_href != href:
if args.verbose:
print (" Changed href from {0} to {1}".format(original_href, href), file=sys.stderr)
a.attrib['href'] = href
# extract the style tag (with authorship colors)
style = t.find(".//style")
if style != None:
style = ET.tostring(style, method="html")
else:
style = ""
# and extract the contents of the body
html = contents(t.find(".//body"))
try: try:
os.makedirs(args.path) os.makedirs(args.output)
except OSError: except OSError:
pass pass
with open(out, "w") as f: with open(out, "w") as f:
t = html5lib.parse(html, namespaceHTMLElements=False)
style = t.find(".//style")
if style != None:
style = ET.tostring(style, method="html")
else:
style = ""
body = t.find(".//body")
html = contents(body)
# f.write(html.encode("utf-8")) # f.write(html.encode("utf-8"))
f.write(template.render( f.write(template.render(
html = html, html = html,

View File

@ -2,20 +2,29 @@ from __future__ import print_function
import re, sys import re, sys
def strip_tags (text):
    """Return text with everything that looks like a markup tag removed.

    A tag is taken to be a "<", everything up to the next ">", and that ">".
    The match may span line breaks, so a tag whose attributes are wrapped
    over several lines is removed as well.
    """
    # [^>]* (unlike .*?) also matches newlines, so multi-line tags are
    # stripped without needing the re.DOTALL flag.
    return re.sub(r"<[^>]*>", "", text)
def urlify (t):
    """Build a file name from t: every space becomes "_" and ".html" is appended."""
    underscored = "_".join(t.split(" "))
    return underscored + ".html"
def filename_to_padid (t):
    """Turn a file name such as "My_Pad.html" into a pad name ("My Pad").

    A trailing ".html" is dropped and every underscore becomes a space
    (including underscores that were part of the name itself).
    """
    without_suffix = re.sub(r"\.html$", "", t)
    return without_suffix.replace("_", " ")
def linkify (src, urlify=urlify):
    """Turn [[wikilink]] markers in src into HTML links.

    Each "[[name]]" (any characters on a single line, matched non-greedily)
    is rewritten to '[[<a class="wikilink" href="...">name</a>]]'. The href
    is the result of calling urlify on the name after markup tags have been
    stripped from it.

    src -- the text to process
    urlify -- callable mapping a link name to an href value

    Returns a tuple (new_src, names), where names lists the link names in
    the order they were found (duplicates included).
    """
    collect = []
    def s (m):
        contents = strip_tags(m.group(1))
        collect.append(contents)
        # The href sits inside a double-quoted attribute, so a literal quote
        # in the name has to be escaped or it would end the attribute early.
        link = urlify(contents).replace('"', "&quot;")
        return "[[<a class=\"wikilink\" href=\"{0}\">{1}</a>]]".format(link, contents)
    # Any run of characters may be a link name (an earlier pattern only
    # allowed word characters, "_", "-" and spaces).
    src = re.sub(r"\[\[(.+?)\]\]", s, src)
    return (src, collect)

View File

@ -6,6 +6,15 @@ from xml.etree import cElementTree as ET
def contents (element, method="html"):
    """Serialize the inside of element: its leading text, then each child.

    element -- an ElementTree element
    method -- serialization method handed to ET.tostring for every child

    The element's own tag and its own tail are not part of the result.
    """
    leading_text = element.text or ''
    serialized_children = [ET.tostring(child, method=method) for child in element]
    return leading_text + ''.join(serialized_children)
def text_contents (element):
    """Return the text inside element with all markup removed.

    The result is element's leading text followed, for each child in order,
    by the child's own text contents and the child's tail. The tail of
    element itself (text that follows its closing tag) is NOT included,
    since that text belongs to the parent rather than to this element.
    """
    parts = [element.text or '']
    for child in element:
        parts.append(text_contents(child))
        # A child's tail sits between it and its next sibling, i.e. still
        # inside element, so it is part of element's text.
        parts.append(child.tail or '')
    return ''.join(parts)
def set_text_contents (element, text):
    """Set the text of the innermost element in a chain of single wrappers.

    Starting at element, step into the only child for as long as the current
    element has exactly one child, then assign text to the .text of the
    element reached. Not a general solution: existing children and any
    tails or wrapper text are left as they are.
    """
    innermost = element
    while len(innermost) == 1:
        innermost = innermost[0]
    innermost.text = text
def iterparent(tree): def iterparent(tree):
for parent in tree.iter(): for parent in tree.iter():
for child in parent: for child in parent: