updated dump_html to fix links

parent fb53c16ca6
commit d125f809fc

dump_html.py (111 changed lines)
@@ -1,27 +1,17 @@
 #!/usr/bin/env python
 from __future__ import print_function
 from argparse import ArgumentParser
-import json, sys, os
+import json, sys, os, re
 from datetime import datetime
 import html5lib
 from urllib import urlencode
 from urllib2 import urlopen, HTTPError, URLError
 from xml.etree import cElementTree as ET
-from trim import trim_removed_spans, contents
-from linkify import linkify, urlify
+from trim import trim_removed_spans, contents, set_text_contents, text_contents
+from linkify import linkify, urlify, filename_to_padid
 import jinja2
 
-p = ArgumentParser("")
-p.add_argument("padid", help="the padid")
-p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json")
-p.add_argument("--path", default="output", help="path to save files, default: output")
-p.add_argument("--verbose", default=False, action="store_true")
-p.add_argument("--limit", type=int, default=None)
-p.add_argument("--templates", default="templates")
-p.add_argument("--template", default="pad_html.html")
-args = p.parse_args()
-
 
 def get_template_env (tpath=None):
     paths = []
     if tpath and os.path.isdir(tpath):
@@ -31,10 +21,40 @@ def get_template_env (tpath=None):
     env = jinja2.Environment(loader=loader)
     return env
 
+
+p = ArgumentParser("")
+p.add_argument("padid", help="the padid")
+p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json")
+p.add_argument("--output", default="output", help="path to save files, default: output")
+p.add_argument("--verbose", default=False, action="store_true")
+p.add_argument("--limit", type=int, default=None)
+p.add_argument("--templates", default="templates")
+p.add_argument("--template", default="pad_html.html")
+
+p.add_argument("--padlink", default=[], action="append", help="give a pad link pattern, example: 'http\:\/\/10\.1\.10\.1/p/(.*)'")
+p.add_argument("--linksearch", default=[], action="append", help="specify a link pattern to search for")
+p.add_argument("--linkreplace", default=[], action="append", help="specify a replacement pattern to replace preceding linksearch")
+
+args = p.parse_args()
 with open(args.padinfo) as f:
     info = json.load(f)
 
 apiurl = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info)
+
+padlinkpats = []
+if "padlink" in info:
+    if type(info['padlink']) == list:
+        padlinkpats.extend(info['padlink'])
+    else:
+        padlinkpats.append(info['padlink'])
+padlinkpats.extend(args.padlink)
+
+linkpats = [] # [(pat, "\\1.html") for pat in padlinkpats]
+linkpats.extend(zip(args.linksearch, args.linkreplace))
+
+if args.verbose:
+    print ("using padlinkpats", padlinkpats)
+
 todo = [args.padid]
 done = set()
 count = 0
@@ -51,7 +71,9 @@ while len(todo) > 0:
     data['apikey'] = info['apikey']
     data['padID'] = padid.encode("utf-8")
 
-    out = "{0}/{1}".format(args.path, urlify(padid))
+    if args.verbose:
+        print ("PADID \"{0}\"".format(padid).encode("utf-8"), file=sys.stderr)
+    out = "{0}/{1}".format(args.output, urlify(padid))
     print ("{0}".format(out), file=sys.stderr)
 
     total_revisions = apiurl+'getRevisionsCount?'+urlencode(data)
@@ -66,6 +88,9 @@ while len(todo) > 0:
     trim_removed_spans(t)
     html = ET.tostring(t, method="html")
 
+    # Stage 1: Process as text
+    # Process [[wikilink]] style links
+    # and add linked page names to spider todo list
     html, links = linkify(html)
     for l in links:
         if l not in todo and l not in done:
@@ -73,20 +98,58 @@ while len(todo) > 0:
                 print ("  link: {0}".format(l), file=sys.stderr)
             todo.append(l)
 
+    # Stage 2: Process as ElementTree
+    #
+    t = html5lib.parse(html, namespaceHTMLElements=False)
+    # apply padlinkpats
+    for a in t.findall(".//a"):
+        href = a.attrib.get("href")
+        original_href = href
+        if href:
+            # if args.verbose:
+            #     print ("searching for PADLINK: {0}".format(href))
+            for pat in padlinkpats:
+                if re.search(pat, href) != None:
+                    # if args.verbose:
+                    #     print ("  found PADLINK: {0}".format(href))
+                    href = re.sub(pat, "\\1.html", href)
+                    padid = filename_to_padid(href)
+                    set_text_contents(a, "[[{0}]]".format(padid))
+                    if padid not in todo and padid not in done:
+                        if args.verbose:
+                            print ("  link: {0}".format(padid), file=sys.stderr)
+                        todo.append(padid)
+            # apply linkpats
+            for s, r in linkpats:
+                href = re.sub(s, r, href)
+            if href != original_href:
+                old_contents = text_contents(a)
+                # print ("OLD_CONTENTS {0}".format(old_contents))
+                if old_contents == original_href:
+                    if args.verbose:
+                        print ("  Updating href IN TEXT", file=sys.stderr)
+                    set_text_contents(a, href)
+
+        if original_href != href:
+            if args.verbose:
+                print ("  Changed href from {0} to {1}".format(original_href, href), file=sys.stderr)
+            a.attrib['href'] = href
+
+    # extract the style tag (with authorship colors)
+    style = t.find(".//style")
+    if style != None:
+        style = ET.tostring(style, method="html")
+    else:
+        style = ""
+    # and extract the contents of the body
+    html = contents(t.find(".//body"))
+
     try:
-        os.makedirs(args.path)
+        os.makedirs(args.output)
     except OSError:
         pass
     with open(out, "w") as f:
-        t = html5lib.parse(html, namespaceHTMLElements=False)
-        style = t.find(".//style")
-        if style != None:
-            style = ET.tostring(style, method="html")
-        else:
-            style = ""
-        body = t.find(".//body")
-        html = contents(body)
 
         # f.write(html.encode("utf-8"))
         f.write(template.render(
             html = html,
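The new options make the spider's link rewriting configurable from the command line. A hypothetical invocation as a sketch (the pad name, host, and linksearch/linkreplace values are invented for illustration; the --padlink pattern is the example from the option's own help text):

    python dump_html.py "start" --padinfo padinfo.json --output output \
        --padlink 'http\:\/\/10\.1\.10\.1/p/(.*)' \
        --linksearch 'http://example.org/pad/(.*)' \
        --linkreplace '\1.html'

Per the diff, each --padlink pattern is a regex whose first capture group names a pad: matching hrefs are rewritten to '\1.html', the link text is relabeled as [[padid]], and that pad is queued for spidering. Each --linksearch/--linkreplace pair is a plain re.sub rewrite applied to the remaining hrefs.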
linkify.py (13 changed lines)
@@ -2,20 +2,29 @@ from __future__ import print_function
 import re, sys
 
 
+def strip_tags (text):
+    return re.sub(r"<.*?>", "", text)
+
 def urlify (t):
     return t.replace(" ", "_") + ".html"
 
+def filename_to_padid (t):
+    t = t.replace("_", " ")
+    t = re.sub(r"\.html$", "", t)
+    return t
+
 def linkify (src, urlify=urlify):
 
     collect = []
 
     def s (m):
-        contents = m.group(1)
+        contents = strip_tags(m.group(1))
         collect.append(contents)
         link = urlify(contents)
         return "[[<a class=\"wikilink\" href=\"{0}\">{1}</a>]]".format(link, contents)
 
-    src = re.sub(r"\[\[([\w_\- ]+?)\]\]", s, src)
+    # src = re.sub(r"\[\[([\w_\- ,]+?)\]\]", s, src)
+    src = re.sub(r"\[\[(.+?)\]\]", s, src)
     return (src, collect)
 
 
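A quick sketch of how the new linkify.py helpers fit together: filename_to_padid undoes urlify, and strip_tags cleans markup out of the captured wikilink text, presumably so links whose text is split across inline tags (e.g. authorship spans) still resolve, now that the looser [[(.+?)]] pattern is used. Interpreter session for illustration:

    >>> from linkify import urlify, filename_to_padid, linkify
    >>> urlify("my pad")
    'my_pad.html'
    >>> filename_to_padid("my_pad.html")
    'my pad'
    >>> linkify('see [[my <em>pad</em>]]')
    ('see [[<a class="wikilink" href="my_pad.html">my pad</a>]]', ['my pad'])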
trim.py (9 changed lines)
@@ -6,6 +6,15 @@ from xml.etree import cElementTree as ET
 def contents (element, method="html"):
     return (element.text or '') + ''.join([ET.tostring(c, method=method) for c in element])
 
+def text_contents (element):
+    return (element.text or '') + ''.join([text_contents(c) for c in element]) + (element.tail or '')
+
+def set_text_contents (element, text):
+    """ ok this isn't really general, but works for singly wrapped elements """
+    while len(element) == 1:
+        element = element[0]
+    element.text = text
+
 def iterparent(tree):
     for parent in tree.iter():
         for child in parent:
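The two new trim.py helpers are what dump_html.py uses to relabel link text in place. A small illustration (the markup is invented; note that set_text_contents only descends chains of single-child wrappers, as its docstring admits):

    >>> from xml.etree import cElementTree as ET
    >>> from trim import text_contents, set_text_contents
    >>> a = ET.fromstring('<a href="old.html"><span>old label</span></a>')
    >>> set_text_contents(a, "[[new label]]")
    >>> text_contents(a)
    '[[new label]]'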