|
|
@ -1,27 +1,17 @@ |
|
|
|
#!/usr/bin/env python |
|
|
|
from __future__ import print_function |
|
|
|
from argparse import ArgumentParser |
|
|
|
import json, sys, os |
|
|
|
import json, sys, os, re |
|
|
|
from datetime import datetime |
|
|
|
import html5lib |
|
|
|
from urllib import urlencode |
|
|
|
from urllib2 import urlopen, HTTPError, URLError |
|
|
|
from xml.etree import cElementTree as ET |
|
|
|
from trim import trim_removed_spans, contents |
|
|
|
from linkify import linkify, urlify |
|
|
|
from trim import trim_removed_spans, contents, set_text_contents, text_contents |
|
|
|
from linkify import linkify, urlify, filename_to_padid |
|
|
|
import jinja2 |
|
|
|
|
|
|
|
|
|
|
|
p = ArgumentParser("") |
|
|
|
p.add_argument("padid", help="the padid") |
|
|
|
p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json") |
|
|
|
p.add_argument("--path", default="output", help="path to save files, default: output") |
|
|
|
p.add_argument("--verbose", default=False, action="store_true") |
|
|
|
p.add_argument("--limit", type=int, default=None) |
|
|
|
p.add_argument("--templates", default="templates") |
|
|
|
p.add_argument("--template", default="pad_html.html") |
|
|
|
args = p.parse_args() |
|
|
|
|
|
|
|
def get_template_env (tpath=None): |
|
|
|
paths = [] |
|
|
|
if tpath and os.path.isdir(tpath): |
|
|
@ -31,10 +21,40 @@ def get_template_env (tpath=None): |
|
|
|
env = jinja2.Environment(loader=loader) |
|
|
|
return env |
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Command-line interface.
#
# The parsed result is stored in the module-level name ``args`` and read by
# the rest of the script.
#
# NOTE(review): the first positional parameter of ArgumentParser is ``prog``,
# so passing "" makes the usage line print an empty program name. Left as-is
# to keep the existing --help output; confirm whether a description was meant.
# ---------------------------------------------------------------------------
p = ArgumentParser("")

# ID of the pad the crawl starts from (seeds the ``todo`` list).
p.add_argument("padid", help="the padid")

# JSON file holding the API connection settings (and optional "padlink").
p.add_argument("--padinfo", default="padinfo.json",
               help="padinfo, default: padinfo.json")

# Directory the rendered pages are written to; created on demand.
p.add_argument("--output", default="output",
               help="path to save files, default: output")

# Emit progress messages while running.
p.add_argument("--verbose", default=False, action="store_true")

# Integer limit; its use is outside the visible part of the script
# (presumably caps the number of pads processed -- TODO confirm).
p.add_argument("--limit", type=int, default=None)

# Template directory and template file name (handed to jinja2 elsewhere).
p.add_argument("--templates", default="templates")
p.add_argument("--template", default="pad_html.html")

# Repeatable: regular expressions recognising links that point at other pads.
# The help text is a raw literal so that the backslashes of the example
# pattern are kept verbatim without relying on invalid escape sequences.
p.add_argument("--padlink", default=[], action="append",
               help=r"give a pad link pattern, example: 'http\:\/\/10\.1\.10\.1/p/(.*)'")

# Repeatable and positional with respect to each other: the n-th --linksearch
# is paired with the n-th --linkreplace.
p.add_argument("--linksearch", default=[], action="append",
               help="specify a link pattern to search for")
p.add_argument("--linkreplace", default=[], action="append",
               help="specify a replacement pattern to replace preceding linksearch")

args = p.parse_args()
|
|
|
# Load the API connection settings (a JSON object) from the --padinfo file.
with open(args.padinfo) as f:
    info = json.load(f)

# Base URL of the HTTP API; API method names and query strings are appended
# to this further down.
apiurl = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info)

# Regular expressions that identify links to other pads. They come from the
# optional "padlink" entry of the padinfo file (a single pattern or a list of
# patterns) followed by any --padlink options given on the command line.
padlinkpats = []
if "padlink" in info:
    if isinstance(info['padlink'], list):
        padlinkpats.extend(info['padlink'])
    else:
        padlinkpats.append(info['padlink'])
padlinkpats.extend(args.padlink)

# zip() stops at the shorter list, so an unpaired --linksearch/--linkreplace
# would otherwise be dropped without any notice. Pairing itself is unchanged.
if len(args.linksearch) != len(args.linkreplace):
    print("warning: {0} --linksearch option(s) but {1} --linkreplace option(s); "
          "unpaired options are ignored".format(
              len(args.linksearch), len(args.linkreplace)),
          file=sys.stderr)

# (search, replacement) regex pairs applied to link hrefs. The pad link
# patterns are deliberately not added here; they are applied separately
# (with a "\\1.html" replacement) when the links are rewritten.
linkpats = []
linkpats.extend(zip(args.linksearch, args.linkreplace))

if args.verbose:
    # Diagnostics go to stderr, like every other verbose message in the script.
    print ("using padlinkpats", padlinkpats, file=sys.stderr)

# Spider state: pad IDs still to be fetched, pad IDs already handled, and a
# counter (used outside the visible part of the script).
todo = [args.padid]
done = set()
count = 0
|
|
@ -51,7 +71,9 @@ while len(todo) > 0: |
|
|
|
data['apikey'] = info['apikey'] |
|
|
|
data['padID'] = padid.encode("utf-8") |
|
|
|
|
|
|
|
out = "{0}/{1}".format(args.path, urlify(padid)) |
|
|
|
if args.verbose: |
|
|
|
print ("PADID \"{0}\"".format(padid).encode("utf-8"), file=sys.stderr) |
|
|
|
out = "{0}/{1}".format(args.output, urlify(padid)) |
|
|
|
print ("{0}".format(out), file=sys.stderr) |
|
|
|
|
|
|
|
total_revisions = apiurl+'getRevisionsCount?'+urlencode(data) |
|
|
@ -66,6 +88,9 @@ while len(todo) > 0: |
|
|
|
trim_removed_spans(t) |
|
|
|
html = ET.tostring(t, method="html") |
|
|
|
|
|
|
|
# Stage 1: Process as text |
|
|
|
# Process [[wikilink]] style links |
|
|
|
# and add linked page names to spider todo list |
|
|
|
html, links = linkify(html) |
|
|
|
for l in links: |
|
|
|
if l not in todo and l not in done: |
|
|
@ -73,20 +98,58 @@ while len(todo) > 0: |
|
|
|
print (" link: {0}".format(l), file=sys.stderr) |
|
|
|
todo.append(l) |
|
|
|
|
|
|
|
try: |
|
|
|
os.makedirs(args.path) |
|
|
|
except OSError: |
|
|
|
pass |
|
|
|
with open(out, "w") as f: |
|
|
|
# Stage 2: Process as ElementTree |
|
|
|
# |
|
|
|
t = html5lib.parse(html, namespaceHTMLElements=False) |
|
|
|
# rewrite the href of every <a>: pad link patterns first, then the linkpats |
|
|
|
for a in t.findall(".//a"): |
|
|
|
href = a.attrib.get("href") |
|
|
|
original_href = href |
|
|
|
if href: |
|
|
|
# if args.verbose: |
|
|
|
# print ("searching for PADLINK: {0}".format(href)) |
|
|
|
for pat in padlinkpats: |
|
|
|
if re.search(pat, href) != None: |
|
|
|
# if args.verbose: |
|
|
|
# print (" found PADLINK: {0}".format(href)) |
|
|
|
href = re.sub(pat, "\\1.html", href) |
|
|
|
padid = filename_to_padid(href) |
|
|
|
set_text_contents(a, "[[{0}]]".format(padid)) |
|
|
|
if padid not in todo and padid not in done: |
|
|
|
if args.verbose: |
|
|
|
print (" link: {0}".format(padid), file=sys.stderr) |
|
|
|
todo.append(padid) |
|
|
|
# apply linkpats |
|
|
|
for s, r in linkpats: |
|
|
|
href = re.sub(s, r, href) |
|
|
|
if href != original_href: |
|
|
|
old_contents = text_contents(a) |
|
|
|
# print ("OLD_CONTENTS {0}".format(old_contents)) |
|
|
|
if old_contents == original_href: |
|
|
|
if args.verbose: |
|
|
|
print (" Updating href IN TEXT", file=sys.stderr) |
|
|
|
set_text_contents(a, href) |
|
|
|
|
|
|
|
if original_href != href: |
|
|
|
if args.verbose: |
|
|
|
print (" Changed href from {0} to {1}".format(original_href, href), file=sys.stderr) |
|
|
|
a.attrib['href'] = href |
|
|
|
|
|
|
|
# extract the style tag (with authorship colors) |
|
|
|
style = t.find(".//style") |
|
|
|
if style != None: |
|
|
|
style = ET.tostring(style, method="html") |
|
|
|
else: |
|
|
|
style = "" |
|
|
|
body = t.find(".//body") |
|
|
|
html = contents(body) |
|
|
|
# and extract the contents of the body |
|
|
|
html = contents(t.find(".//body")) |
|
|
|
|
|
|
|
|
|
|
|
try: |
|
|
|
os.makedirs(args.output) |
|
|
|
except OSError: |
|
|
|
pass |
|
|
|
with open(out, "w") as f: |
|
|
|
# f.write(html.encode("utf-8")) |
|
|
|
f.write(template.render( |
|
|
|
html = html, |
|
|
|