#!/usr/bin/env python
# Spider an Etherpad instance starting from a seed pad: for each pad, fetch its
# authorship-colored diff HTML via the Etherpad HTTP API, rewrite [[wikilink]]
# and pad-URL links to local .html files, queue newly discovered pads, and
# render each pad through a Jinja2 template into the output directory.
#
# NOTE: this is Python 2 code (urllib2, str.encode before file write).
from __future__ import print_function
from argparse import ArgumentParser
import json, sys, os, re
from datetime import datetime
import html5lib
from urllib import urlencode
from urllib2 import urlopen, HTTPError, URLError
from xml.etree import cElementTree as ET
from trim import trim_removed_spans, contents, set_text_contents, text_contents
from linkify import linkify, urlify, filename_to_padid
import jinja2


def get_template_env (tpath=None):
    """Return a jinja2 Environment loading templates from tpath (if it exists)."""
    paths = []
    if tpath and os.path.isdir(tpath):
        paths.append(tpath)
    # paths.append(TEMPLATES_PATH)
    loader = jinja2.FileSystemLoader(paths)
    env = jinja2.Environment(loader=loader)
    return env


p = ArgumentParser("")
p.add_argument("padid", help="the padid")
p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json")
p.add_argument("--output", default="output", help="path to save files, default: output")
p.add_argument("--verbose", default=False, action="store_true")
p.add_argument("--limit", type=int, default=None)
p.add_argument("--templates", default="templates")
p.add_argument("--template", default="pad_html.html")
p.add_argument("--padlink", default=[], action="append", help="give a pad link pattern, example: 'http\:\/\/10\.1\.10\.1/p/(.*)'")
p.add_argument("--linksearch", default=[], action="append", help="specify a link pattern to search for")
p.add_argument("--linkreplace", default=[], action="append", help="specify a replacement pattern to replace preceding linksearch")
args = p.parse_args()

# padinfo.json supplies the API endpoint pieces and the apikey.
with open(args.padinfo) as f:
    info = json.load(f)
apiurl = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info)

# Pad-link regexes: patterns whose group(1) is a pad name; they may come from
# padinfo.json (string or list) and/or from --padlink flags.
padlinkpats = []
if "padlink" in info:
    if isinstance(info['padlink'], list):
        padlinkpats.extend(info['padlink'])
    else:
        padlinkpats.append(info['padlink'])
padlinkpats.extend(args.padlink)

# Generic (search, replace) regex pairs applied to every href.
linkpats = []  # [(pat, "\\1.html") for pat in padlinkpats]
linkpats.extend(zip(args.linksearch, args.linkreplace))

if args.verbose:
    print ("using padlinkpats", padlinkpats)

# Breadth-first spider state: todo queue, done set, page counter.
todo = [args.padid]
done = set()
count = 0

env = get_template_env(args.templates)
template = env.get_template(args.template)

while len(todo) > 0:
    padid = todo.pop(0)
    done.add(padid)

    data = {}
    data['apikey'] = info['apikey']
    data['padID'] = padid.encode("utf-8")

    if args.verbose:
        print ("PADID \"{0}\"".format(padid).encode("utf-8"), file=sys.stderr)
    # Output path for this pad; urlify presumably maps a pad id to a safe
    # filename (incl. extension) -- defined in linkify, not visible here.
    out = "{0}/{1}".format(args.output, urlify(padid))
    print ("{0}".format(out), file=sys.stderr)

    total_revisions = apiurl+'getRevisionsCount?'+urlencode(data)
    total_revisions = json.load(urlopen(total_revisions))['data']['revisions']
    if args.verbose:
        print (u"{0} has {1} total revisions...".format(padid, total_revisions).encode("utf-8"), file=sys.stderr)

    # Full-history diff (rev 0 -> head) gives authorship-colored HTML.
    data['startRev'] = "0"
    requesturl = apiurl+'createDiffHTML?'+urlencode(data)
    html = json.load(urlopen(requesturl))['data']['html']
    t = html5lib.parse(html, namespaceHTMLElements=False)
    trim_removed_spans(t)
    html = ET.tostring(t, method="html")

    # Stage 1: Process as text
    # Process [[wikilink]] style links
    # and add linked page names to spider todo list
    html, links = linkify(html)
    for l in links:
        if l not in todo and l not in done:
            if args.verbose:
                print (" link: {0}".format(l), file=sys.stderr)
            todo.append(l)

    # Stage 2: Process as ElementTree
    t = html5lib.parse(html, namespaceHTMLElements=False)
    for a in t.findall(".//a"):
        href = a.attrib.get("href")
        original_href = href
        if href:
            # Rewrite hrefs matching a pad-link pattern to local .html files
            # and queue the target pad for spidering.
            for pat in padlinkpats:
                if re.search(pat, href) is not None:
                    href = re.sub(pat, "\\1.html", href)
                    # BUGFIX: use a distinct name here -- the original
                    # assigned to `padid`, clobbering the current pad's id,
                    # so template.render below received the wrong padid.
                    linked_padid = filename_to_padid(href)
                    set_text_contents(a, "[[{0}]]".format(linked_padid))
                    if linked_padid not in todo and linked_padid not in done:
                        if args.verbose:
                            print (" link: {0}".format(linked_padid), file=sys.stderr)
                        todo.append(linked_padid)
            # apply generic linksearch/linkreplace patterns
            for s, r in linkpats:
                href = re.sub(s, r, href)
            if href != original_href:
                old_contents = text_contents(a)
                # print ("OLD_CONTENTS {0}".format(old_contents))
                # If the link text was the raw URL, update it to match.
                if old_contents == original_href:
                    if args.verbose:
                        print ("  Updating href IN TEXT", file=sys.stderr)
                    set_text_contents(a, href)
            if original_href != href:
                if args.verbose:
                    print ("  Changed href from {0} to {1}".format(original_href, href), file=sys.stderr)
                a.attrib['href'] = href

    # extract the style tag (with authorship colors)
    style = t.find(".//style")
    if style is not None:
        style = ET.tostring(style, method="html")
    else:
        style = ""
    # and extract the contents of the body
    html = contents(t.find(".//body"))

    try:
        os.makedirs(args.output)
    except OSError:
        # already exists (or not creatable; the open below will then fail loudly)
        pass
    with open(out, "w") as f:
        # f.write(html.encode("utf-8"))
        f.write(template.render(
            html = html,
            style = style,
            revision = total_revisions,
            padid = padid,
            timestamp = datetime.now()
        ).encode("utf-8"))

    count += 1
    if args.limit and count >= args.limit:
        break