|
|
|
#!/usr/bin/env python
|
|
|
|
from __future__ import print_function
|
|
|
|
from argparse import ArgumentParser
|
|
|
|
import json, sys, os, re
|
|
|
|
from datetime import datetime
|
|
|
|
import html5lib
|
|
|
|
from urllib import urlencode
|
|
|
|
from urllib2 import urlopen, HTTPError, URLError
|
|
|
|
from xml.etree import cElementTree as ET
|
|
|
|
from trim import trim_removed_spans, contents, set_text_contents, text_contents
|
|
|
|
from linkify import linkify, urlify, filename_to_padid
|
|
|
|
import jinja2
|
|
|
|
|
|
|
|
|
|
|
|
def get_template_env(tpath=None):
    """Build a jinja2 Environment whose templates load from *tpath*.

    The directory is only added to the search path when it actually
    exists; otherwise the loader is created with an empty path list.
    """
    search_paths = []
    if tpath and os.path.isdir(tpath):
        search_paths.append(tpath)
    # (a built-in TEMPLATES_PATH fallback could be appended here)
    loader = jinja2.FileSystemLoader(search_paths)
    return jinja2.Environment(loader=loader)
|
|
|
|
|
|
|
|
|
|
|
|
# Command-line interface: spider an etherpad starting from one pad id and
# dump each visited pad as a rendered HTML file.
p = ArgumentParser("")
p.add_argument("padid", help="the padid")
p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json")
p.add_argument("--output", default="output", help="path to save files, default: output")
p.add_argument("--verbose", default=False, action="store_true")
p.add_argument("--limit", type=int, default=None)
p.add_argument("--templates", default="templates")
p.add_argument("--template", default="pad_html.html")

# NB: raw string — the backslashes in the example are regex escapes, not
# Python string escapes (non-raw `\:`/`\/` trigger SyntaxWarning on modern
# Pythons; the resulting text is identical).
p.add_argument("--padlink", default=[], action="append", help=r"give a pad link pattern, example: 'http\:\/\/10\.1\.10\.1/p/(.*)'")
p.add_argument("--linksearch", default=[], action="append", help="specify a link pattern to search for")
p.add_argument("--linkreplace", default=[], action="append", help="specify a replacement pattern to replace preceding linksearch")

args = p.parse_args()
|
|
|
|
# Load etherpad connection settings (protocol, hostname, port, apikey, ...)
# from the JSON file given by --padinfo.
with open(args.padinfo) as f:
    info = json.load(f)

# Base URL of the etherpad HTTP API, e.g. http://host:9001/api/1.2.9/
apiurl = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info)

# Regex patterns that recognize hrefs pointing back at pads on this server.
# They may come from the padinfo file ("padlink": string or list) and/or
# from repeated --padlink options.
padlinkpats = []
if "padlink" in info:
    # isinstance instead of `type(...) == list` — same behavior for the
    # plain lists JSON produces, and the idiomatic type check.
    if isinstance(info['padlink'], list):
        padlinkpats.extend(info['padlink'])
    else:
        padlinkpats.append(info['padlink'])
padlinkpats.extend(args.padlink)

# Generic search/replace pairs applied to every href, paired up from the
# repeated --linksearch / --linkreplace options.
linkpats = [] # [(pat, "\\1.html") for pat in padlinkpats]
linkpats.extend(zip(args.linksearch, args.linkreplace))

if args.verbose:
    print ("using padlinkpats", padlinkpats)

# Spider state: queue of pad ids still to fetch, set of ids already fetched,
# count of pads written (for --limit).
todo = [args.padid]
done = set()
count = 0

# Template used to wrap each pad's HTML.
env = get_template_env(args.templates)
template = env.get_template(args.template)
|
|
|
|
|
|
|
|
# Spider loop: pop a pad id off the queue, fetch its full contents via the
# etherpad API, rewrite links (enqueueing newly discovered pads), and write
# the templated HTML to the output directory — until the queue is empty or
# --limit pads have been written.
while len(todo) > 0:
    padid = todo[0]
    todo = todo[1:]
    done.add(padid)

    # Common query parameters for the API calls below.
    data = {}
    data['apikey'] = info['apikey']
    data['padID'] = padid.encode("utf-8")

    if args.verbose:
        print ("PADID \"{0}\"".format(padid).encode("utf-8"), file=sys.stderr)
    # Output path for this pad's rendered HTML.
    out = "{0}/{1}".format(args.output, urlify(padid))
    print ("{0}".format(out), file=sys.stderr)

    total_revisions = apiurl+'getRevisionsCount?'+urlencode(data)
    total_revisions = json.load(urlopen(total_revisions))['data']['revisions']
    if args.verbose:
        print (u"{0} has {1} total revisions...".format(padid, total_revisions).encode("utf-8"), file=sys.stderr)

    # createDiffHTML from revision 0 yields the whole pad annotated with
    # authorship spans (plus removed-text spans, which we strip).
    data['startRev'] = "0"
    requesturl = apiurl+'createDiffHTML?'+urlencode(data)
    html = json.load(urlopen(requesturl))['data']['html']
    t = html5lib.parse(html, namespaceHTMLElements=False)
    trim_removed_spans(t)
    html = ET.tostring(t, method="html")

    # Stage 1: Process as text
    # Process [[wikilink]] style links
    # and add linked page names to spider todo list
    html, links = linkify(html)
    for l in links:
        if l not in todo and l not in done:
            if args.verbose:
                print (" link: {0}".format(l), file=sys.stderr)
            todo.append(l)

    # Stage 2: Process as ElementTree
    #
    t = html5lib.parse(html, namespaceHTMLElements=False)
    # Rewrite every <a href>: pad links become local "<name>.html" links
    # (and the target pad is enqueued); then the generic linkpats apply.
    for a in t.findall(".//a"):
        href = a.attrib.get("href")
        original_href = href
        if href:
            for pat in padlinkpats:
                if re.search(pat, href) is not None:
                    href = re.sub(pat, "\\1.html", href)
                    # BUGFIX: use a distinct name here — this assignment
                    # used to clobber the loop variable `padid`, so the
                    # template below could be rendered with the id of a
                    # *linked* pad instead of the pad being written.
                    linked_padid = filename_to_padid(href)
                    set_text_contents(a, "[[{0}]]".format(linked_padid))
                    if linked_padid not in todo and linked_padid not in done:
                        if args.verbose:
                            print (" link: {0}".format(linked_padid), file=sys.stderr)
                        todo.append(linked_padid)
            # apply linkpats
            for s, r in linkpats:
                href = re.sub(s, r, href)
            if href != original_href:
                old_contents = text_contents(a)
                # If the visible link text was the bare URL, keep it in
                # sync with the rewritten href.
                if old_contents == original_href:
                    if args.verbose:
                        print (" Updating href IN TEXT", file=sys.stderr)
                    set_text_contents(a, href)

            if original_href != href:
                if args.verbose:
                    print (" Changed href from {0} to {1}".format(original_href, href), file=sys.stderr)
                a.attrib['href'] = href

    # extract the style tag (with authorship colors)
    style = t.find(".//style")
    if style is not None:
        style = ET.tostring(style, method="html")
    else:
        style = ""
    # and extract the contents of the body
    html = contents(t.find(".//body"))

    # Ensure the output directory exists ("already exists" is fine).
    try:
        os.makedirs(args.output)
    except OSError:
        pass
    with open(out, "w") as f:
        # f.write(html.encode("utf-8"))
        f.write(template.render(
            html = html,
            style = style,
            revision = total_revisions,
            padid = padid,
            timestamp = datetime.now()
        ).encode("utf-8"))

    count += 1
    if args.limit and count >= args.limit:
        break
|