#!/usr/bin/env python
from __future__ import print_function
from argparse import ArgumentParser
import json, sys, os, re
from datetime import datetime
import html5lib
from urllib import urlencode
from urllib2 import urlopen, HTTPError, URLError
from xml.etree import cElementTree as ET
from trim import trim_removed_spans, contents, set_text_contents, text_contents
from linkify import linkify, urlify, filename_to_padid
import jinja2
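
# The script expects a padinfo.json next to it (see --padinfo below). A minimal
# sketch, inferred from the keys read further down (the apiurl format string,
# info['apikey'], the optional info['padlink']) -- all values here are made up:
#
# {
#     "protocol": "http",
#     "hostname": "localhost",
#     "port": 9001,
#     "apiurl": "/api/",
#     "apiversion": "1.2.9",
#     "apikey": "your-etherpad-api-key",
#     "padlink": "http\\:\\/\\/localhost:9001/p/(.*)"
# }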
def get_template_env (tpath=None):
    paths = []
    if tpath and os.path.isdir(tpath):
        paths.append(tpath)
    # paths.append(TEMPLATES_PATH)
    loader = jinja2.FileSystemLoader(paths)
    env = jinja2.Environment(loader=loader)
    return env
p = ArgumentParser("")
p.add_argument("padid", help="the padid")
p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json")
p.add_argument("--output", default="output", help="path to save files, default: output")
p.add_argument("--verbose", default=False, action="store_true")
p.add_argument("--limit", type=int, default=None)
p.add_argument("--templates", default="templates")
p.add_argument("--template", default="pad_html.html")
p.add_argument("--padlink", default=[], action="append", help="give a pad link pattern, example: 'http\:\/\/10\.1\.10\.1/p/(.*)'")
p.add_argument("--linksearch", default=[], action="append", help="specify a link pattern to search for")
p.add_argument("--linkreplace", default=[], action="append", help="specify a replacement pattern to replace preceding linksearch")
args = p.parse_args()
with open(args.padinfo) as f:
    info = json.load(f)
apiurl = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info)
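# with the example padinfo.json sketched above, this yields something like
# "http://localhost:9001/api/1.2.9/" (hypothetical values)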
padlinkpats = []
if "padlink" in info:
    if isinstance(info['padlink'], list):
        padlinkpats.extend(info['padlink'])
    else:
        padlinkpats.append(info['padlink'])
padlinkpats.extend(args.padlink)

linkpats = [] # [(pat, "\\1.html") for pat in padlinkpats]
linkpats.extend(zip(args.linksearch, args.linkreplace))

if args.verbose:
    print ("using padlinkpats", padlinkpats)

todo = [args.padid]
done = set()
count = 0
env = get_template_env(args.templates)
template = env.get_template(args.template)
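
# The template (pad_html.html by default) receives the variables passed to
# template.render() at the bottom of the loop. A minimal sketch of such a
# template -- assumed, not the actual file shipped with the repo:
#
# <!DOCTYPE html>
# <html>
# <head><title>{{ padid }}</title>{{ style }}</head>
# <body>{{ html }}
# <p>revision {{ revision }}, dumped {{ timestamp }}</p></body>
# </html>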

while len(todo) > 0:
    # breadth-first spider: pop the next padid off the queue
    padid = todo[0]
    todo = todo[1:]
    done.add(padid)

    data = {}
    data['apikey'] = info['apikey']
    data['padID'] = padid.encode("utf-8")

    if args.verbose:
        print ("PADID \"{0}\"".format(padid).encode("utf-8"), file=sys.stderr)
    out = "{0}/{1}".format(args.output, urlify(padid))
    print ("{0}".format(out), file=sys.stderr)

    total_revisions = apiurl+'getRevisionsCount?'+urlencode(data)
    total_revisions = json.load(urlopen(total_revisions))['data']['revisions']
    if args.verbose:
        print (u"{0} has {1} total revisions...".format(padid, total_revisions).encode("utf-8"), file=sys.stderr)

    # request the diff HTML from revision 0, i.e. the whole pad history
    data['startRev'] = "0"
    requesturl = apiurl+'createDiffHTML?'+urlencode(data)
    html = json.load(urlopen(requesturl))['data']['html']
    t = html5lib.parse(html, namespaceHTMLElements=False)
    trim_removed_spans(t)
    html = ET.tostring(t, method="html")

    # Stage 1: Process as text
    # Process [[wikilink]] style links
    # and add linked page names to spider todo list
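    # Hypothetical example (the exact markup is defined in linkify.py):
    #   "see [[day 2 notes]]" becomes a link to the dumped page for that
    #   pad (filename via urlify), and "day 2 notes" is returned in links.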
    html, links = linkify(html)
    for l in links:
        if l not in todo and l not in done:
            if args.verbose:
                print ("  link: {0}".format(l), file=sys.stderr)
            todo.append(l)

    # Stage 2: Process as ElementTree
    #
    t = html5lib.parse(html, namespaceHTMLElements=False)
    # apply padlinkpats: rewrite links between pads to point at the dumped files
    for a in t.findall(".//a"):
        href = a.attrib.get("href")
        original_href = href
        if href:
            # if args.verbose:
            #     print ("searching for PADLINK: {0}".format(href))
            for pat in padlinkpats:
                if re.search(pat, href) is not None:
                    # if args.verbose:
                    #     print ("  found PADLINK: {0}".format(href))
                    href = re.sub(pat, "\\1.html", href)
                    # keep the spidered padid in its own variable so the
                    # current pad's padid (used in template.render() below)
                    # stays intact
                    linked_padid = filename_to_padid(href)
                    set_text_contents(a, "[[{0}]]".format(linked_padid))
                    if linked_padid not in todo and linked_padid not in done:
                        if args.verbose:
                            print ("  link: {0}".format(linked_padid), file=sys.stderr)
                        todo.append(linked_padid)
            # apply linkpats
            for s, r in linkpats:
                href = re.sub(s, r, href)
            if href != original_href:
                old_contents = text_contents(a)
                # print ("OLD_CONTENTS {0}".format(old_contents))
                if old_contents == original_href:
                    if args.verbose:
                        print ("  Updating href IN TEXT", file=sys.stderr)
                    set_text_contents(a, href)

            if original_href != href:
                if args.verbose:
                    print ("  Changed href from {0} to {1}".format(original_href, href), file=sys.stderr)
                a.attrib['href'] = href

    # extract the style tag (with authorship colors)
    style = t.find(".//style")
    if style is not None:
        style = ET.tostring(style, method="html")
    else:
        style = ""

    # and extract the contents of the body
    html = contents(t.find(".//body"))

    try:
        os.makedirs(args.output)
    except OSError:
        pass
    with open(out, "w") as f:
        # f.write(html.encode("utf-8"))
        f.write(template.render(
            html = html,
            style = style,
            revision = total_revisions,
            padid = padid,
            timestamp = datetime.now()
        ).encode("utf-8"))

    count += 1
    if args.limit and count >= args.limit:
        break
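
# For reference: the two Etherpad API responses used above are unwrapped with
# ['data']['revisions'] and ['data']['html'], so they are assumed to look
# roughly like this (illustrative values):
#
#   getRevisionsCount -> {"code": 0, "message": "ok", "data": {"revisions": 56}}
#   createDiffHTML    -> {"code": 0, "message": "ok",
#                         "data": {"html": "<style>...</style>...<body>...</body>"}}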