|
@ -1,15 +1,17 @@ |
|
|
#!/usr/bin/env python |
|
|
#!/usr/bin/env python |
|
|
from __future__ import print_function |
|
|
from __future__ import print_function |
|
|
from argparse import ArgumentParser |
|
|
# stdlib |
|
|
import json, sys, os, re |
|
|
import json, sys, os, re |
|
|
|
|
|
from argparse import ArgumentParser |
|
|
from datetime import datetime |
|
|
from datetime import datetime |
|
|
import html5lib |
|
|
from xml.etree import cElementTree as ET |
|
|
from urllib import urlencode |
|
|
from urllib import urlencode |
|
|
from urllib2 import urlopen, HTTPError, URLError |
|
|
from urllib2 import urlopen, HTTPError, URLError |
|
|
from xml.etree import cElementTree as ET |
|
|
# dependencies |
|
|
|
|
|
import html5lib, jinja2 |
|
|
|
|
|
# local mods |
|
|
from trim import trim_removed_spans, contents, set_text_contents, text_contents |
|
|
from trim import trim_removed_spans, contents, set_text_contents, text_contents |
|
|
from linkify import linkify, urlify, filename_to_padid |
|
|
from linkify import linkify, urlify, filename_to_padid |
|
|
import jinja2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_template_env (tpath=None): |
|
|
def get_template_env (tpath=None): |
|
@ -21,19 +23,34 @@ def get_template_env (tpath=None): |
|
|
env = jinja2.Environment(loader=loader) |
|
|
env = jinja2.Environment(loader=loader) |
|
|
return env |
|
|
return env |
|
|
|
|
|
|
|
|
|
|
|
p = ArgumentParser(""" |
|
|
p = ArgumentParser("") |
|
|
_ _ _ |
|
|
p.add_argument("padid", help="the padid") |
|
|
___| |_| |__ ___ _ __ __| |_ _ _ __ ___ _ __ |
|
|
p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json") |
|
|
/ _ \ __| '_ \ / _ \ '__/ _` | | | | '_ ` _ \| '_ \ |
|
|
p.add_argument("--output", default="output", help="path to save files, default: output") |
|
|
| __/ |_| | | | __/ | | (_| | |_| | | | | | | |_) | |
|
|
p.add_argument("--verbose", default=False, action="store_true") |
|
|
\___|\__|_| |_|\___|_| \__,_|\__,_|_| |_| |_| .__/ |
|
|
|
|
|
|_| |
|
|
|
|
|
""") |
|
|
|
|
|
p.add_argument("padid", default=[], nargs="*", help="the padid(s) to process") |
|
|
|
|
|
p.add_argument("--padinfo", default="padinfo.json", help="JSON file with login data for the pad (url, apikey etc), default: padinfo.json") |
|
|
|
|
|
p.add_argument("--path", default="output", help="path to save files, default: output") |
|
|
|
|
|
p.add_argument("--verbose", default=False, action="store_true", help="flag for verbose output") |
|
|
p.add_argument("--limit", type=int, default=None) |
|
|
p.add_argument("--limit", type=int, default=None) |
|
|
p.add_argument("--templates", default="templates") |
|
|
p.add_argument("--allpads", default=False, action="store_true", help="flag to process all pads") |
|
|
p.add_argument("--template", default="pad_html.html") |
|
|
p.add_argument("--spider", default=False, action="store_true", help="flag to spider pads") |
|
|
|
|
|
p.add_argument("--templatepath", default="templates", help="directory with templates, default: templates") |
|
|
|
|
|
p.add_argument("--colors-template", default="pad_colors.html", help="pad with authorship colors template name: pad_colors.html") |
|
|
p.add_argument("--padlink", default=[], action="append", help="give a pad link pattern, example: 'http\:\/\/10\.1\.10\.1/p/(.*)'") |
|
|
p.add_argument("--padlink", default=[], action="append", help="give a pad link pattern, example: 'http\:\/\/10\.1\.10\.1/p/(.*)'") |
|
|
p.add_argument("--linksearch", default=[], action="append", help="specify a link pattern to search for") |
|
|
p.add_argument("--linksearch", default=[], action="append", help="specify a link pattern to search for") |
|
|
p.add_argument("--linkreplace", default=[], action="append", help="specify a replacement pattern to replace preceding linksearch") |
|
|
p.add_argument("--linkreplace", default=[], action="append", help="specify a replacement pattern to replace preceding linksearch") |
|
|
|
|
|
p.add_argument("--showurls", default=False, action="store_true", help="flag to display API URLs that are used (to stderr)") |
|
|
|
|
|
p.add_argument("--hidepaths", default=False, action="store_true", help="flag to not display paths") |
|
|
|
|
|
p.add_argument("--pretend", default=False, action="store_true", help="flag to not actually save") |
|
|
|
|
|
p.add_argument("--add-images", default=False, action="store_true", help="flag to add image tags") |
|
|
|
|
|
|
|
|
|
|
|
# TODO css from pad --- ie specify a padid for a stylesheet!!!!!! |
|
|
|
|
|
p.add_argument("--css", default="styles.css", help="padid of stylesheet") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
args = p.parse_args() |
|
|
args = p.parse_args() |
|
|
with open(args.padinfo) as f: |
|
|
with open(args.padinfo) as f: |
|
@ -41,62 +58,135 @@ with open(args.padinfo) as f: |
|
|
|
|
|
|
|
|
apiurl = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info) |
|
|
apiurl = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info) |
|
|
|
|
|
|
|
|
|
|
|
# padlinkpats are for mapping internal pad links |
|
|
|
|
|
# linkpats are any other link replacements, both are regexps |
|
|
|
|
|
|
|
|
padlinkpats = [] |
|
|
padlinkpats = [] |
|
|
|
|
|
linkpats = [] # [(pat, "\\1.html") for pat in padlinkpats] |
|
|
|
|
|
linkpats.extend(zip(args.linksearch, args.linkreplace)) |
|
|
if "padlink" in info: |
|
|
if "padlink" in info: |
|
|
if type(info['padlink']) == list: |
|
|
if type(info['padlink']) == list: |
|
|
padlinkpats.extend(info['padlink']) |
|
|
padlinkpats.extend(info['padlink']) |
|
|
else: |
|
|
else: |
|
|
padlinkpats.append(info['padlink']) |
|
|
padlinkpats.append(info['padlink']) |
|
|
padlinkpats.extend(args.padlink) |
|
|
padlinkpats.extend(args.padlink) |
|
|
|
|
|
env = get_template_env(args.templatepath) |
|
|
|
|
|
colors_template = env.get_template(args.colors_template) |
|
|
|
|
|
|
|
|
linkpats = [] # [(pat, "\\1.html") for pat in padlinkpats] |
|
|
todo = args.padid |
|
|
linkpats.extend(zip(args.linksearch, args.linkreplace)) |
|
|
|
|
|
|
|
|
|
|
|
if args.verbose: |
|
|
|
|
|
print ("using padlinkpats", padlinkpats) |
|
|
|
|
|
|
|
|
|
|
|
todo = [args.padid] |
|
|
|
|
|
done = set() |
|
|
done = set() |
|
|
count = 0 |
|
|
count = 0 |
|
|
|
|
|
data = {} |
|
|
|
|
|
data['apikey'] = info['apikey'] |
|
|
|
|
|
|
|
|
env = get_template_env(args.templates) |
|
|
if args.allpads: |
|
|
template = env.get_template(args.template) |
|
|
# push the list of all pad names on to todo |
|
|
|
|
|
list_url = apiurl+'listAllPads?'+urlencode(data) |
|
|
|
|
|
if args.showurls: |
|
|
|
|
|
print (list_url, file=sys.stderr) |
|
|
|
|
|
results = json.load(urlopen(list_url))['data']['padIDs'] |
|
|
|
|
|
todo.extend(results) |
|
|
|
|
|
|
|
|
while len(todo) > 0: |
|
|
while len(todo) > 0: |
|
|
padid = todo[0] |
|
|
padid = todo[0] |
|
|
todo = todo[1:] |
|
|
todo = todo[1:] |
|
|
done.add(padid) |
|
|
done.add(padid) |
|
|
|
|
|
|
|
|
data = {} |
|
|
|
|
|
data['apikey'] = info['apikey'] |
|
|
|
|
|
data['padID'] = padid.encode("utf-8") |
|
|
data['padID'] = padid.encode("utf-8") |
|
|
|
|
|
|
|
|
if args.verbose: |
|
|
if args.verbose: |
|
|
print ("PADID \"{0}\"".format(padid).encode("utf-8"), file=sys.stderr) |
|
|
print ("PADID \"{0}\"".format(padid).encode("utf-8"), file=sys.stderr) |
|
|
out = "{0}/{1}".format(args.output, urlify(padid)) |
|
|
if not args.pretend: |
|
|
print ("{0}".format(out), file=sys.stderr) |
|
|
try: |
|
|
|
|
|
os.makedirs(args.path) |
|
|
|
|
|
except OSError: |
|
|
|
|
|
pass |
|
|
|
|
|
# print ("{0}".format(padid).encode("utf-8"), file=sys.stderr) |
|
|
|
|
|
|
|
|
total_revisions = apiurl+'getRevisionsCount?'+urlencode(data) |
|
|
# _ |
|
|
total_revisions = json.load(urlopen(total_revisions))['data']['revisions'] |
|
|
# _ __ ___ ___| |_ __ _ |
|
|
if args.verbose: |
|
|
# | '_ ` _ \ / _ \ __/ _` | |
|
|
print (u"{0} has {1} total revisions...".format(padid, total_revisions).encode("utf-8"), file=sys.stderr) |
|
|
# | | | | | | __/ || (_| | |
|
|
|
|
|
# |_| |_| |_|\___|\__\__,_| |
|
|
|
|
|
|
|
|
|
|
|
meta_out = "{0}/{1}".format(args.path, urlify(padid, ext=".json")) |
|
|
|
|
|
if not args.hidepaths: |
|
|
|
|
|
print (meta_out, file=sys.stderr) |
|
|
|
|
|
if not args.pretend: |
|
|
|
|
|
meta = {} |
|
|
|
|
|
meta['padid'] = padid |
|
|
|
|
|
revisions_url = apiurl+'getRevisionsCount?'+urlencode(data) |
|
|
|
|
|
if args.showurls: |
|
|
|
|
|
print (revisions_url, file=sys.stderr) |
|
|
|
|
|
meta['total_revisions'] = json.load(urlopen(revisions_url))['data']['revisions'] |
|
|
|
|
|
|
|
|
|
|
|
lastedited_url = apiurl+'getLastEdited?'+urlencode(data) |
|
|
|
|
|
if args.showurls: |
|
|
|
|
|
print (lastedited_url, file=sys.stderr) |
|
|
|
|
|
lastedited_raw = json.load(urlopen(lastedited_url))['data']['lastEdited'] |
|
|
|
|
|
meta['lastedited_raw'] = lastedited_raw |
|
|
|
|
|
meta['lastedited'] = datetime.fromtimestamp(int(lastedited_raw)/1000).isoformat() |
|
|
|
|
|
|
|
|
|
|
|
# author_ids (unfortunately, this is a list of internal etherpad author ids -- not the names ppl type) |
|
|
|
|
|
authors_url = apiurl+'listAuthorsOfPad?'+urlencode(data) |
|
|
|
|
|
if args.showurls: |
|
|
|
|
|
print (authors_url, file=sys.stderr) |
|
|
|
|
|
meta['author_ids'] = json.load(urlopen(authors_url))['data']['authorIDs'] |
|
|
|
|
|
|
|
|
|
|
|
with open(meta_out, "w") as f: |
|
|
|
|
|
json.dump(meta, f) |
|
|
|
|
|
|
|
|
|
|
|
# _ __ __ ___ __ |
|
|
|
|
|
# | '__/ _` \ \ /\ / / |
|
|
|
|
|
# | | | (_| |\ V V / |
|
|
|
|
|
# |_| \__,_| \_/\_/ |
|
|
|
|
|
|
|
|
|
|
|
raw_out = "{0}/{1}".format(args.path, urlify(padid, ext=".txt")) |
|
|
|
|
|
if not args.hidepaths: |
|
|
|
|
|
print (raw_out, file=sys.stderr) |
|
|
|
|
|
text_url = apiurl+"getText?"+urlencode(data) |
|
|
|
|
|
if args.showurls: |
|
|
|
|
|
print (text_url, file=sys.stderr) |
|
|
|
|
|
if not args.pretend: |
|
|
|
|
|
rawText = json.load(urlopen(text_url))['data']['text'] |
|
|
|
|
|
with open(raw_out, "w") as f: |
|
|
|
|
|
f.write(rawText.encode("utf-8")) |
|
|
|
|
|
|
|
|
|
|
|
# _ _ _ |
|
|
|
|
|
# | |__ | |_ _ __ ___ | | |
|
|
|
|
|
# | '_ \| __| '_ ` _ \| | |
|
|
|
|
|
# | | | | |_| | | | | | | |
|
|
|
|
|
# |_| |_|\__|_| |_| |_|_| |
|
|
|
|
|
|
|
|
|
|
|
# todo ? -- regular HTML output |
|
|
|
|
|
|
|
|
|
|
|
# _ |
|
|
|
|
|
# ___ ___ | | ___ _ __ ___ |
|
|
|
|
|
# / __/ _ \| |/ _ \| '__/ __| |
|
|
|
|
|
# | (_| (_) | | (_) | | \__ \ |
|
|
|
|
|
# \___\___/|_|\___/|_| |___/ |
|
|
|
|
|
|
|
|
|
|
|
colors_out = "{0}/{1}".format(args.path, urlify(padid, ext=".html")) |
|
|
|
|
|
if not args.hidepaths: |
|
|
|
|
|
print (colors_out, file=sys.stderr) |
|
|
data['startRev'] = "0" |
|
|
data['startRev'] = "0" |
|
|
requesturl = apiurl+'createDiffHTML?'+urlencode(data) |
|
|
colors_url = apiurl+'createDiffHTML?'+urlencode(data) |
|
|
html = json.load(urlopen(requesturl))['data']['html'] |
|
|
if args.showurls: |
|
|
|
|
|
print (colors_url, file=sys.stderr) |
|
|
|
|
|
html = json.load(urlopen(colors_url))['data']['html'] |
|
|
t = html5lib.parse(html, namespaceHTMLElements=False) |
|
|
t = html5lib.parse(html, namespaceHTMLElements=False) |
|
|
trim_removed_spans(t) |
|
|
trim_removed_spans(t) |
|
|
html = ET.tostring(t, method="html") |
|
|
html = ET.tostring(t, method="html") |
|
|
|
|
|
|
|
|
# Stage 1: Process as text |
|
|
# Stage 1: Process as text |
|
|
# Process [[wikilink]] style links |
|
|
# Process [[wikilink]] style links |
|
|
# and add linked page names to spider todo list |
|
|
# and (optionally) add linked page names to spider todo list |
|
|
html, links = linkify(html) |
|
|
html, links = linkify(html) |
|
|
for l in links: |
|
|
if args.spider: |
|
|
if l not in todo and l not in done: |
|
|
for l in links: |
|
|
if args.verbose: |
|
|
if l not in todo and l not in done: |
|
|
print (" link: {0}".format(l), file=sys.stderr) |
|
|
# if args.verbose: |
|
|
todo.append(l) |
|
|
# print (" link: {0}".format(l), file=sys.stderr) |
|
|
|
|
|
todo.append(l) |
|
|
|
|
|
|
|
|
# Stage 2: Process as ElementTree |
|
|
# Stage 2: Process as ElementTree |
|
|
# |
|
|
# |
|
@ -135,6 +225,20 @@ while len(todo) > 0: |
|
|
print (" Changed href from {0} to {1}".format(original_href, href), file=sys.stderr) |
|
|
print (" Changed href from {0} to {1}".format(original_href, href), file=sys.stderr) |
|
|
a.attrib['href'] = href |
|
|
a.attrib['href'] = href |
|
|
|
|
|
|
|
|
|
|
|
# SHOWIMAGES : inject img tag for (local) images |
|
|
|
|
|
if args.add_images: |
|
|
|
|
|
ext = os.path.splitext(href)[1].lower().lstrip(".") |
|
|
|
|
|
if ext in ("png", "gif", "jpeg", "jpg"): |
|
|
|
|
|
# ap = _parent(a) |
|
|
|
|
|
print ("Adding img '{0}'".format(href), file=sys.stderr) |
|
|
|
|
|
img = ET.SubElement(a, "img") |
|
|
|
|
|
br = ET.SubElement(a, "br") |
|
|
|
|
|
a.remove(img); a.insert(0, img) |
|
|
|
|
|
a.remove(br); a.insert(1, br) |
|
|
|
|
|
img.attrib['src'] = href |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# extract the style tag (with authorship colors) |
|
|
# extract the style tag (with authorship colors) |
|
|
style = t.find(".//style") |
|
|
style = t.find(".//style") |
|
|
if style != None: |
|
|
if style != None: |
|
@ -144,20 +248,23 @@ while len(todo) > 0: |
|
|
# and extract the contents of the body |
|
|
# and extract the contents of the body |
|
|
html = contents(t.find(".//body")) |
|
|
html = contents(t.find(".//body")) |
|
|
|
|
|
|
|
|
|
|
|
if not args.pretend: |
|
|
|
|
|
with open(colors_out, "w") as f: |
|
|
|
|
|
# f.write(html.encode("utf-8")) |
|
|
|
|
|
f.write(colors_template.render( |
|
|
|
|
|
html = html, |
|
|
|
|
|
style = style, |
|
|
|
|
|
revision = meta['total_revisions'], |
|
|
|
|
|
padid = padid, |
|
|
|
|
|
timestamp = datetime.now() |
|
|
|
|
|
).encode("utf-8")) |
|
|
|
|
|
|
|
|
try: |
|
|
# _ |
|
|
os.makedirs(args.output) |
|
|
# | | ___ ___ _ __ |
|
|
except OSError: |
|
|
# | |/ _ \ / _ \| '_ \ |
|
|
pass |
|
|
# | | (_) | (_) | |_) | |
|
|
with open(out, "w") as f: |
|
|
# |_|\___/ \___/| .__/ |
|
|
# f.write(html.encode("utf-8")) |
|
|
# |_| |
|
|
f.write(template.render( |
|
|
|
|
|
html = html, |
|
|
|
|
|
style = style, |
|
|
|
|
|
revision = total_revisions, |
|
|
|
|
|
padid = padid, |
|
|
|
|
|
timestamp = datetime.now() |
|
|
|
|
|
).encode("utf-8")) |
|
|
|
|
|
|
|
|
|
|
|
count += 1 |
|
|
count += 1 |
|
|
if args.limit and count >= args.limit: |
|
|
if args.limit and count >= args.limit: |
|
|