Browse Source

etherdump working with colors

add-quote-import
Michael Murtaugh 9 years ago
parent
commit
f30aafb5c7
  1. 213
      etherdump
  2. 4
      linkify.py
  3. 13
      templates/pad_html.html

213
etherdump

@@ -1,15 +1,17 @@
#!/usr/bin/env python
from __future__ import print_function
from argparse import ArgumentParser
# stdlib
import json, sys, os, re
from argparse import ArgumentParser
from datetime import datetime
import html5lib
from xml.etree import cElementTree as ET
from urllib import urlencode
from urllib2 import urlopen, HTTPError, URLError
from xml.etree import cElementTree as ET
# dependencies
import html5lib, jinja2
# local mods
from trim import trim_removed_spans, contents, set_text_contents, text_contents
from linkify import linkify, urlify, filename_to_padid
import jinja2
def get_template_env (tpath=None):
@@ -21,19 +23,34 @@ def get_template_env (tpath=None):
env = jinja2.Environment(loader=loader)
return env
p = ArgumentParser("")
p.add_argument("padid", help="the padid")
p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json")
p.add_argument("--output", default="output", help="path to save files, default: output")
p.add_argument("--verbose", default=False, action="store_true")
p = ArgumentParser("""
_ _ _
___| |_| |__ ___ _ __ __| |_ _ _ __ ___ _ __
/ _ \ __| '_ \ / _ \ '__/ _` | | | | '_ ` _ \| '_ \
| __/ |_| | | | __/ | | (_| | |_| | | | | | | |_) |
\___|\__|_| |_|\___|_| \__,_|\__,_|_| |_| |_| .__/
|_|
""")
p.add_argument("padid", default=[], nargs="*", help="the padid(s) to process")
p.add_argument("--padinfo", default="padinfo.json", help="JSON file with login data for the pad (url, apikey etc), default: padinfo.json")
p.add_argument("--path", default="output", help="path to save files, default: output")
p.add_argument("--verbose", default=False, action="store_true", help="flag for verbose output")
p.add_argument("--limit", type=int, default=None)
p.add_argument("--templates", default="templates")
p.add_argument("--template", default="pad_html.html")
p.add_argument("--allpads", default=False, action="store_true", help="flag to process all pads")
p.add_argument("--spider", default=False, action="store_true", help="flag to spider pads")
p.add_argument("--templatepath", default="templates", help="directory with templates, default: templates")
p.add_argument("--colors-template", default="pad_colors.html", help="pad with authorship colors template name: pad_colors.html")
p.add_argument("--padlink", default=[], action="append", help="give a pad link pattern, example: 'http\:\/\/10\.1\.10\.1/p/(.*)'")
p.add_argument("--linksearch", default=[], action="append", help="specify a link pattern to search for")
p.add_argument("--linkreplace", default=[], action="append", help="specify a replacement pattern to replace preceding linksearch")
p.add_argument("--showurls", default=False, action="store_true", help="flag to display API URLs that are used (to stderr)")
p.add_argument("--hidepaths", default=False, action="store_true", help="flag to not display paths")
p.add_argument("--pretend", default=False, action="store_true", help="flag to not actually save")
p.add_argument("--add-images", default=False, action="store_true", help="flag to add image tags")
# TODO css from pad --- ie specify a padid for a stylesheet!!!!!!
p.add_argument("--css", default="styles.css", help="padid of stylesheet")
args = p.parse_args()
with open(args.padinfo) as f:
@@ -41,62 +58,135 @@ with open(args.padinfo) as f:
apiurl = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info)
# padlinkpats are for mapping internal pad links
# linkpats are any other link replacements, both are regexps
padlinkpats = []
linkpats = [] # [(pat, "\\1.html") for pat in padlinkpats]
linkpats.extend(zip(args.linksearch, args.linkreplace))
if "padlink" in info:
if type(info['padlink']) == list:
padlinkpats.extend(info['padlink'])
else:
padlinkpats.append(info['padlink'])
padlinkpats.extend(args.padlink)
env = get_template_env(args.templatepath)
colors_template = env.get_template(args.colors_template)
linkpats = [] # [(pat, "\\1.html") for pat in padlinkpats]
linkpats.extend(zip(args.linksearch, args.linkreplace))
if args.verbose:
print ("using padlinkpats", padlinkpats)
todo = [args.padid]
todo = args.padid
done = set()
count = 0
data = {}
data['apikey'] = info['apikey']
env = get_template_env(args.templates)
template = env.get_template(args.template)
if args.allpads:
# push the list of all pad names on to todo
list_url = apiurl+'listAllPads?'+urlencode(data)
if args.showurls:
print (list_url, file=sys.stderr)
results = json.load(urlopen(list_url))['data']['padIDs']
todo.extend(results)
while len(todo) > 0:
padid = todo[0]
todo = todo[1:]
done.add(padid)
data = {}
data['apikey'] = info['apikey']
data['padID'] = padid.encode("utf-8")
if args.verbose:
print ("PADID \"{0}\"".format(padid).encode("utf-8"), file=sys.stderr)
out = "{0}/{1}".format(args.output, urlify(padid))
print ("{0}".format(out), file=sys.stderr)
if not args.pretend:
try:
os.makedirs(args.path)
except OSError:
pass
# print ("{0}".format(padid).encode("utf-8"), file=sys.stderr)
total_revisions = apiurl+'getRevisionsCount?'+urlencode(data)
total_revisions = json.load(urlopen(total_revisions))['data']['revisions']
if args.verbose:
print (u"{0} has {1} total revisions...".format(padid, total_revisions).encode("utf-8"), file=sys.stderr)
# _
# _ __ ___ ___| |_ __ _
# | '_ ` _ \ / _ \ __/ _` |
# | | | | | | __/ || (_| |
# |_| |_| |_|\___|\__\__,_|
meta_out = "{0}/{1}".format(args.path, urlify(padid, ext=".json"))
if not args.hidepaths:
print (meta_out, file=sys.stderr)
if not args.pretend:
meta = {}
meta['padid'] = padid
revisions_url = apiurl+'getRevisionsCount?'+urlencode(data)
if args.showurls:
print (revisions_url, file=sys.stderr)
meta['total_revisions'] = json.load(urlopen(revisions_url))['data']['revisions']
lastedited_url = apiurl+'getLastEdited?'+urlencode(data)
if args.showurls:
print (lastedited_url, file=sys.stderr)
lastedited_raw = json.load(urlopen(lastedited_url))['data']['lastEdited']
meta['lastedited_raw'] = lastedited_raw
meta['lastedited'] = datetime.fromtimestamp(int(lastedited_raw)/1000).isoformat()
# author_ids (unfortunately, this is a list of internal etherpad author ids -- not the names ppl type)
authors_url = apiurl+'listAuthorsOfPad?'+urlencode(data)
if args.showurls:
print (authors_url, file=sys.stderr)
meta['author_ids'] = json.load(urlopen(authors_url))['data']['authorIDs']
with open(meta_out, "w") as f:
json.dump(meta, f)
# _ __ __ ___ __
# | '__/ _` \ \ /\ / /
# | | | (_| |\ V V /
# |_| \__,_| \_/\_/
raw_out = "{0}/{1}".format(args.path, urlify(padid, ext=".txt"))
if not args.hidepaths:
print (raw_out, file=sys.stderr)
text_url = apiurl+"getText?"+urlencode(data)
if args.showurls:
print (text_url, file=sys.stderr)
if not args.pretend:
rawText = json.load(urlopen(text_url))['data']['text']
with open(raw_out, "w") as f:
f.write(rawText.encode("utf-8"))
# _ _ _
# | |__ | |_ _ __ ___ | |
# | '_ \| __| '_ ` _ \| |
# | | | | |_| | | | | | |
# |_| |_|\__|_| |_| |_|_|
# todo ? -- regular HTML output
# _
# ___ ___ | | ___ _ __ ___
# / __/ _ \| |/ _ \| '__/ __|
# | (_| (_) | | (_) | | \__ \
# \___\___/|_|\___/|_| |___/
colors_out = "{0}/{1}".format(args.path, urlify(padid, ext=".html"))
if not args.hidepaths:
print (colors_out, file=sys.stderr)
data['startRev'] = "0"
requesturl = apiurl+'createDiffHTML?'+urlencode(data)
html = json.load(urlopen(requesturl))['data']['html']
colors_url = apiurl+'createDiffHTML?'+urlencode(data)
if args.showurls:
print (colors_url, file=sys.stderr)
html = json.load(urlopen(colors_url))['data']['html']
t = html5lib.parse(html, namespaceHTMLElements=False)
trim_removed_spans(t)
html = ET.tostring(t, method="html")
# Stage 1: Process as text
# Process [[wikilink]] style links
# and add linked page names to spider todo list
# and (optionally) add linked page names to spider todo list
html, links = linkify(html)
for l in links:
if l not in todo and l not in done:
if args.verbose:
print (" link: {0}".format(l), file=sys.stderr)
todo.append(l)
if args.spider:
for l in links:
if l not in todo and l not in done:
# if args.verbose:
# print (" link: {0}".format(l), file=sys.stderr)
todo.append(l)
# Stage 2: Process as ElementTree
#
@@ -135,6 +225,20 @@ while len(todo) > 0:
print (" Changed href from {0} to {1}".format(original_href, href), file=sys.stderr)
a.attrib['href'] = href
# SHOWIMAGES : inject img tag for (local) images
if args.add_images:
ext = os.path.splitext(href)[1].lower().lstrip(".")
if ext in ("png", "gif", "jpeg", "jpg"):
# ap = _parent(a)
print ("Adding img '{0}'".format(href), file=sys.stderr)
img = ET.SubElement(a, "img")
br = ET.SubElement(a, "br")
a.remove(img); a.insert(0, img)
a.remove(br); a.insert(1, br)
img.attrib['src'] = href
# extract the style tag (with authorship colors)
style = t.find(".//style")
if style != None:
@@ -144,21 +248,24 @@ while len(todo) > 0:
# and extract the contents of the body
html = contents(t.find(".//body"))
if not args.pretend:
with open(colors_out, "w") as f:
# f.write(html.encode("utf-8"))
f.write(colors_template.render(
html = html,
style = style,
revision = meta['total_revisions'],
padid = padid,
timestamp = datetime.now()
).encode("utf-8"))
try:
os.makedirs(args.output)
except OSError:
pass
with open(out, "w") as f:
# f.write(html.encode("utf-8"))
f.write(template.render(
html = html,
style = style,
revision = total_revisions,
padid = padid,
timestamp = datetime.now()
).encode("utf-8"))
# _
# | | ___ ___ _ __
# | |/ _ \ / _ \| '_ \
# | | (_) | (_) | |_) |
# |_|\___/ \___/| .__/
# |_|
count += 1
if args.limit and count >= args.limit:
break
break

4
linkify.py

@@ -5,8 +5,8 @@ import re, sys
def strip_tags (text):
    """Return *text* with every ``<...>`` tag removed (non-greedy match)."""
    tag_pattern = re.compile(r"<.*?>")
    return tag_pattern.sub("", text)
def urlify (t):
    # Pre-change version (removed by this commit): maps a pad name to an
    # output filename with spaces replaced by underscores and a hard-coded
    # ".html" suffix; the replacement below adds an `ext` parameter instead.
    return t.replace(" ", "_") + ".html"
def urlify (t, ext=".html"):
    """Map a pad name to a filename: spaces become underscores, then *ext*
    (default ".html") is appended."""
    safe_name = t.replace(" ", "_")
    return safe_name + ext
def filename_to_padid (t):
t = t.replace("_", " ")

13
templates/pad_html.html

@@ -1,13 +0,0 @@
{# templates/pad_html.html — Jinja2 page template for a single dumped pad.
   Rendered by etherdump with: padid (pad name, used as the title),
   revision (total revision count, stored in a <meta> tag),
   style (extracted <style> element with authorship colors),
   html (the pad body contents).
   NOTE(review): a `timestamp` value is also passed at render time but is
   not used anywhere in this template. #}
<!DOCTYPE html>
<html>
<head>
	<title>{{padid}}</title>
	<meta charset="utf-8">
	<meta revision="{{revision}}">
	{# pad.css is resolved relative to the output directory #}
	<link rel="stylesheet" type="text/css" href="pad.css">
	{{ style }}
</head>
<body>
{{ html }}
</body>
</html>
Loading…
Cancel
Save