etherdump working with colors

Branch: add-quote-import
Michael Murtaugh, 9 years ago
Commit f30aafb5c7

3 changed files:
  1. etherdump                213 changed lines
  2. linkify.py                 4 changed lines
  3. templates/pad_html.html   13 changed lines

etherdump (213 changed lines)

@@ -1,15 +1,17 @@
 #!/usr/bin/env python
 from __future__ import print_function
-from argparse import ArgumentParser
+# stdlib
 import json, sys, os, re
+from argparse import ArgumentParser
 from datetime import datetime
-import html5lib
+from xml.etree import cElementTree as ET
 from urllib import urlencode
 from urllib2 import urlopen, HTTPError, URLError
 
-from xml.etree import cElementTree as ET
+# dependencies
+import html5lib, jinja2
+# local mods
 from trim import trim_removed_spans, contents, set_text_contents, text_contents
 from linkify import linkify, urlify, filename_to_padid
-import jinja2
 
 def get_template_env (tpath=None):
@@ -21,19 +23,34 @@ def get_template_env (tpath=None):
     env = jinja2.Environment(loader=loader)
     return env
 
 
-p = ArgumentParser("")
-p.add_argument("padid", help="the padid")
-p.add_argument("--padinfo", default="padinfo.json", help="padinfo, default: padinfo.json")
-p.add_argument("--output", default="output", help="path to save files, default: output")
-p.add_argument("--verbose", default=False, action="store_true")
+p = ArgumentParser("""
+      _   _                   _
+  ___| |_| |__   ___ _ __ __| |_   _ _ __ ___  _ __
+ / _ \ __| '_ \ / _ \ '__/ _` | | | | '_ ` _ \| '_ \
+|  __/ |_| | | |  __/ | | (_| | |_| | | | | | | |_) |
+ \___|\__|_| |_|\___|_|  \__,_|\__,_|_| |_| |_| .__/
+                                              |_|
+""")
+p.add_argument("padid", default=[], nargs="*", help="the padid(s) to process")
+p.add_argument("--padinfo", default="padinfo.json", help="JSON file with login data for the pad (url, apikey etc), default: padinfo.json")
+p.add_argument("--path", default="output", help="path to save files, default: output")
+p.add_argument("--verbose", default=False, action="store_true", help="flag for verbose output")
 p.add_argument("--limit", type=int, default=None)
-p.add_argument("--templates", default="templates")
-p.add_argument("--template", default="pad_html.html")
+p.add_argument("--allpads", default=False, action="store_true", help="flag to process all pads")
+p.add_argument("--spider", default=False, action="store_true", help="flag to spider pads")
+p.add_argument("--templatepath", default="templates", help="directory with templates, default: templates")
+p.add_argument("--colors-template", default="pad_colors.html", help="pad with authorship colors template name: pad_colors.html")
 p.add_argument("--padlink", default=[], action="append", help="give a pad link pattern, example: 'http\:\/\/10\.1\.10\.1/p/(.*)'")
 p.add_argument("--linksearch", default=[], action="append", help="specify a link pattern to search for")
 p.add_argument("--linkreplace", default=[], action="append", help="specify a replacement pattern to replace preceding linksearch")
+p.add_argument("--showurls", default=False, action="store_true", help="flag to display API URLs that are used (to stderr)")
+p.add_argument("--hidepaths", default=False, action="store_true", help="flag to not display paths")
+p.add_argument("--pretend", default=False, action="store_true", help="flag to not actually save")
+p.add_argument("--add-images", default=False, action="store_true", help="flag to add image tags")
+# TODO css from pad --- ie specify a padid for a stylesheet!!!!!!
+p.add_argument("--css", default="styles.css", help="padid of stylesheet")
 
 args = p.parse_args()
 
 with open(args.padinfo) as f:
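
A quick illustration (not part of the commit) of the reworked argument handling above: padid is now a positional list (nargs="*") and the new switches are plain store_true flags, so several pads can be named in one run. The pad names below are hypothetical.

    from argparse import ArgumentParser

    p = ArgumentParser()
    p.add_argument("padid", default=[], nargs="*")
    p.add_argument("--spider", default=False, action="store_true")
    p.add_argument("--pretend", default=False, action="store_true")

    # parse_args accepts an explicit argv list, handy for trying this out
    args = p.parse_args(["padname-one", "padname-two", "--spider", "--pretend"])
    print(args.padid)    # ['padname-one', 'padname-two']
    print(args.spider)   # True
    print(args.pretend)  # True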
@@ -41,62 +58,135 @@ with open(args.padinfo) as f:
 apiurl = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info)
 
+# padlinkpats are for mapping internal pad links
+# linkpats are any other link replacements, both are regexps
 padlinkpats = []
+linkpats = [] # [(pat, "\\1.html") for pat in padlinkpats]
+linkpats.extend(zip(args.linksearch, args.linkreplace))
 if "padlink" in info:
     if type(info['padlink']) == list:
         padlinkpats.extend(info['padlink'])
     else:
         padlinkpats.append(info['padlink'])
 padlinkpats.extend(args.padlink)
 
-linkpats = [] # [(pat, "\\1.html") for pat in padlinkpats]
-linkpats.extend(zip(args.linksearch, args.linkreplace))
-if args.verbose:
-    print ("using padlinkpats", padlinkpats)
+env = get_template_env(args.templatepath)
+colors_template = env.get_template(args.colors_template)
 
-todo = [args.padid]
+todo = args.padid
 done = set()
 count = 0
+data = {}
+data['apikey'] = info['apikey']
 
-env = get_template_env(args.templates)
-template = env.get_template(args.template)
+if args.allpads:
+    # push the list of all pad names on to todo
+    list_url = apiurl+'listAllPads?'+urlencode(data)
+    if args.showurls:
+        print (list_url, file=sys.stderr)
+    results = json.load(urlopen(list_url))['data']['padIDs']
+    todo.extend(results)
 
 while len(todo) > 0:
     padid = todo[0]
     todo = todo[1:]
     done.add(padid)
 
-    data = {}
-    data['apikey'] = info['apikey']
     data['padID'] = padid.encode("utf-8")
 
     if args.verbose:
         print ("PADID \"{0}\"".format(padid).encode("utf-8"), file=sys.stderr)
 
-    out = "{0}/{1}".format(args.output, urlify(padid))
-    print ("{0}".format(out), file=sys.stderr)
+    if not args.pretend:
+        try:
+            os.makedirs(args.path)
+        except OSError:
+            pass
+
+    # print ("{0}".format(padid).encode("utf-8"), file=sys.stderr)
 
-    total_revisions = apiurl+'getRevisionsCount?'+urlencode(data)
-    total_revisions = json.load(urlopen(total_revisions))['data']['revisions']
-    if args.verbose:
-        print (u"{0} has {1} total revisions...".format(padid, total_revisions).encode("utf-8"), file=sys.stderr)
+    #                  _
+    #  _ __ ___   ___| |_ __ _
+    # | '_ ` _ \ / _ \ __/ _` |
+    # | | | | | |  __/ || (_| |
+    # |_| |_| |_|\___|\__\__,_|
+
+    meta_out = "{0}/{1}".format(args.path, urlify(padid, ext=".json"))
+    if not args.hidepaths:
+        print (meta_out, file=sys.stderr)
+
+    if not args.pretend:
+        meta = {}
+        meta['padid'] = padid
+        revisions_url = apiurl+'getRevisionsCount?'+urlencode(data)
+        if args.showurls:
+            print (revisions_url, file=sys.stderr)
+        meta['total_revisions'] = json.load(urlopen(revisions_url))['data']['revisions']
+
+        lastedited_url = apiurl+'getLastEdited?'+urlencode(data)
+        if args.showurls:
+            print (lastedited_url, file=sys.stderr)
+        lastedited_raw = json.load(urlopen(lastedited_url))['data']['lastEdited']
+        meta['lastedited_raw'] = lastedited_raw
+        meta['lastedited'] = datetime.fromtimestamp(int(lastedited_raw)/1000).isoformat()
+
+        # author_ids (unfortunately, this is a list of internal etherpad author ids -- not the names ppl type)
+        authors_url = apiurl+'listAuthorsOfPad?'+urlencode(data)
+        if args.showurls:
+            print (authors_url, file=sys.stderr)
+        meta['author_ids'] = json.load(urlopen(authors_url))['data']['authorIDs']
+
+        with open(meta_out, "w") as f:
+            json.dump(meta, f)
+
+    #  _ __ __ ___      __
+    # | '__/ _` \ \ /\ / /
+    # | | | (_| |\ V  V /
+    # |_|  \__,_| \_/\_/
+
+    raw_out = "{0}/{1}".format(args.path, urlify(padid, ext=".txt"))
+    if not args.hidepaths:
+        print (raw_out, file=sys.stderr)
+
+    text_url = apiurl+"getText?"+urlencode(data)
+    if args.showurls:
+        print (text_url, file=sys.stderr)
+    if not args.pretend:
+        rawText = json.load(urlopen(text_url))['data']['text']
+        with open(raw_out, "w") as f:
+            f.write(rawText.encode("utf-8"))
+
+    #  _     _             _
+    # | |__ | |_ _ __ ___ | |
+    # | '_ \| __| '_ ` _ \| |
+    # | | | | |_| | | | | | |
+    # |_| |_|\__|_| |_| |_|_|
+
+    # todo ? -- regular HTML output
+
+    #            _
+    #   ___ ___ | | ___  _ __ ___
+    #  / __/ _ \| |/ _ \| '__/ __|
+    # | (_| (_) | | (_) | |  \__ \
+    #  \___\___/|_|\___/|_|  |___/
+
+    colors_out = "{0}/{1}".format(args.path, urlify(padid, ext=".html"))
+    if not args.hidepaths:
+        print (colors_out, file=sys.stderr)
 
     data['startRev'] = "0"
-    requesturl = apiurl+'createDiffHTML?'+urlencode(data)
-    html = json.load(urlopen(requesturl))['data']['html']
+    colors_url = apiurl+'createDiffHTML?'+urlencode(data)
+    if args.showurls:
+        print (colors_url, file=sys.stderr)
+    html = json.load(urlopen(colors_url))['data']['html']
 
     t = html5lib.parse(html, namespaceHTMLElements=False)
     trim_removed_spans(t)
     html = ET.tostring(t, method="html")
 
     # Stage 1: Process as text
     # Process [[wikilink]] style links
-    # and add linked page names to spider todo list
+    # and (optionally) add linked page names to spider todo list
     html, links = linkify(html)
-    for l in links:
-        if l not in todo and l not in done:
-            if args.verbose:
-                print ("  link: {0}".format(l), file=sys.stderr)
-            todo.append(l)
+    if args.spider:
+        for l in links:
+            if l not in todo and l not in done:
+                # if args.verbose:
+                #     print ("  link: {0}".format(l), file=sys.stderr)
+                todo.append(l)
 
     # Stage 2: Process as ElementTree
     #
@@ -135,6 +225,20 @@ while len(todo) > 0:
                 print ("  Changed href from {0} to {1}".format(original_href, href), file=sys.stderr)
             a.attrib['href'] = href
 
+            # SHOWIMAGES : inject img tag for (local) images
+            if args.add_images:
+                ext = os.path.splitext(href)[1].lower().lstrip(".")
+                if ext in ("png", "gif", "jpeg", "jpg"):
+                    # ap = _parent(a)
+                    print ("Adding img '{0}'".format(href), file=sys.stderr)
+                    img = ET.SubElement(a, "img")
+                    br = ET.SubElement(a, "br")
+                    a.remove(img); a.insert(0, img)
+                    a.remove(br); a.insert(1, br)
+                    img.attrib['src'] = href
+
     # extract the style tag (with authorship colors)
     style = t.find(".//style")
     if style != None:
@@ -144,21 +248,24 @@ while len(todo) > 0:
     # and extract the contents of the body
     html = contents(t.find(".//body"))
 
-    try:
-        os.makedirs(args.output)
-    except OSError:
-        pass
-
-    with open(out, "w") as f:
-        # f.write(html.encode("utf-8"))
-        f.write(template.render(
-            html = html,
-            style = style,
-            revision = total_revisions,
-            padid = padid,
-            timestamp = datetime.now()
-        ).encode("utf-8"))
+    if not args.pretend:
+        with open(colors_out, "w") as f:
+            # f.write(html.encode("utf-8"))
+            f.write(colors_template.render(
+                html = html,
+                style = style,
+                revision = meta['total_revisions'],
+                padid = padid,
+                timestamp = datetime.now()
+            ).encode("utf-8"))
+
+    #  _
+    # | | ___   ___  _ __
+    # | |/ _ \ / _ \| '_ \
+    # | | (_) | (_) | |_) |
+    # |_|\___/ \___/| .__/
+    #               |_|
 
     count += 1
     if args.limit and count >= args.limit:
         break
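
For orientation, the per-pad loop above reduces to a handful of Etherpad HTTP API calls (getRevisionsCount, getText, createDiffHTML with startRev=0). A minimal sketch, assuming the same padinfo.json fields the script reads (protocol, hostname, port, apiurl, apiversion, apikey) and a hypothetical pad id "mypad"; Python 2, to match the script's urllib/urllib2 imports.

    import json
    from urllib import urlencode
    from urllib2 import urlopen

    with open("padinfo.json") as f:
        info = json.load(f)
    apiurl = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info)

    data = {'apikey': info['apikey'], 'padID': "mypad"}
    # metadata, raw text and colorized HTML, as in the meta/raw/colors sections above
    revisions = json.load(urlopen(apiurl+'getRevisionsCount?'+urlencode(data)))['data']['revisions']
    text = json.load(urlopen(apiurl+'getText?'+urlencode(data)))['data']['text']
    data['startRev'] = "0"
    html = json.load(urlopen(apiurl+'createDiffHTML?'+urlencode(data)))['data']['html']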

linkify.py (4 changed lines)

@@ -5,8 +5,8 @@ import re, sys
 def strip_tags (text):
     return re.sub(r"<.*?>", "", text)
 
-def urlify (t):
-    return t.replace(" ", "_") + ".html"
+def urlify (t, ext=".html"):
+    return t.replace(" ", "_") + ext
 
 def filename_to_padid (t):
     t = t.replace("_", " ")

templates/pad_html.html (13 changed lines, file removed)

@@ -1,13 +0,0 @@
-<!DOCTYPE html>
-<html>
-<head>
-<title>{{padid}}</title>
-<meta charset="utf-8">
-<meta revision="{{revision}}">
-<link rel="stylesheet" type="text/css" href="pad.css">
-{{ style }}
-</head>
-<body>
-{{ html }}
-</body>
-</html>
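
The static pad_html.html template is dropped; the script now renders whatever template --colors-template names (pad_colors.html by default), which is not part of this diff. A minimal sketch of the render call that template receives, using an inline stand-in with the same placeholders as the deleted file; the html and style values are placeholders.

    from datetime import datetime
    import jinja2

    # stand-in for pad_colors.html (assumed to use the same variables)
    template = jinja2.Template(u"""<!DOCTYPE html>
    <html>
    <head>
      <title>{{padid}}</title>
      <meta charset="utf-8">
      <meta revision="{{revision}}">
      {{ style }}
    </head>
    <body>{{ html }}</body>
    </html>""")

    print(template.render(
        html = u"<p>pad body</p>",
        style = u"<style>/* authorship colors */</style>",
        revision = 123,
        padid = u"mypad",
        timestamp = datetime.now()
    ))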