
reorg into proper package with bin script

Michael Murtaugh, 9 years ago
branch: add-quote-import
commit 52e61d1b7c
.gitignore                                |   4
bin/etherdump                             |   7
etherdump.py                              | 379
etherdump/__init__.py                     | 384
etherdump/data/templates/index.html       |   0
etherdump/data/templates/pad.html         |   0
etherdump/data/templates/pad_colors.html  |   0
etherdump_original                        | 413
padserver.py                              |  83

.gitignore (4 changed lines)

@@ -1,7 +1,7 @@
 venv/
 sites/
 build/
 *.pyc
 *~
-venv/
-padinfo.json
+*.json
+!padinfo.sample.json

bin/etherdump (new file, 7 lines)

@@ -0,0 +1,7 @@
#!/usr/bin/env python
from etherdump import main
main()
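
The wrapper script simply imports main() from the new etherdump package, so a setup script only has to put bin/etherdump on the PATH and ship the package together with its data/templates. A minimal sketch of such a setup.py, which is not part of this commit (name, version and the glob are assumptions):

from distutils.core import setup

setup(
    name="etherdump",
    version="0.1",
    packages=["etherdump"],
    # install the bin/ wrapper onto the user's PATH
    scripts=["bin/etherdump"],
    # include the default templates that now live inside the package
    package_data={"etherdump": ["data/templates/*.html"]},
)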

etherdump.py (deleted, 379 lines)

@@ -1,379 +0,0 @@
#!/usr/bin/env python
# License: AGPL
#
from __future__ import print_function
# stdlib
import json, sys, os, re
from argparse import ArgumentParser
from datetime import datetime
from xml.etree import cElementTree as ET
from urllib import urlencode
from urllib2 import urlopen, HTTPError, URLError
# external dependencies (use pip to install these)
import html5lib, jinja2
def filename_to_padid (t):
t = t.replace("_", " ")
t = re.sub(r"\.html$", "", t)
return t
def normalize_pad_name (n):
if '?' in n:
n = n.split('?', 1)[0]
if '/' in n:
n = n.split('/', 1)[0]
return n
def urlify (t, ext=".html"):
return t.replace(" ", "_") + ext
def linkify (src, urlify=urlify):
collect = []
def s (m):
contents = strip_tags(m.group(1))
contents = normalize_pad_name(contents)
collect.append(contents)
link = urlify(contents)
# link = link.split("?", 1)[0]
return "[[<a class=\"wikilink\" href=\"{0}\">{1}</a>]]".format(link, contents)
# src = re.sub(r"\[\[([\w_\- ,]+?)\]\]", s, src)
## question marks are ignored by etherpad, so split/strip it
## strip slashes as well!! (/timeslider)
src = re.sub(r"\[\[(.+?)\]\]", s, src)
return (src, collect)
def strip_tags (text):
return re.sub(r"<.*?>", "", text)
def set_text_contents (element, text):
""" ok this isn't really general, but works for singly wrapped elements """
while len(element) == 1:
element = element[0]
element.text = text
def text_contents (element):
return (element.text or '') + ''.join([text_contents(c) for c in element]) + (element.tail or '')
def contents (element, method="html"):
return (element.text or '') + ''.join([ET.tostring(c, method=method) for c in element])
def get_parent(tree, elt):
for parent in tree.iter():
for child in parent:
if child == elt:
return parent
def remove_recursive (tree, elt):
""" Remove element and (any resulting) empty containing elements """
p = get_parent(tree, elt)
if p:
p.remove(elt)
if len(p) == 0 and (p.text == None or p.text.strip() == ""):
# print ("empty parent", p, file=sys.stderr)
remove_recursive(tree, p)
def trim_removed_spans (t):
# remove <span class="removed"> and empty parents
for n in t.findall(".//span[@class='removed']"):
remove_recursive(t, n)
# then strip any leading br's from body
while True:
tag = t.find("./body")[0]
if tag.tag == "br":
remove_recursive(t, tag)
else:
break
def get_template_env (tpath=None):
paths = []
if tpath and os.path.isdir(tpath):
paths.append(tpath)
# paths.append(TEMPLATES_PATH)
loader = jinja2.FileSystemLoader(paths)
env = jinja2.Environment(loader=loader)
return env
p = ArgumentParser("""
_ _ _
___| |_| |__ ___ _ __ __| |_ _ _ __ ___ _ __
/ _ \ __| '_ \ / _ \ '__/ _` | | | | '_ ` _ \| '_ \
| __/ |_| | | | __/ | | (_| | |_| | | | | | | |_) |
\___|\__|_| |_|\___|_| \__,_|\__,_|_| |_| |_| .__/
|_|
""")
p.add_argument("padid", default=[], nargs="*", help="the padid(s) to process")
p.add_argument("--padinfo", default="padinfo.json", help="JSON file with login data for the pad (url, apikey etc), default: padinfo.json")
p.add_argument("--path", default="output", help="path to save files, default: output")
p.add_argument("--verbose", default=False, action="store_true", help="flag for verbose output")
p.add_argument("--limit", type=int, default=None)
p.add_argument("--allpads", default=False, action="store_true", help="flag to process all pads")
p.add_argument("--spider", default=False, action="store_true", help="flag to spider pads")
p.add_argument("--templatepath", default="templates", help="directory with templates, default: templates")
p.add_argument("--colors-template", default="pad_colors.html", help="pad with authorship colors template name: pad_colors.html")
p.add_argument("--padlink", default=[], action="append", help="give a pad link pattern, example: 'http\:\/\/10\.1\.10\.1/p/(.*)'")
p.add_argument("--linksearch", default=[], action="append", help="specify a link pattern to search for")
p.add_argument("--linkreplace", default=[], action="append", help="specify a replacement pattern to replace preceding linksearch")
p.add_argument("--showurls", default=False, action="store_true", help="flag to display API URLs that are used (to stderr)")
p.add_argument("--hidepaths", default=False, action="store_true", help="flag to not display paths")
p.add_argument("--pretend", default=False, action="store_true", help="flag to not actually save")
p.add_argument("--add-images", default=False, action="store_true", help="flag to add image tags")
p.add_argument("--authors-css", default="authors.css", help="filename to save collected authorship css (nb: etherdump will overwrite this file!)")
# TODO css from pad --- ie specify a padid for a stylesheet!!!!!!
# p.add_argument("--css", default="styles.css", help="padid of stylesheet")
args = p.parse_args()
with open(args.padinfo) as f:
info = json.load(f)
apiurl = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info)
# padlinkpats are for mapping internal pad links
# linkpats are any other link replacements, both are regexps
padlinkpats = []
linkpats = [] # [(pat, "\\1.html") for pat in padlinkpats]
linkpats.extend(zip(args.linksearch, args.linkreplace))
if "padlink" in info:
if type(info['padlink']) == list:
padlinkpats.extend(info['padlink'])
else:
padlinkpats.append(info['padlink'])
padlinkpats.extend(args.padlink)
env = get_template_env(args.templatepath)
colors_template = env.get_template(args.colors_template)
todo = args.padid
done = set()
count = 0
data = {}
authors_css_rules = {}
data['apikey'] = info['apikey']
if args.allpads:
# push the list of all pad names on to todo
list_url = apiurl+'listAllPads?'+urlencode(data)
if args.showurls:
print (list_url, file=sys.stderr)
results = json.load(urlopen(list_url))['data']['padIDs']
todo.extend(results)
while len(todo) > 0:
padid = todo[0]
todo = todo[1:]
done.add(padid)
data['padID'] = padid.encode("utf-8")
if args.verbose:
print ("PADID \"{0}\"".format(padid).encode("utf-8"), file=sys.stderr)
if not args.pretend:
try:
os.makedirs(args.path)
except OSError:
pass
try:
# _
# _ __ ___ ___| |_ __ _
# | '_ ` _ \ / _ \ __/ _` |
# | | | | | | __/ || (_| |
# |_| |_| |_|\___|\__\__,_|
meta_url = urlify(padid, ext=".json")
meta_out = "{0}/{1}".format(args.path, meta_url.encode("utf-8"))
raw_url = urlify(padid, ext=".txt")
raw_out = "{0}/{1}".format(args.path, raw_url.encode("utf-8"))
colors_url = urlify(padid, ext=".html")
colors_out = "{0}/{1}".format(args.path, colors_url.encode("utf-8"))
if not args.hidepaths:
print (meta_out, file=sys.stderr)
if not args.pretend:
meta = {}
meta['padid'] = padid
revisions_url = apiurl+'getRevisionsCount?'+urlencode(data)
if args.showurls:
print (revisions_url, file=sys.stderr)
meta['total_revisions'] = json.load(urlopen(revisions_url))['data']['revisions']
lastedited_url = apiurl+'getLastEdited?'+urlencode(data)
if args.showurls:
print (lastedited_url, file=sys.stderr)
lastedited_raw = json.load(urlopen(lastedited_url))['data']['lastEdited']
meta['lastedited_raw'] = lastedited_raw
meta['lastedited'] = datetime.fromtimestamp(int(lastedited_raw)/1000).isoformat()
# author_ids (unfortunately, this is a list of internal etherpad author ids -- not the names ppl type)
authors_url = apiurl+'listAuthorsOfPad?'+urlencode(data)
if args.showurls:
print (authors_url, file=sys.stderr)
meta['author_ids'] = json.load(urlopen(authors_url))['data']['authorIDs']
meta['colors'] = colors_url
meta['raw'] = raw_url
meta['meta'] = meta_url
with open(meta_out, "w") as f:
json.dump(meta, f)
# _ __ __ ___ __
# | '__/ _` \ \ /\ / /
# | | | (_| |\ V V /
# |_| \__,_| \_/\_/
if not args.hidepaths:
print (raw_out, file=sys.stderr)
text_url = apiurl+"getText?"+urlencode(data)
if args.showurls:
print (text_url, file=sys.stderr)
if not args.pretend:
rawText = json.load(urlopen(text_url))['data']['text']
with open(raw_out, "w") as f:
f.write(rawText.encode("utf-8"))
# _ _ _
# | |__ | |_ _ __ ___ | |
# | '_ \| __| '_ ` _ \| |
# | | | | |_| | | | | | |
# |_| |_|\__|_| |_| |_|_|
# todo ? -- regular HTML output
# _
# ___ ___ | | ___ _ __ ___
# / __/ _ \| |/ _ \| '__/ __|
# | (_| (_) | | (_) | | \__ \
# \___\___/|_|\___/|_| |___/
if not args.hidepaths:
print (colors_out, file=sys.stderr)
data['startRev'] = "0"
colors_url = apiurl+'createDiffHTML?'+urlencode(data)
if args.showurls:
print (colors_url, file=sys.stderr)
html = json.load(urlopen(colors_url))['data']['html']
t = html5lib.parse(html, namespaceHTMLElements=False)
trim_removed_spans(t)
html = ET.tostring(t, method="html")
# Stage 1: Process as text
# Process [[wikilink]] style links
# and (optionally) add linked page names to spider todo list
html, links = linkify(html)
if args.spider:
for l in links:
if l not in todo and l not in done:
if l.startswith("http://") or l.startswith("https://"):
if args.verbose:
print ("Ignoring absolute URL in [[ link ]] form", file=sys.stderr)
continue
# if args.verbose:
# print (" link: {0}".format(l), file=sys.stderr)
todo.append(l)
# Stage 2: Process as ElementTree
#
t = html5lib.parse(html, namespaceHTMLElements=False)
# apply linkpats
for a in t.findall(".//a"):
href = a.attrib.get("href")
original_href = href
if href:
# if args.verbose:
# print ("searching for PADLINK: {0}".format(href))
for pat in padlinkpats:
if re.search(pat, href) != None:
# if args.verbose:
# print (" found PADLINK: {0}".format(href))
href = re.sub(pat, "\\1.html", href)
padid = filename_to_padid(href)
set_text_contents(a, "[[{0}]]".format(padid))
if padid not in todo and padid not in done:
if args.verbose:
print (" link: {0}".format(padid), file=sys.stderr)
todo.append(padid)
# apply linkpats
for s, r in linkpats:
href = re.sub(s, r, href)
if href != original_href:
old_contents = text_contents(a)
# print ("OLD_CONTENTS {0}".format(old_contents))
if old_contents == original_href:
if args.verbose:
print (" Updating href IN TEXT", file=sys.stderr)
set_text_contents(a, href)
if original_href != href:
if args.verbose:
print (" Changed href from {0} to {1}".format(original_href, href), file=sys.stderr)
a.attrib['href'] = href
# SHOWIMAGES : inject img tag for (local) images
if args.add_images:
ext = os.path.splitext(href)[1].lower().lstrip(".")
if ext in ("png", "gif", "jpeg", "jpg"):
# ap = _parent(a)
print ("Adding img '{0}'".format(href), file=sys.stderr)
img = ET.SubElement(a, "img")
br = ET.SubElement(a, "br")
a.remove(img); a.insert(0, img)
a.remove(br); a.insert(1, br)
img.attrib['src'] = href
# extract the style tag (with authorship colors)
style = t.find(".//style")
if style != None:
if args.authors_css:
for i in style.text.splitlines():
if len(i):
selector, rule = i.split(' ',1)
authors_css_rules[selector] = rule
style = '' # strip the individual style tag from each page (only exports to authors-css file)
# nb: it's up to the template to refer to the authors-css file
else:
style = ET.tostring(style, method="html")
else:
style = ""
# and extract the contents of the body
html = contents(t.find(".//body"))
if not args.pretend:
with open(colors_out, "w") as f:
# f.write(html.encode("utf-8"))
f.write(colors_template.render(
html = html,
style = style,
revision = meta['total_revisions'],
padid = padid,
timestamp = datetime.now(),
meta_url = meta_url,
raw_url = raw_url,
colors_url = colors_url,
lastedited = meta['lastedited']
).encode("utf-8"))
# _
# | | ___ ___ _ __
# | |/ _ \ / _ \| '_ \
# | | (_) | (_) | |_) |
# |_|\___/ \___/| .__/
# |_|
count += 1
if args.limit and count >= args.limit:
break
except TypeError:
print ("ERROR, skipping!", file=sys.stderr)
# Write the unified CSS with authors
if args.authors_css:
with open(args.authors_css, 'w') as css:
for selector, rule in sorted(authors_css_rules.items()):
css.write(selector+' '+rule+'\n')

etherdump/__init__.py (new file, 384 lines)

@@ -0,0 +1,384 @@
#!/usr/bin/env python
# License: AGPL
#
from __future__ import print_function
# stdlib
import json, sys, os, re
from argparse import ArgumentParser
from datetime import datetime
from xml.etree import cElementTree as ET
from urllib import urlencode
from urllib2 import urlopen, HTTPError, URLError
# external dependencies (use pip to install these)
import html5lib, jinja2
DATAPATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")
def filename_to_padid (t):
t = t.replace("_", " ")
t = re.sub(r"\.html$", "", t)
return t
def normalize_pad_name (n):
if '?' in n:
n = n.split('?', 1)[0]
if '/' in n:
n = n.split('/', 1)[0]
return n
def urlify (t, ext=".html"):
return t.replace(" ", "_") + ext
def linkify (src, urlify=urlify):
collect = []
def s (m):
contents = strip_tags(m.group(1))
contents = normalize_pad_name(contents)
collect.append(contents)
link = urlify(contents)
# link = link.split("?", 1)[0]
return "[[<a class=\"wikilink\" href=\"{0}\">{1}</a>]]".format(link, contents)
# src = re.sub(r"\[\[([\w_\- ,]+?)\]\]", s, src)
## question marks are ignored by etherpad, so split/strip it
## strip slashes as well!! (/timeslider)
src = re.sub(r"\[\[(.+?)\]\]", s, src)
return (src, collect)
def strip_tags (text):
return re.sub(r"<.*?>", "", text)
def set_text_contents (element, text):
""" ok this isn't really general, but works for singly wrapped elements """
while len(element) == 1:
element = element[0]
element.text = text
def text_contents (element):
return (element.text or '') + ''.join([text_contents(c) for c in element]) + (element.tail or '')
def contents (element, method="html"):
return (element.text or '') + ''.join([ET.tostring(c, method=method) for c in element])
def get_parent(tree, elt):
for parent in tree.iter():
for child in parent:
if child == elt:
return parent
def remove_recursive (tree, elt):
""" Remove element and (any resulting) empty containing elements """
p = get_parent(tree, elt)
if p:
p.remove(elt)
if len(p) == 0 and (p.text == None or p.text.strip() == ""):
# print ("empty parent", p, file=sys.stderr)
remove_recursive(tree, p)
def trim_removed_spans (t):
# remove <span class="removed"> and empty parents
for n in t.findall(".//span[@class='removed']"):
remove_recursive(t, n)
# then strip any leading br's from body
while True:
tag = t.find("./body")[0]
if tag.tag == "br":
remove_recursive(t, tag)
else:
break
def get_template_env (tpath=None):
paths = []
if tpath and os.path.isdir(tpath):
paths.append(tpath)
# paths.append(TEMPLATES_PATH)
loader = jinja2.FileSystemLoader(paths)
env = jinja2.Environment(loader=loader)
return env
def main():
p = ArgumentParser("""
_ _ _
___| |_| |__ ___ _ __ __| |_ _ _ __ ___ _ __
/ _ \ __| '_ \ / _ \ '__/ _` | | | | '_ ` _ \| '_ \
| __/ |_| | | | __/ | | (_| | |_| | | | | | | |_) |
\___|\__|_| |_|\___|_| \__,_|\__,_|_| |_| |_| .__/
|_|
""")
p.add_argument("padid", default=[], nargs="*", help="the padid(s) to process")
p.add_argument("--padinfo", default="padinfo.json", help="JSON file with login data for the pad (url, apikey etc), default: padinfo.json")
p.add_argument("--path", default="output", help="path to save files, default: output")
p.add_argument("--verbose", default=False, action="store_true", help="flag for verbose output")
p.add_argument("--limit", type=int, default=None)
p.add_argument("--allpads", default=False, action="store_true", help="flag to process all pads")
p.add_argument("--spider", default=False, action="store_true", help="flag to spider pads")
p.add_argument("--templatepath", default=os.path.join(DATAPATH, "templates"), help="directory with templates (override default files)")
p.add_argument("--colors-template", default="pad_colors.html", help="pad with authorship colors template name: pad_colors.html")
p.add_argument("--padlink", default=[], action="append", help="give a pad link pattern, example: 'http\:\/\/10\.1\.10\.1/p/(.*)'")
p.add_argument("--linksearch", default=[], action="append", help="specify a link pattern to search for")
p.add_argument("--linkreplace", default=[], action="append", help="specify a replacement pattern to replace preceding linksearch")
p.add_argument("--showurls", default=False, action="store_true", help="flag to display API URLs that are used (to stderr)")
p.add_argument("--hidepaths", default=False, action="store_true", help="flag to not display paths")
p.add_argument("--pretend", default=False, action="store_true", help="flag to not actually save")
p.add_argument("--add-images", default=False, action="store_true", help="flag to add image tags")
p.add_argument("--authors-css", default="authors.css", help="filename to save collected authorship css (nb: etherdump will overwrite this file!)")
# TODO css from pad --- ie specify a padid for a stylesheet!!!!!!
# p.add_argument("--css", default="styles.css", help="padid of stylesheet")
args = p.parse_args()
with open(args.padinfo) as f:
info = json.load(f)
apiurl = "{0[protocol]}://{0[hostname]}:{0[port]}{0[apiurl]}{0[apiversion]}/".format(info)
# padlinkpats are for mapping internal pad links
# linkpats are any other link replacements, both are regexps
padlinkpats = []
linkpats = [] # [(pat, "\\1.html") for pat in padlinkpats]
linkpats.extend(zip(args.linksearch, args.linkreplace))
if "padlink" in info:
if type(info['padlink']) == list:
padlinkpats.extend(info['padlink'])
else:
padlinkpats.append(info['padlink'])
padlinkpats.extend(args.padlink)
env = get_template_env(args.templatepath)
colors_template = env.get_template(args.colors_template)
todo = args.padid
done = set()
count = 0
data = {}
authors_css_rules = {}
data['apikey'] = info['apikey']
if args.allpads:
# push the list of all pad names on to todo
list_url = apiurl+'listAllPads?'+urlencode(data)
if args.showurls:
print (list_url, file=sys.stderr)
results = json.load(urlopen(list_url))['data']['padIDs']
todo.extend(results)
while len(todo) > 0:
padid = todo[0]
todo = todo[1:]
done.add(padid)
data['padID'] = padid.encode("utf-8")
if args.verbose:
print ("PADID \"{0}\"".format(padid).encode("utf-8"), file=sys.stderr)
if not args.pretend:
try:
os.makedirs(args.path)
except OSError:
pass
try:
# _
# _ __ ___ ___| |_ __ _
# | '_ ` _ \ / _ \ __/ _` |
# | | | | | | __/ || (_| |
# |_| |_| |_|\___|\__\__,_|
meta_url = urlify(padid, ext=".json")
meta_out = "{0}/{1}".format(args.path, meta_url.encode("utf-8"))
raw_url = urlify(padid, ext=".txt")
raw_out = "{0}/{1}".format(args.path, raw_url.encode("utf-8"))
colors_url = urlify(padid, ext=".html")
colors_out = "{0}/{1}".format(args.path, colors_url.encode("utf-8"))
if not args.hidepaths:
print (meta_out, file=sys.stderr)
if not args.pretend:
meta = {}
meta['padid'] = padid
revisions_url = apiurl+'getRevisionsCount?'+urlencode(data)
if args.showurls:
print (revisions_url, file=sys.stderr)
meta['total_revisions'] = json.load(urlopen(revisions_url))['data']['revisions']
lastedited_url = apiurl+'getLastEdited?'+urlencode(data)
if args.showurls:
print (lastedited_url, file=sys.stderr)
lastedited_raw = json.load(urlopen(lastedited_url))['data']['lastEdited']
meta['lastedited_raw'] = lastedited_raw
meta['lastedited'] = datetime.fromtimestamp(int(lastedited_raw)/1000).isoformat()
# author_ids (unfortunately, this is a list of internal etherpad author ids -- not the names ppl type)
authors_url = apiurl+'listAuthorsOfPad?'+urlencode(data)
if args.showurls:
print (authors_url, file=sys.stderr)
meta['author_ids'] = json.load(urlopen(authors_url))['data']['authorIDs']
meta['colors'] = colors_url
meta['raw'] = raw_url
meta['meta'] = meta_url
with open(meta_out, "w") as f:
json.dump(meta, f)
# _ __ __ ___ __
# | '__/ _` \ \ /\ / /
# | | | (_| |\ V V /
# |_| \__,_| \_/\_/
if not args.hidepaths:
print (raw_out, file=sys.stderr)
text_url = apiurl+"getText?"+urlencode(data)
if args.showurls:
print (text_url, file=sys.stderr)
if not args.pretend:
rawText = json.load(urlopen(text_url))['data']['text']
with open(raw_out, "w") as f:
f.write(rawText.encode("utf-8"))
# _ _ _
# | |__ | |_ _ __ ___ | |
# | '_ \| __| '_ ` _ \| |
# | | | | |_| | | | | | |
# |_| |_|\__|_| |_| |_|_|
# todo ? -- regular HTML output
# _
# ___ ___ | | ___ _ __ ___
# / __/ _ \| |/ _ \| '__/ __|
# | (_| (_) | | (_) | | \__ \
# \___\___/|_|\___/|_| |___/
if not args.hidepaths:
print (colors_out, file=sys.stderr)
data['startRev'] = "0"
colors_url = apiurl+'createDiffHTML?'+urlencode(data)
if args.showurls:
print (colors_url, file=sys.stderr)
html = json.load(urlopen(colors_url))['data']['html']
t = html5lib.parse(html, namespaceHTMLElements=False)
trim_removed_spans(t)
html = ET.tostring(t, method="html")
# Stage 1: Process as text
# Process [[wikilink]] style links
# and (optionally) add linked page names to spider todo list
html, links = linkify(html)
if args.spider:
for l in links:
if l not in todo and l not in done:
if l.startswith("http://") or l.startswith("https://"):
if args.verbose:
print ("Ignoring absolute URL in [[ link ]] form", file=sys.stderr)
continue
# if args.verbose:
# print (" link: {0}".format(l), file=sys.stderr)
todo.append(l)
# Stage 2: Process as ElementTree
#
t = html5lib.parse(html, namespaceHTMLElements=False)
# apply linkpats
for a in t.findall(".//a"):
href = a.attrib.get("href")
original_href = href
if href:
# if args.verbose:
# print ("searching for PADLINK: {0}".format(href))
for pat in padlinkpats:
if re.search(pat, href) != None:
# if args.verbose:
# print (" found PADLINK: {0}".format(href))
href = re.sub(pat, "\\1.html", href)
padid = filename_to_padid(href)
set_text_contents(a, "[[{0}]]".format(padid))
if padid not in todo and padid not in done:
if args.verbose:
print (" link: {0}".format(padid), file=sys.stderr)
todo.append(padid)
# apply linkpats
for s, r in linkpats:
href = re.sub(s, r, href)
if href != original_href:
old_contents = text_contents(a)
# print ("OLD_CONTENTS {0}".format(old_contents))
if old_contents == original_href:
if args.verbose:
print (" Updating href IN TEXT", file=sys.stderr)
set_text_contents(a, href)
if original_href != href:
if args.verbose:
print (" Changed href from {0} to {1}".format(original_href, href), file=sys.stderr)
a.attrib['href'] = href
# SHOWIMAGES : inject img tag for (local) images
if args.add_images:
ext = os.path.splitext(href)[1].lower().lstrip(".")
if ext in ("png", "gif", "jpeg", "jpg"):
# ap = _parent(a)
print ("Adding img '{0}'".format(href), file=sys.stderr)
img = ET.SubElement(a, "img")
br = ET.SubElement(a, "br")
a.remove(img); a.insert(0, img)
a.remove(br); a.insert(1, br)
img.attrib['src'] = href
# extract the style tag (with authorship colors)
style = t.find(".//style")
if style != None:
if args.authors_css:
for i in style.text.splitlines():
if len(i):
selector, rule = i.split(' ',1)
authors_css_rules[selector] = rule
style = '' # strip the individual style tag from each page (only exports to authors-css file)
# nb: it's up to the template to refer to the authors-css file
else:
style = ET.tostring(style, method="html")
else:
style = ""
# and extract the contents of the body
html = contents(t.find(".//body"))
if not args.pretend:
with open(colors_out, "w") as f:
# f.write(html.encode("utf-8"))
f.write(colors_template.render(
html = html,
style = style,
revision = meta['total_revisions'],
padid = padid,
timestamp = datetime.now(),
meta_url = meta_url,
raw_url = raw_url,
colors_url = colors_url,
lastedited = meta['lastedited']
).encode("utf-8"))
# _
# | | ___ ___ _ __
# | |/ _ \ / _ \| '_ \
# | | (_) | (_) | |_) |
# |_|\___/ \___/| .__/
# |_|
count += 1
if args.limit and count >= args.limit:
break
except TypeError:
print ("ERROR, skipping!", file=sys.stderr)
# Write the unified CSS with authors
if args.authors_css:
with open(args.authors_css, 'w') as css:
for selector, rule in sorted(authors_css_rules.items()):
css.write(selector+' '+rule+'\n')
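
main() builds the API base URL from the --padinfo file as {protocol}://{hostname}:{port}{apiurl}{apiversion}/ and reads apikey (and optionally padlink) from the same file. A sketch of writing a matching padinfo.sample.json; all values below are placeholders:

import json

padinfo = {
    "protocol": "http",
    "hostname": "localhost",
    "port": 9001,
    "apiurl": "/api/",
    "apiversion": "1.2.9",
    "apikey": "<contents of your etherpad APIKEY.txt>",
    # optional: pattern(s) for internal pad links, same role as --padlink
    "padlink": ["http\\:\\/\\/localhost\\:9001/p/(.*)"],
}

with open("padinfo.sample.json", "w") as f:
    json.dump(padinfo, f, indent=2)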

templates/index.html → etherdump/data/templates/index.html (renamed, 0 changes)
templates/pad.html → etherdump/data/templates/pad.html (renamed, 0 changes)
templates/pad_colors.html → etherdump/data/templates/pad_colors.html (renamed, 0 changes)
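
With the templates moved into the package, the --templatepath default now resolves next to the installed module instead of the current working directory. What that resolution amounts to, mirroring the DATAPATH logic in etherdump/__init__.py:

import os
import etherdump

datapath = os.path.join(os.path.dirname(os.path.realpath(etherdump.__file__)), "data")
print(os.path.join(datapath, "templates", "pad_colors.html"))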

etherdump_original (deleted, 413 lines)

@@ -1,413 +0,0 @@
#!/usr/bin/env python
from __future__ import print_function
import sys, argparse, json, re, os, time
from urllib2 import urlopen, HTTPError, URLError
import html5lib, urllib2, urllib
from xml.etree import ElementTree as ET
from urllib import urlencode
from urlparse import urljoin
from datetime import datetime
from padserver import PadServer
PADINFO_DEFAULTS = {
"hostname": "",
"port": 9001,
"apiversion": "1.2.9",
"apiurl": "/api/"
}
MODULE_PATH = (os.path.dirname(__file__))
TEMPLATES_PATH = os.path.join(MODULE_PATH, "templates")
verbose = False
def pad_split_group (n):
m = re.match(r"g\.(\w+)\$(.+)$", n)
if m:
return m.groups()
else:
return ('', n)
def content(tag):
if tag.text == None:
return u''.join(ET.tostring(e) for e in tag)
else:
return tag.text + u''.join(ET.tostring(e) for e in tag)
def get_template_env (tpath=None):
import jinja2
paths = []
if tpath and os.path.isdir(tpath):
paths.append(tpath)
paths.append(TEMPLATES_PATH)
loader = jinja2.FileSystemLoader(paths)
env = jinja2.Environment(loader=loader)
return env
# template = env.get_template('pad.html')
# print template.render(the='variables', go='here').encode("utf-8")
def dumpPads (padserver, padids, outputpath, pub_path, group_path, sleeptime=0.01, force=False, templates=None, groupinfo=None):
template_env = get_template_env(templates)
pad_template = template_env.get_template("pad.html")
numpads = len(padids)
for i, padid in enumerate(padids):
group_id, pad_name = pad_split_group(padid)
if group_id:
try:
os.mkdir(os.path.join(outputpath, group_path))
except OSError:
pass
try:
os.mkdir(os.path.join(outputpath, group_path, group_id))
except OSError:
pass
fp = os.path.join(outputpath, group_path, group_id, pad_name)
else:
try:
os.mkdir(os.path.join(outputpath, pub_path))
except OSError:
pass
fp = os.path.join(outputpath, pub_path, pad_name)
if verbose:
print (u"Saving to {0}".format(fp).encode("utf-8"), file=sys.stderr)
else:
sys.stderr.write("\rDumping pads... [{0}/{1}]".format(i+1, numpads))
sys.stderr.flush()
textpath = fp + ".txt"
htmlpath = fp+".html"
metapath = fp+".json"
last_edited = padserver.getPadLastEdited(padid)
if last_edited:
last_edited = last_edited.isoformat()
else:
last_edited = ''
if os.path.exists(metapath):
with open(metapath) as f:
meta = json.load(f)
if not force and meta.get("last_edited") and meta.get("last_edited") == last_edited:
if verbose:
print("Up to date, skipping", file=sys.stderr)
continue
meta = {
'pad_id': padid,
'group_id': group_id,
'pad_name': pad_name
}
meta['last_edited'] = last_edited
# Write Text
with open(textpath, "w") as f:
try:
text = padserver.getPadText(padid)
f.write(text.encode("utf-8"))
meta['text_path'] = os.path.relpath(textpath, outputpath)
meta['text_length'] = len(text)
meta['text_length_human'] = humanize_bytes(meta['text_length'])
except (TypeError, HTTPError, ValueError) as e:
print(u"Warning: unable to load text for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr)
with open(htmlpath, "w") as f:
html = padserver.getPadHTML(padid)
meta['html_path'] = os.path.relpath(htmlpath, outputpath)
meta['html_length'] = len(html)
if pad_template:
t = html5lib.parse(html, treebuilder="etree", namespaceHTMLElements=False)
body = t.find(".//body")
title = padid
editurl = padserver.getPadURL(padid, groupinfo)
meta['url'] = editurl
json_dump = json.dumps(meta)
f.write(pad_template.render(
body=content(body),
title=title,
editurl=editurl,
sourceurl=textpath,
metadata_json=json_dump).encode("utf-8")) # unicode error HERE!
else:
f.write(html.encode("utf-8"))
# except (TypeError, HTTPError, ValueError) as e:
# print(u"Warning: unable to load HTML for pad {0}, {1}".format(padid, e).encode("utf-8"), file=sys.stderr)
with open(metapath, "w") as f:
f.write(json.dumps(meta))
if sleeptime:
time.sleep(sleeptime)
if not verbose:
sys.stderr.write("\rDumping pads... [{0}] \n".format(numpads))
sys.stderr.flush()
def humanize_bytes(bytes, precision=0):
"""Return a humanized string representation of a number of bytes.
Assumes `from __future__ import division`.
>>> humanize_bytes(1)
'1 byte'
>>> humanize_bytes(1024)
'1.0 kB'
>>> humanize_bytes(1024*123)
'123.0 kB'
>>> humanize_bytes(1024*12342)
'12.1 MB'
>>> humanize_bytes(1024*12342,2)
'12.05 MB'
>>> humanize_bytes(1024*1234,2)
'1.21 MB'
>>> humanize_bytes(1024*1234*1111,2)
'1.31 GB'
>>> humanize_bytes(1024*1234*1111,1)
'1.3 GB'
"""
abbrevs = (
(1<<50L, 'Petabyte'),
(1<<40L, 'Tb'),
(1<<30L, 'Gb'),
(1<<20L, 'Mb'),
(1<<10L, 'kb'),
(1, 'bytes')
)
if bytes == 1:
return '1 byte'
for factor, suffix in abbrevs:
if bytes >= factor:
break
return '%.*f %s' % (precision, bytes / factor, suffix)
def padids_from_path (path):
from glob import glob
inputs = glob(os.path.join(path, "*.json"))
inputs.sort()
pads = []
for fp in inputs:
with open(fp) as f:
info = json.load(f)
info['path'] = fp
pads.append(info)
return pads
if __name__ == "__main__":
parser = argparse.ArgumentParser()
# command
parser.add_argument('command', default="", help='command to perform: listpads, listgroups, dump, createindex')
# padinfo
parser.add_argument('--padinfo', default="padinfo.json", help='path to JSON file containing all pad login data, default padinfo.json, alternatively specify hostname, port, etc as separate options')
parser.add_argument('--hostname', default="", help='the hostname of the etherpad server')
parser.add_argument('--port', type=int, help='port of etherpad server')
parser.add_argument('--apikey', help='API key')
parser.add_argument('--apiversion', help='the version of the etherpad api')
parser.add_argument('--apiurl', help='URL path to the API')
parser.add_argument('--verbose', action="store_true", default=False, help='debug mode, verbose output')
parser.add_argument('--outputpath', default=os.getcwd(), help='path for output, default is .')
parser.add_argument('--pubpath', default="pub", help='path to dump public pads')
parser.add_argument('--grouppath', default="priv", help='path to dump group pads')
parser.add_argument('--templates', default=os.path.join(os.getcwd(), "templates"), help='(additional) templates path, default: ./templates')
# listpads/groups-specific
parser.add_argument('--lines', default=False, action="store_true", help='(listpads/groups) output one per line instead of JSON')
# dump-specific
parser.add_argument('--force', default=False, action="store_true", help='(dump) force dump even if up to date')
parser.add_argument('--skip', default=None, type=int, help='(dump) skip this many (start at index)')
parser.add_argument('--limit', default=None, type=int, help='(dump) stop after limit items')
# index-specific
parser.add_argument('--title', default="etherpad index &amp; archive", help='(index) title')
parser.add_argument('--exclude-groups', default=False, action="store_true", help='(index) ignore groups')
parser.add_argument('--groupinfo', default=None, help='(index) groupinfo json file')
parser.add_argument('--output', default=None, help='(index) path for output (default stdout)')
parser.add_argument('--pad', default="start", help='(history) pad id')
parser.add_argument('--rev', default="", help='(history) revision id')
args = parser.parse_args()
verbose = args.verbose
padinfo = PADINFO_DEFAULTS
if args.padinfo:
try:
with open(args.padinfo) as f:
for key, value in json.load(f).items():
padinfo[key] = value
except IOError, e:
print("WARNING: Tried to open {0}, but couldn't ({1})".format(args.padinfo, e), file=sys.stderr)
except ValueError, e:
print("WARNING: Error reading {0}, check the JSON syntax ({1})".format(args.padinfo, e))
# allow explicit opts to override
if args.hostname:
padinfo['hostname'] = args.hostname
if args.port:
padinfo['port'] = args.port
if args.apikey:
padinfo['apikey'] = args.apikey
if args.apiversion:
padinfo['apiversion'] = args.apiversion
if args.apiurl:
padinfo['apiurl'] = args.apiurl
padserver = PadServer(
hostname=padinfo.get("hostname"),
port=padinfo.get("port"),
apipath=padinfo.get("apiurl"),
apiversion=padinfo.get("apiversion"),
apikey=padinfo.get("apikey")
)
print ("Connecting to {0}".format(padserver.apiurl), file=sys.stderr)
###############################
# Command Dispatch
###############################
cmd = args.command.lower()
if cmd == "listpads":
padids = padserver.listAllPads()
if not args.lines:
json.dump(padids, sys.stdout)
else:
for padid in padids:
print(padid)
elif cmd == "listgroups":
groupids = padserver.listAllGroups()
if not args.lines:
json.dump(groupids, sys.stdout)
else:
for gid in groupids:
print(gid)
elif cmd == "dump":
groupinfo = None
if args.groupinfo:
with open(args.groupinfo) as gif:
groupinfo = json.load(gif)
if verbose:
print ("Using groupinfo", file=sys.stderr)
start = time.time()
padids = padserver.listAllPads()
if args.skip:
padids = padids[args.skip:]
if args.limit:
padids = padids[:args.limit]
dumpPads(
padserver,
padids,
args.outputpath,
args.pubpath,
args.grouppath,
force=args.force,
templates=args.templates,
groupinfo=groupinfo)
if verbose:
print ("Completed in {0:0.0f} seconds".format(time.time()-start), file=sys.stderr)
elif cmd == "index":
def augment_info(info, groupinfo):
if info.get("last_edited"):
dt = datetime.strptime( info.get("last_edited"), "%Y-%m-%dT%H:%M:%S" )
info['last_edited_parsed'] = dt
info['last_edited_str'] = str(dt)
if groupinfo:
gid = info.get("group_id")
if gid.startswith("g."):
gid = gid[2:]
if gid in groupinfo:
info[u"group_name"] = groupinfo[gid].get("name")
# print (info, file=sys.stderr)
return info
def get_pads(groupinfo=None):
pads = padids_from_path(os.path.join(args.outputpath, args.pubpath))
pads = [augment_info(x, groupinfo) for x in pads]
# print (("padids_from_path", args.pubpath, pads), file=sys.stderr)
gp = os.path.join(args.outputpath, args.grouppath)
if not args.exclude_groups and gp:
groups = [os.path.join(gp, x) for x in os.listdir(gp)]
groups = [x for x in groups if os.path.isdir(x)]
groups.sort()
for gp in groups:
if groupinfo:
b = os.path.basename(gp)
if b not in groupinfo:
continue
try:
pad_infos = padids_from_path(gp)
pad_infos = [augment_info(x, groupinfo) for x in pad_infos]
pads.extend(pad_infos)
except OSError:
pass
return pads
groupinfo = None
if args.groupinfo:
with open(args.groupinfo) as gif:
groupinfo = json.load(gif)
if verbose:
print ("Using groupinfo", file=sys.stderr)
pads = get_pads(groupinfo)
padids = [(x.get("pad_name").lower(), x) for x in pads]
padids.sort()
pads = [x[1] for x in padids]
out = sys.stdout
if args.output:
out = open(os.path.join(args.outputpath, args.output), "w")
import jinja2
env = get_template_env(args.templates)
index_template = env.get_template("index.html")
out.write(index_template.render(
pads = pads,
title = args.title,
timestamp = datetime.now()
).encode("utf-8"))
if args.output:
out.close()
elif cmd == "revisions":
print (padserver.getRevisionsCount(args.pad))
elif cmd == "authors":
print (padserver.listAuthorsOfPad(args.pad))
elif cmd == "changeset":
print (padserver.getRevisionChangeset(args.pad, args.rev))
elif cmd == "history":
revs = padserver.getRevisionsCount(args.pad)
data = padserver.createDiffHTML(args.pad, 1, revs)
print (data['html'])
else:
print ("Command '{0}' not understood, try: listpads, listgroups, dump".format(args.command), file=sys.stderr)

padserver.py (deleted, 83 lines)

@@ -1,83 +0,0 @@
import json, re
from datetime import datetime
from urllib2 import urlopen, HTTPError, URLError
from urllib import urlencode
def pad_split_group (n):
    # split a group pad id "g.<groupid>$<padname>" into (groupid, padname);
    # used by getPadURL below (datetime above is used by getPadLastEdited)
    m = re.match(r"g\.(\w+)\$(.+)$", n)
    if m:
        return m.groups()
    return ('', n)
class PadServer (object):
def __init__ (self, hostname, port=9001, apipath="/api/", apiversion="1.2.9", apikey=None, secure=False):
self.hostname = hostname
if secure:
self.protocol = "https"
else:
self.protocol = "http"
self.apiurl = self.protocol+"://"+hostname
if port:
self.apiurl += ":{0}".format(port)
self.apiurl += "{0}{1}/".format(apipath, apiversion)
self.apikey = apikey
def listAllPads (self):
data = {'apikey': self.apikey}
url = self.apiurl+'listAllPads?'+urlencode(data)
return json.load(urlopen(url))['data']['padIDs']
def listAllGroups (self):
data = {'apikey': self.apikey}
url = self.apiurl+'listAllGroups?'+urlencode(data)
return json.load(urlopen(url))['data']['groupIDs']
def getPadText (self, padID):
data = {'apikey': self.apikey, 'padID': padID.encode("utf-8")}
return json.load(urlopen(self.apiurl+'getText?'+urlencode(data)))['data']['text']
def getPadHTML (self, padID):
data = {'apikey': self.apikey, 'padID': padID.encode("utf-8")}
return json.load(urlopen(self.apiurl+'getHTML?'+urlencode(data)))['data']['html']
def getPadLastEdited (self, padID):
data = {'apikey': self.apikey, 'padID': padID.encode("utf-8")}
raw = json.load(urlopen(self.apiurl+'getLastEdited?'+urlencode(data)))['data']['lastEdited']
try:
return datetime.fromtimestamp(int(raw)/1000)
except TypeError as e:
return None
def getPadURL (self, padID, groupinfo=None):
group, name = pad_split_group(padID)
if group:
gid = group
if gid.startswith("g."):
gid = gid[2:]
if groupinfo:
ginfo = groupinfo.get(gid)
if ginfo:
groupID = ginfo.get("id", 0)
else:
groupID = 0
else:
groupID = 0
return self.protocol+"://"+self.hostname+"/group.html/"+str(groupID)+"/pad.html/"+padID
else:
return self.protocol+"://"+self.hostname+"/public_pad/"+padID
def getRevisionsCount (self, padID):
data = {'apikey': self.apikey, 'padID': padID.encode("utf-8")}
raw = json.load(urlopen(self.apiurl+'getRevisionsCount?'+urlencode(data)))['data']['revisions']
return int(raw)
def listAuthorsOfPad (self, padID):
data = {'apikey': self.apikey, 'padID': padID.encode("utf-8")}
raw = json.load(urlopen(self.apiurl+'listAuthorsOfPad?'+urlencode(data)))['data']['authorIDs']
return raw
def getRevisionChangeset (self, padID, rev=None):
data = {'apikey': self.apikey, 'padID': padID.encode("utf-8"), 'rev': "{0}".format(rev)}
raw = json.load(urlopen(self.apiurl+'getRevisionChangeset?'+urlencode(data)))
return raw
def createDiffHTML (self, padID, startRev=0, endRev=None):
if endRev == None:
endRev = self.getRevisionsCount(padID)
data = {'apikey': self.apikey, 'padID': padID.encode("utf-8"), 'startRev': "{0}".format(startRev), 'endRev': "{0}".format(endRev)}
raw = json.load(urlopen(self.apiurl+'createDiffHTML?'+urlencode(data)))['data']
return raw
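
For reference, the removed PadServer class wrapped the same HTTP API behind plain methods. A short usage sketch (hostname, port and apikey are placeholders):

from __future__ import print_function
from padserver import PadServer

server = PadServer(hostname="localhost", port=9001, apikey="SECRET")
for padid in server.listAllPads():
    print(padid, server.getRevisionsCount(padid))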