Browse Source

continued tweaks

add-quote-import
Michael Murtaugh 9 years ago
parent
commit
0f340433ae
  1. 456
      etherdump/commands/dump.py
  2. 1
      etherdump/data/templates/pad_colors.html
  3. 6
      padinfo.sample.json

456
etherdump/commands/dump.py

@ -1,6 +1,11 @@
#!/usr/bin/env python #!/usr/bin/env python
# License: AGPL # License: AGPL
# #
#
# todo:
# Capture exceptions... add HTTP status errors (502) to meta!!
# so that an eventual index can show the problematic pages!
# Also: provide links to text only / html versions when diff HTML fails
from __future__ import print_function from __future__ import print_function
from etherdump import DATAPATH from etherdump import DATAPATH
@ -12,6 +17,7 @@ from datetime import datetime
from xml.etree import cElementTree as ET from xml.etree import cElementTree as ET
from urllib import urlencode from urllib import urlencode
from urllib2 import urlopen, HTTPError, URLError from urllib2 import urlopen, HTTPError, URLError
from time import sleep
# external dependencies (use pip to install these) # external dependencies (use pip to install these)
import html5lib, jinja2 import html5lib, jinja2
@ -102,6 +108,11 @@ def get_template_env (tpath=None):
env = jinja2.Environment(loader=loader) env = jinja2.Environment(loader=loader)
return env return env
def get_group_info(gid, info):
    """Return the settings entry for group ``gid`` from ``info``.

    ``info`` is tested with ``'groups' in info`` and then indexed, so it is
    expected to be a mapping that may hold a ``'groups'`` sub-mapping keyed
    by group id.

    Returns ``info['groups'][gid]`` when both the ``'groups'`` entry and the
    ``gid`` key are present; otherwise returns ``None`` so callers can test
    the result for truthiness.
    """
    if 'groups' not in info:
        return None
    groups = info['groups']
    if gid not in groups:
        return None
    return groups[gid]
def main(args): def main(args):
p = ArgumentParser(""" p = ArgumentParser("""
_ _ _ _ _ _
@ -117,7 +128,6 @@ def main(args):
p.add_argument("--verbose", default=False, action="store_true", help="flag for verbose output") p.add_argument("--verbose", default=False, action="store_true", help="flag for verbose output")
p.add_argument("--limit", type=int, default=None) p.add_argument("--limit", type=int, default=None)
p.add_argument("--allpads", default=False, action="store_true", help="flag to process all pads") p.add_argument("--allpads", default=False, action="store_true", help="flag to process all pads")
p.add_argument("--spider", default=False, action="store_true", help="flag to spider pads")
p.add_argument("--templatepath", default=os.path.join(DATAPATH, "templates"), help="directory with templates (override default files)") p.add_argument("--templatepath", default=os.path.join(DATAPATH, "templates"), help="directory with templates (override default files)")
p.add_argument("--colors-template", default="pad_colors.html", help="pad with authorship colors template name: pad_colors.html") p.add_argument("--colors-template", default="pad_colors.html", help="pad with authorship colors template name: pad_colors.html")
p.add_argument("--padlink", default=[], action="append", help="give a pad link pattern, example: 'http\:\/\/10\.1\.10\.1/p/(.*)'") p.add_argument("--padlink", default=[], action="append", help="give a pad link pattern, example: 'http\:\/\/10\.1\.10\.1/p/(.*)'")
@ -126,8 +136,11 @@ def main(args):
p.add_argument("--showurls", default=False, action="store_true", help="flag to display API URLs that are used (to stderr)") p.add_argument("--showurls", default=False, action="store_true", help="flag to display API URLs that are used (to stderr)")
p.add_argument("--hidepaths", default=False, action="store_true", help="flag to not display paths") p.add_argument("--hidepaths", default=False, action="store_true", help="flag to not display paths")
p.add_argument("--pretend", default=False, action="store_true", help="flag to not actually save") p.add_argument("--pretend", default=False, action="store_true", help="flag to not actually save")
p.add_argument("--linkify", default=False, action="store_true", help="flag to process [[link]] forms (and follow when --spider is used)")
p.add_argument("--spider", default=False, action="store_true", help="flag to spider pads (requires --linkify)")
p.add_argument("--add-images", default=False, action="store_true", help="flag to add image tags") p.add_argument("--add-images", default=False, action="store_true", help="flag to add image tags")
p.add_argument("--authors-css", default="authors.css", help="filename to save collected authorship css (nb: etherdump will overwrite this file!)") p.add_argument("--force", default=False, action="store_true", help="force dump (even if not updated since last dump)")
p.add_argument("--authors-css", default=None, help="filename to save collected authorship css (nb: any existing file will be mercilessly overwritten), default: don't accumulate css")
# TODO css from pad --- ie specify a padid for a stylesheet!!!!!! # TODO css from pad --- ie specify a padid for a stylesheet!!!!!!
# p.add_argument("--css", default="styles.css", help="padid of stylesheet") # p.add_argument("--css", default="styles.css", help="padid of stylesheet")
@ -175,205 +188,270 @@ def main(args):
done.add(padid) done.add(padid)
data['padID'] = padid.encode("utf-8") data['padID'] = padid.encode("utf-8")
if args.verbose: if args.verbose:
print ("PADID \"{0}\"".format(padid).encode("utf-8"), file=sys.stderr) print (u"PADID \"{0}\"".format(padid).encode("utf-8"), file=sys.stderr)
# g.yIRLMysh0PMsCMHc$
grouppat = re.compile(ur"^g\.(\w+)\$(.+)$")
m = grouppat.search(padid)
if m:
group = m.group(1)
ginfo = get_group_info(group, info)
if not ginfo:
print ("No info for group '{0}', skipping".format(group), file=sys.stderr)
continue
padid = m.group(2)
else:
group = None
ginfo = None
if not args.pretend: if not args.pretend:
try: try:
os.makedirs(args.path) if ginfo:
os.makedirs(os.path.join(args.path, ginfo['name']))
else:
os.makedirs(args.path)
except OSError: except OSError:
pass pass
try: retry = True
tries = 1
# _ while retry:
# _ __ ___ ___| |_ __ _ retry = False
# | '_ ` _ \ / _ \ __/ _` | try:
# | | | | | | __/ || (_| |
# |_| |_| |_|\___|\__\__,_|
meta_url = urlify(padid, ext=".json")
meta_out = "{0}/{1}".format(args.path, meta_url.encode("utf-8"))
raw_url = urlify(padid, ext=".txt")
raw_out = "{0}/{1}".format(args.path, raw_url.encode("utf-8"))
colors_url = urlify(padid, ext=".html")
colors_out = "{0}/{1}".format(args.path, colors_url.encode("utf-8"))
if not args.hidepaths:
print (meta_out, file=sys.stderr)
if not args.pretend:
meta = {}
meta['padid'] = padid
revisions_url = apiurl+'getRevisionsCount?'+urlencode(data)
if args.showurls:
print (revisions_url, file=sys.stderr)
meta['total_revisions'] = json.load(urlopen(revisions_url))['data']['revisions']
lastedited_url = apiurl+'getLastEdited?'+urlencode(data) # _
if args.showurls: # _ __ ___ ___| |_ __ _
print (lastedited_url, file=sys.stderr) # | '_ ` _ \ / _ \ __/ _` |
lastedited_raw = json.load(urlopen(lastedited_url))['data']['lastEdited'] # | | | | | | __/ || (_| |
meta['lastedited_raw'] = lastedited_raw # |_| |_| |_|\___|\__\__,_|
meta['lastedited'] = datetime.fromtimestamp(int(lastedited_raw)/1000).isoformat()
# author_ids (unfortunately, this is a list of internal etherpad author ids -- not the names ppl type) meta_url = urlify(padid, ext=".json")
authors_url = apiurl+'listAuthorsOfPad?'+urlencode(data) raw_url = urlify(padid, ext=".txt")
if args.showurls: colors_url = urlify(padid, ext=".html")
print (authors_url, file=sys.stderr)
meta['author_ids'] = json.load(urlopen(authors_url))['data']['authorIDs'] if ginfo:
meta['colors'] = colors_url meta_out = "{0}/{1}/{2}".format(args.path, ginfo['name'], meta_url.encode("utf-8"))
meta['raw'] = raw_url raw_out = "{0}/{1}/{2}".format(args.path, ginfo['name'], raw_url.encode("utf-8"))
meta['meta'] = meta_url colors_out = "{0}/{1}/{2}".format(args.path, ginfo['name'], colors_url.encode("utf-8"))
with open(meta_out, "w") as f: else:
json.dump(meta, f) meta_out = "{0}/{1}".format(args.path, meta_url.encode("utf-8"))
raw_out = "{0}/{1}".format(args.path, raw_url.encode("utf-8"))
# _ __ __ ___ __ colors_out = "{0}/{1}".format(args.path, colors_url.encode("utf-8"))
# | '__/ _` \ \ /\ / /
# | | | (_| |\ V V / if not args.pretend:
# |_| \__,_| \_/\_/ meta = {}
meta['padid'] = padid
if not args.hidepaths: revisions_url = apiurl+'getRevisionsCount?'+urlencode(data)
print (raw_out, file=sys.stderr) if args.showurls:
text_url = apiurl+"getText?"+urlencode(data) print (revisions_url, file=sys.stderr)
if args.showurls: meta['total_revisions'] = json.load(urlopen(revisions_url))['data']['revisions']
print (text_url, file=sys.stderr)
if not args.pretend: # CHECK REVISIONS (against existing meta)
rawText = json.load(urlopen(text_url))['data']['text'] if meta['total_revisions'] == 0:
with open(raw_out, "w") as f: if args.verbose:
f.write(rawText.encode("utf-8")) print (" pad has no revisions, skipping", file=sys.stderr)
continue
# _ _ _ if os.path.exists(meta_out):
# | |__ | |_ _ __ ___ | | with open(meta_out) as f:
# | '_ \| __| '_ ` _ \| | old_meta = json.load(f)
# | | | | |_| | | | | | | if not args.force and old_meta['total_revisions'] == meta['total_revisions']:
# |_| |_|\__|_| |_| |_|_|
# todo ? -- regular HTML output
# _
# ___ ___ | | ___ _ __ ___
# / __/ _ \| |/ _ \| '__/ __|
# | (_| (_) | | (_) | | \__ \
# \___\___/|_|\___/|_| |___/
if not args.hidepaths:
print (colors_out, file=sys.stderr)
data['startRev'] = "0"
colors_url = apiurl+'createDiffHTML?'+urlencode(data)
if args.showurls:
print (colors_url, file=sys.stderr)
html = json.load(urlopen(colors_url))['data']['html']
t = html5lib.parse(html, namespaceHTMLElements=False)
trim_removed_spans(t)
html = ET.tostring(t, method="html")
# Stage 1: Process as text
# Process [[wikilink]] style links
# and (optionally) add linked page names to spider todo list
html, links = linkify(html)
if args.spider:
for l in links:
if l not in todo and l not in done:
if l.startswith("http://") or l.startswith("https://"):
if args.verbose: if args.verbose:
print ("Ignoring absolute URL in [[ link ]] form", file=sys.stderr) print (" skipping (up to date)", file=sys.stderr)
continue continue
# if args.verbose:
# print (" link: {0}".format(l), file=sys.stderr) lastedited_url = apiurl+'getLastEdited?'+urlencode(data)
todo.append(l) if args.showurls:
print (lastedited_url, file=sys.stderr)
# Stage 2: Process as ElementTree lastedited_raw = json.load(urlopen(lastedited_url))['data']['lastEdited']
# meta['lastedited_raw'] = lastedited_raw
t = html5lib.parse(html, namespaceHTMLElements=False) meta['lastedited'] = datetime.fromtimestamp(int(lastedited_raw)/1000).isoformat()
# apply linkpats
for a in t.findall(".//a"): # author_ids (unfortunately, this is a list of internal etherpad author ids -- not the names ppl type)
href = a.attrib.get("href") authors_url = apiurl+'listAuthorsOfPad?'+urlencode(data)
original_href = href if args.showurls:
if href: print (authors_url, file=sys.stderr)
# if args.verbose: meta['author_ids'] = json.load(urlopen(authors_url))['data']['authorIDs']
# print ("searching for PADLINK: {0}".format(href)) meta['colors'] = colors_url
for pat in padlinkpats: meta['raw'] = raw_url
if re.search(pat, href) != None: meta['meta'] = meta_url
# if args.verbose: # defer output to LAST STEP (as confirmation)
# print (" found PADLINK: {0}".format(href))
href = re.sub(pat, "\\1.html", href) # _ __ __ ___ __
padid = filename_to_padid(href) # | '__/ _` \ \ /\ / /
set_text_contents(a, "[[{0}]]".format(padid)) # | | | (_| |\ V V /
if padid not in todo and padid not in done: # |_| \__,_| \_/\_/
if args.verbose:
print (" link: {0}".format(padid), file=sys.stderr) text_url = apiurl+"getText?"+urlencode(data)
todo.append(padid) if args.showurls:
# apply linkpats print (text_url, file=sys.stderr)
for s, r in linkpats: if not args.pretend:
href = re.sub(s, r, href) rawText = json.load(urlopen(text_url))['data']['text']
if href != original_href: if rawText.strip() == "":
old_contents = text_contents(a)
# print ("OLD_CONTENTS {0}".format(old_contents))
if old_contents == original_href:
if args.verbose:
print (" Updating href IN TEXT", file=sys.stderr)
set_text_contents(a, href)
if original_href != href:
if args.verbose: if args.verbose:
print (" Changed href from {0} to {1}".format(original_href, href), file=sys.stderr) print (" empty text, skipping", file=sys.stderr)
a.attrib['href'] = href continue
if not args.hidepaths:
# SHOWIMAGES : inject img tag for (local) images print (raw_out, file=sys.stderr)
if args.add_images: with open(raw_out, "w") as f:
ext = os.path.splitext(href)[1].lower().lstrip(".") f.write(rawText.encode("utf-8"))
if ext in ("png", "gif", "jpeg", "jpg"):
# ap = _parent(a) # _ _ _
print ("Adding img '{0}'".format(href), file=sys.stderr) # | |__ | |_ _ __ ___ | |
img = ET.SubElement(a, "img") # | '_ \| __| '_ ` _ \| |
br = ET.SubElement(a, "br") # | | | | |_| | | | | | |
a.remove(img); a.insert(0, img) # |_| |_|\__|_| |_| |_|_|
a.remove(br); a.insert(1, br)
img.attrib['src'] = href # todo ? -- regular HTML output
# extract the style tag (with authorship colors) # _
style = t.find(".//style") # ___ ___ | | ___ _ __ ___
if style != None: # / __/ _ \| |/ _ \| '__/ __|
if args.authors_css: # | (_| (_) | | (_) | | \__ \
for i in style.text.splitlines(): # \___\___/|_|\___/|_| |___/
if len(i):
selector, rule = i.split(' ',1) if not args.hidepaths:
authors_css_rules[selector] = rule print (colors_out, file=sys.stderr)
style = '' # strip the individual style tag from each page (only exports to authors-css file) data['startRev'] = "0"
# nb: it's up to the template to refer to the authors-css file colors_url = apiurl+'createDiffHTML?'+urlencode(data)
if args.showurls:
print (colors_url, file=sys.stderr)
html = json.load(urlopen(colors_url))['data']['html']
t = html5lib.parse(html, namespaceHTMLElements=False)
trim_removed_spans(t)
html = ET.tostring(t, method="html")
# Stage 1: Process as text
# Process [[wikilink]] style links
# and (optionally) add linked page names to spider todo list
if args.linkify:
html, links = linkify(html)
if args.spider:
for l in links:
if l not in todo and l not in done:
if l.startswith("http://") or l.startswith("https://"):
if args.verbose:
print ("Ignoring absolute URL in [[ link ]] form", file=sys.stderr)
continue
# if args.verbose:
# print (" link: {0}".format(l), file=sys.stderr)
todo.append(l)
# Stage 2: Process as ElementTree
#
t = html5lib.parse(html, namespaceHTMLElements=False)
# apply linkpats
for a in t.findall(".//a"):
href = a.attrib.get("href")
original_href = href
if href:
# if args.verbose:
# print ("searching for PADLINK: {0}".format(href))
for pat in padlinkpats:
if re.search(pat, href) != None:
# if args.verbose:
# print (" found PADLINK: {0}".format(href))
href = re.sub(pat, "\\1.html", href)
padid = filename_to_padid(href)
set_text_contents(a, "[[{0}]]".format(padid))
if padid not in todo and padid not in done:
if args.verbose:
print (" link: {0}".format(padid), file=sys.stderr)
todo.append(padid)
# apply linkpats
for s, r in linkpats:
href = re.sub(s, r, href)
if href != original_href:
old_contents = text_contents(a)
# print ("OLD_CONTENTS {0}".format(old_contents))
if old_contents == original_href:
if args.verbose:
print (" Updating href IN TEXT", file=sys.stderr)
set_text_contents(a, href)
if original_href != href:
if args.verbose:
print (" Changed href from {0} to {1}".format(original_href, href), file=sys.stderr)
a.attrib['href'] = href
# SHOWIMAGES : inject img tag for (local) images
if args.add_images:
ext = os.path.splitext(href)[1].lower().lstrip(".")
if ext in ("png", "gif", "jpeg", "jpg"):
# ap = _parent(a)
print ("Adding img '{0}'".format(href), file=sys.stderr)
img = ET.SubElement(a, "img")
br = ET.SubElement(a, "br")
a.remove(img); a.insert(0, img)
a.remove(br); a.insert(1, br)
img.attrib['src'] = href
# extract the style tag (with authorship colors)
style = t.find(".//style")
if style != None:
if args.authors_css:
for i in style.text.splitlines():
if len(i):
selector, rule = i.split(' ',1)
authors_css_rules[selector] = rule
# replace individual style with a ref to the authors-css
style = '<link rel="stylesheet" type="text/css" href="{0}">'.format(args.authors_css)
else:
style = ET.tostring(style, method="html")
else: else:
style = ET.tostring(style, method="html") style = ""
else: # and extract the contents of the body
style = "" html = contents(t.find(".//body"))
# and extract the contents of the body
html = contents(t.find(".//body")) if not args.pretend:
with open(colors_out, "w") as f:
if not args.pretend: # f.write(html.encode("utf-8"))
with open(colors_out, "w") as f: f.write(colors_template.render(
# f.write(html.encode("utf-8")) html = html,
f.write(colors_template.render( style = style,
html = html, revision = meta['total_revisions'],
style = style, padid = padid,
revision = meta['total_revisions'], timestamp = datetime.now(),
padid = padid, meta_url = meta_url,
timestamp = datetime.now(), raw_url = raw_url,
meta_url = meta_url, colors_url = colors_url,
raw_url = raw_url, lastedited = meta['lastedited']
colors_url = colors_url, ).encode("utf-8"))
lastedited = meta['lastedited']
).encode("utf-8")) # OUTPUT METADATA (finally)
if not args.hidepaths:
# _ print (meta_out, file=sys.stderr)
# | | ___ ___ _ __ with open(meta_out, "w") as f:
# | |/ _ \ / _ \| '_ \ json.dump(meta, f)
# | | (_) | (_) | |_) | # _
# |_|\___/ \___/| .__/ # | | ___ ___ _ __
# |_| # | |/ _ \ / _ \| '_ \
# | | (_) | (_) | |_) |
count += 1 # |_|\___/ \___/| .__/
if args.limit and count >= args.limit: # |_|
break
except TypeError: count += 1
print ("ERROR, skipping!", file=sys.stderr) if args.limit and count >= args.limit:
break
# except HTTPError as e:
# retry = True
# except TypeError as e:
# print ("TypeError, skipping!", file=sys.stderr)
except Exception as e:
print ("[{0}] Exception: {1}".format(tries, e), file=sys.stderr)
sleep(3)
retry = True
if retry:
tries += 1
if tries > 5:
print (" GIVING UP", file=sys.stderr)
retry = False
# Write the unified CSS with authors # Write the unified CSS with authors
if args.authors_css: if args.authors_css:

1
etherdump/data/templates/pad_colors.html

@ -5,7 +5,6 @@
<meta charset="utf-8"> <meta charset="utf-8">
<meta revision="{{revision}}"> <meta revision="{{revision}}">
<link rel="stylesheet" type="text/css" href="pad.css"> <link rel="stylesheet" type="text/css" href="pad.css">
<link rel="stylesheet" type="text/css" href="authors.css">
{{ style }} {{ style }}
</head> </head>
<body> <body>

6
padinfo.sample.json

@ -4,5 +4,9 @@
"hostname": "localhost", "hostname": "localhost",
"apiversion": "1.2.9", "apiversion": "1.2.9",
"apiurl": "/api/", "apiurl": "/api/",
"apikey": "8f55f9ede1b3f5d88b3c54eb638225a7bb71c64867786b608abacfdb7d418be1" "apikey": "8f55f9ede1b3f5d88b3c54eb638225a7bb71c64867786b608abacfdb7d418be1",
"groups": {
"71FpVh4MZBvl8VZ6": {"name": "Transmediale", "id": 43},
"HyYfoX3Q6S5utxs5": {"name": "test", "id": 42 }
}
} }

Loading…
Cancel
Save