"""
pull(meta):
    Update the local files of pads that have changed since the last sync.
    Changed pads are detected by comparing the pad's current revision count
    with the count stored in its existing PADID.meta.json.

TODO: use/prefer public interfaces? (export functions)
"""
from __future__ import print_function

from argparse import ArgumentParser
import io
import sys, json, re, os
from datetime import datetime

try:
    # python2
    from urllib2 import urlopen, URLError, HTTPError
    from urllib import urlencode, quote
except ImportError:
    # python3
    from urllib.parse import urlencode, quote
    from urllib.request import urlopen, URLError, HTTPError

from etherdump.commands.common import *
from time import sleep
from etherdump.commands.html5tidy import html5tidy
import html5lib
from xml.etree import ElementTree as ET
from fnmatch import fnmatch


def try_deleting(files):
    """Remove every path in *files*, ignoring paths that cannot be removed.

    Used to purge previously dumped files; failure to remove a path (for
    example because it was never written) is deliberately not an error.
    """
    for path in files:
        try:
            os.remove(path)
        except OSError:
            # Best effort: a missing or undeletable file must not abort the run.
            pass


def _build_parser():
    """Return the ArgumentParser describing the options of the pull command."""
    # The text is passed as ``description``: ArgumentParser's first positional
    # parameter is ``prog``, so passing it positionally garbled the usage line.
    p = ArgumentParser(description="Check for pads that have changed since last sync (according to .meta.json)")
    p.add_argument("padid", nargs="*", default=[])
    p.add_argument("--glob", default=False, help="download pads matching a glob pattern")
    p.add_argument("--padinfo", default=".etherdump/settings.json", help="settings, default: .etherdump/settings.json")
    p.add_argument("--zerorevs", default=False, action="store_true", help="include pads with zero revisions, default: False (i.e. pads with no revisions are skipped)")
    p.add_argument("--pub", default="p", help="folder to store files for public pads, default: p")
    p.add_argument("--group", default="g", help="folder to store files for group pads, default: g")
    p.add_argument("--skip", default=None, type=int, help="skip this many items, default: None")
    p.add_argument("--meta", default=False, action="store_true", help="download meta to PADID.meta.json, default: False")
    p.add_argument("--text", default=False, action="store_true", help="download text to PADID.txt, default: False")
    p.add_argument("--html", default=False, action="store_true", help="download html to PADID.html, default: False")
    p.add_argument("--dhtml", default=False, action="store_true", help="download dhtml to PADID.diff.html, default: False")
    p.add_argument("--all", default=False, action="store_true", help="download all files (meta, text, html, dhtml), default: False")
    p.add_argument("--folder", default=False, action="store_true", help="dump files in a folder named PADID (meta, text, html, dhtml), default: False")
    p.add_argument("--output", default=False, action="store_true", help="output changed padids on stdout")
    p.add_argument("--force", default=False, action="store_true", help="reload, even if revisions count matches previous")
    p.add_argument("--no-raw-ext", default=False, action="store_true", help="save plain text as padname with no (additional) extension")
    p.add_argument("--fix-names", default=False, action="store_true", help="normalize padid's (no spaces, special control chars) for use in file names")
    p.add_argument("--filter-ext", default=None, help="filter pads by extension")
    p.add_argument("--css", default="/styles.css", help="add css url to output pages, default: /styles.css")
    p.add_argument("--script", default="/versions.js", help="add script url to output pages, default: /versions.js")
    p.add_argument("--nopublish", default="__NOPUBLISH__", help="no publish magic word, default: __NOPUBLISH__")
    return p


def _api_url(info, method, data):
    """Return the local API URL for *method* with *data* as its query string."""
    return info['localapiurl'] + method + '?' + urlencode(data)


def _load_pad_meta(info, data, padid, pathbase, metapath, args):
    """Collect the metadata of one pad, or return None when it must be skipped.

    A pad is skipped when its revision count is unchanged (unless --force),
    when it has zero revisions (unless --zerorevs), after repeated HTTP
    failures, or when the API answer cannot be indexed (TypeError).

    NOTE(review): in the copy of this file that was reviewed, the text between
    the ``--skip`` comparison and the ``> 3`` retry check was missing (it looks
    like it was stripped as if it were an HTML tag). Everything in this helper
    above the ``except HTTPError`` clause is therefore a RECONSTRUCTION, based
    on the names the rest of main() relies on (data['padID'], the path base,
    metapath, meta, versions[0]["url"], the retry counter) and on the
    otherwise unused ``re`` and ``datetime`` imports. In particular the API
    method names, the "api/1.2.9/" pattern, the metadata keys and the use of
    splitpadname() are assumptions, and it is unknown whether --filter-ext was
    applied in the lost span. Confirm against version control before merging.
    """
    padurlbase = re.sub(r"api/1.2.9/$", "p/", info["apiurl"])
    meta = {}
    revisions = None
    tries = 1
    while True:
        try:
            if os.path.exists(metapath):
                with open(metapath) as f:
                    meta.update(json.load(f))
                revisions = getjson(_api_url(info, 'getRevisionsCount', data))['data']['revisions']
                if meta['revisions'] == revisions and not args.force:
                    return None

            meta['padid'] = padid
            # The first version entry is always the live pad itself; main()
            # links to it as versions[0]["url"].
            meta['versions'] = [{
                "url": padurlbase + quote(padid),
                "type": "pad",
                "code": 200,
            }]

            if revisions is None:
                meta['revisions'] = getjson(_api_url(info, 'getRevisionsCount', data))['data']['revisions']
            else:
                meta['revisions'] = revisions

            if meta['revisions'] == 0 and not args.zerorevs:
                return None

            meta['group'], meta['pad'] = splitpadname(padid)
            meta['pathbase'] = pathbase
            meta['lastedited_raw'] = int(getjson(_api_url(info, 'getLastEdited', data))['data']['lastEdited'])
            meta['lastedited_iso'] = datetime.fromtimestamp(int(meta['lastedited_raw'])/1000).isoformat()
            meta['author_ids'] = getjson(_api_url(info, 'listAuthorsOfPad', data))['data']['authorIDs']
            return meta
        except HTTPError:
            tries += 1
            if tries > 3:
                print ("Too many failures ({0}), skipping".format(padid), file=sys.stderr)
                return None
            sleep(3)
        except TypeError:
            print ("Type Error loading pad {0} (phantom pad?), skipping".format(padid), file=sys.stderr)
            return None


def _write_tidy_html(html, path, padid, args, links):
    """Parse *html*, tidy it with html5tidy and write the result to *path*."""
    doc = html5lib.parse(html, treebuilder="etree", namespaceHTMLElements=False)
    html5tidy(doc, indent=True, title=padid, scripts=args.script, links=links)
    # Explicit UTF-8 so the output does not depend on the locale's encoding.
    with io.open(path, "w", encoding="utf-8") as f:
        f.write(ET.tostring(doc, method="html", encoding="unicode") + "\n")


def main(args):
    """Run the pull command.

    args -- list of command line arguments (without the program name).

    For every selected pad whose metadata could be loaded, the requested
    representations (text, html, diff html, meta) are downloaded next to each
    other; the number of pads handled is reported on stderr.
    """
    args = _build_parser().parse_args(args)

    raw_ext = "" if args.no_raw_ext else ".raw.txt"

    info = loadpadinfo(args.padinfo)
    data = {}
    data['apikey'] = info['apikey']

    if args.padid:
        padids = args.padid
    else:
        padids = getjson(_api_url(info, 'listAllPads', data))['data']['padIDs']
        if args.glob:
            padids = [x for x in padids if fnmatch(x, args.glob)]

    padids.sort()
    numpads = len(padids)
    count = 0

    for i, padid in enumerate(padids):
        # NOTE(review): the lines from here down to the _load_pad_meta() call
        # stand in for text that was missing from the reviewed copy; see the
        # note in _load_pad_meta(). padpath() and progressbar() are presumed
        # to come from etherdump.commands.common -- confirm.
        if args.skip is not None and i < args.skip:
            continue
        progressbar(i, numpads, padid)

        data['padID'] = padid
        pathbase = padpath(padid, args.pub, args.group, args.fix_names)
        if args.folder:
            pathbase = os.path.join(pathbase, padid)
        metapath = pathbase + ".meta.json"

        meta = _load_pad_meta(info, data, padid, pathbase, metapath, args)
        if meta is None:
            continue
        versions = meta['versions']

        count += 1

        if args.output:
            print (padid)

        if args.all or (args.meta or args.text or args.html or args.dhtml):
            try:
                os.makedirs(os.path.split(metapath)[0])
            except OSError:
                # Typically: the folder already exists.
                pass

        if args.all or args.text:
            response = getjson(_api_url(info, 'getText', data))
            ver = {"type": "text"}
            versions.append(ver)
            ver["code"] = response["_code"]
            if response["_code"] == 200:
                text = response['data']['text']

                # Enforce the no-publish magic word: purge anything already
                # dumped for this pad and write nothing new.
                if args.nopublish and args.nopublish in text:
                    try_deleting((pathbase+raw_ext, pathbase+".raw.html", pathbase+".diff.html", pathbase+".meta.json"))
                    continue

                ver["path"] = pathbase+raw_ext
                ver["url"] = quote(ver["path"])
                # Explicit UTF-8 so pads with non-ASCII text can be written
                # whatever the locale's default encoding is.
                with io.open(ver["path"], "w", encoding="utf-8") as f:
                    f.write(text)
                # TODO: once the content is settled, compute a hash
                # and link it in the metadata!

        links = []
        if args.css:
            links.append({"href":args.css, "rel":"stylesheet"})
        # TODO: make this process reflect which files actually were made
        versionbaseurl = quote(padid)
        links.append({"href":versions[0]["url"], "rel":"alternate", "type":"text/html", "title":"Etherpad"})
        if args.all or args.text:
            links.append({"href":versionbaseurl+raw_ext, "rel":"alternate", "type":"text/plain", "title":"Plain text"})
        if args.all or args.html:
            links.append({"href":versionbaseurl+".raw.html", "rel":"alternate", "type":"text/html", "title":"HTML"})
        if args.all or args.dhtml:
            links.append({"href":versionbaseurl+".diff.html", "rel":"alternate", "type":"text/html", "title":"HTML with author colors"})
        if args.all or args.meta:
            links.append({"href":versionbaseurl+".meta.json", "rel":"alternate", "type":"application/json", "title":"Meta data"})

        if args.all or args.dhtml:
            # Set on the shared query dict, exactly as before: it is therefore
            # also sent along with every API call made after this point.
            data['startRev'] = "0"
            response = getjson(_api_url(info, 'createDiffHTML', data))
            ver = {"type": "diffhtml"}
            versions.append(ver)
            ver["code"] = response["_code"]
            if response["_code"] == 200:
                try:
                    html = response['data']['html']
                    ver["path"] = pathbase+".diff.html"
                    ver["url"] = quote(ver["path"])
                    _write_tidy_html(html, ver["path"], padid, args, links)
                except TypeError:
                    # Malformed / incomplete response: record the message (such
                    # as "internal error") in the metadata and write NO file.
                    # The message is read from the response dict itself; the
                    # old code read it from a name that could already have been
                    # rebound to the html string, raising a second TypeError.
                    ver["message"] = response.get("message")

        if args.all or args.html:
            response = getjson(_api_url(info, 'getHTML', data))
            ver = {"type": "html"}
            versions.append(ver)
            ver["code"] = response["_code"]
            if response["_code"] == 200:
                html = response['data']['html']
                ver["path"] = pathbase+".raw.html"
                ver["url"] = quote(ver["path"])
                _write_tidy_html(html, ver["path"], padid, args, links)

        # output meta
        if args.all or args.meta:
            ver = {"type": "meta"}
            versions.append(ver)
            ver["path"] = metapath
            ver["url"] = quote(metapath)
            with open(metapath, "w") as f:
                json.dump(meta, f, indent=2)

    print("\n{0} pad(s) loaded".format(count), file=sys.stderr)