diff --git a/README.md b/README.md
index c96a707..31c5e24 100644
--- a/README.md
+++ b/README.md
@@ -30,6 +30,14 @@ After installing etherpump on the Varia server, we collectively decided to not w
 Change log / notes
 ==================
 
+**January 2020**
+
+Added experimental [Trio](https://trio.readthedocs.io) support for the `pull`
+command, which enables pads to be processed concurrently. The `--connection`
+option controls how many connections run at once and defaults to 5; raising
+it may overpower the target server, so keep it low if in doubt. This
+functionality is experimental, so please be cautious and report bugs!
+
 **October 2019**
 
 Improve `etherpump --help` handling to make it easier for new users.
diff --git a/etherpump/commands/common.py b/etherpump/commands/common.py
index f5b6de9..83a2fae 100644
--- a/etherpump/commands/common.py
+++ b/etherpump/commands/common.py
@@ -7,6 +7,8 @@ from time import sleep
 from urllib.parse import quote_plus, unquote_plus
 from urllib.request import HTTPError, urlopen
 
+import trio
+
 groupnamepat = re.compile(r"^g\.(\w+)\$")
 
 
@@ -73,6 +75,27 @@ def getjson(url, max_retry=3, retry_sleep_time=3):
     return ret
 
 
+async def agetjson(session, url):
+    """The asynchronous version of getjson."""
+    RETRY = 20
+    TIMEOUT = 10
+
+    ret = {}
+    ret["_retries"] = 0
+
+    try:
+        response = await session.get(url, timeout=TIMEOUT, retries=RETRY)
+        rurl = response.url
+        ret.update(response.json())
+        ret["_code"] = response.status_code
+        if rurl != url:
+            ret["_url"] = rurl
+        return ret
+    except Exception as e:
+        print('Failed to download {}, saw {}'.format(url, str(e)))
+        return
+
+
 def loadpadinfo(p):
     with open(p) as f:
         info = json.load(f)
@@ -81,7 +104,10 @@ def loadpadinfo(p):
     return info
 
 
-# Python developer Fredrik Lundh (author of elementtree, among other things) has such a function on his website, which works with decimal, hex and named entities:
+# Python developer Fredrik Lundh (author of elementtree, among other things)
+# has such a function on his website, which works with decimal, hex and named
+# entities:
+
 ##
 # Removes HTML or XML character references and entities from a text string.
# @@ -112,3 +138,8 @@ def unescape(text): def istty(): return sys.stdout.isatty() and os.environ.get('TERM') != 'dumb' + + +def chunks(lst, n): + for i in range(0, len(lst), n): + yield lst[i : i + n] diff --git a/etherpump/commands/pull.py b/etherpump/commands/pull.py index 235ccfd..72dcdd6 100644 --- a/etherpump/commands/pull.py +++ b/etherpump/commands/pull.py @@ -1,18 +1,20 @@ """Check for pads that have changed since last sync (according to .meta.json)""" import json +import math import os import re import sys from argparse import ArgumentParser from datetime import datetime from fnmatch import fnmatch -from time import sleep from urllib.parse import quote, urlencode from urllib.request import HTTPError from xml.etree import ElementTree as ET +import asks import html5lib +import trio from tqdm import tqdm from etherpump.commands.common import * # noqa @@ -36,410 +38,362 @@ def try_deleting(files): pass -def main(args): - p = ArgumentParser( +def build_argument_parser(args): + parser = ArgumentParser( "Check for pads that have changed since last sync (according to .meta.json)" ) - - p.add_argument("padid", nargs="*", default=[]) - p.add_argument( + parser.add_argument("padid", nargs="*", default=[]) + parser.add_argument( "--glob", default=False, help="download pads matching a glob pattern" ) - - p.add_argument( + parser.add_argument( "--padinfo", default=".etherpump/settings.json", help="settings, default: .etherpump/settings.json", ) - p.add_argument( + parser.add_argument( "--zerorevs", default=False, action="store_true", help="include pads with zero revisions, default: False (i.e. pads with no revisions are skipped)", ) - p.add_argument( + parser.add_argument( "--pub", default="p", help="folder to store files for public pads, default: p", ) - p.add_argument( + parser.add_argument( "--group", default="g", help="folder to store files for group pads, default: g", ) - p.add_argument( + parser.add_argument( "--skip", default=None, type=int, help="skip this many items, default: None", ) - p.add_argument( + parser.add_argument( + "--connection", + default=5, + type=int, + help="number of connections to run concurrently", + ) + parser.add_argument( "--meta", default=False, action="store_true", help="download meta to PADID.meta.json, default: False", ) - p.add_argument( + parser.add_argument( "--text", default=False, action="store_true", help="download text to PADID.txt, default: False", ) - p.add_argument( + parser.add_argument( "--html", default=False, action="store_true", help="download html to PADID.html, default: False", ) - p.add_argument( + parser.add_argument( "--dhtml", default=False, action="store_true", help="download dhtml to PADID.diff.html, default: False", ) - p.add_argument( + parser.add_argument( "--all", default=False, action="store_true", help="download all files (meta, text, html, dhtml), default: False", ) - p.add_argument( + parser.add_argument( "--folder", default=False, action="store_true", help="dump files in a folder named PADID (meta, text, html, dhtml), default: False", ) - p.add_argument( + parser.add_argument( "--output", default=False, action="store_true", help="output changed padids on stdout", ) - p.add_argument( + parser.add_argument( "--force", default=False, action="store_true", help="reload, even if revisions count matches previous", ) - p.add_argument( + parser.add_argument( "--no-raw-ext", default=False, action="store_true", help="save plain text as padname with no (additional) extension", ) - p.add_argument( + parser.add_argument( "--fix-names", 
default=False, action="store_true", help="normalize padid's (no spaces, special control chars) for use in file names", ) - - p.add_argument( + parser.add_argument( "--filter-ext", default=None, help="filter pads by extension" ) - - p.add_argument( + parser.add_argument( "--css", default="/styles.css", help="add css url to output pages, default: /styles.css", ) - p.add_argument( + parser.add_argument( "--script", default="/versions.js", help="add script url to output pages, default: /versions.js", ) - - p.add_argument( + parser.add_argument( "--nopublish", default="__NOPUBLISH__", help="no publish magic word, default: __NOPUBLISH__", ) - p.add_argument( + parser.add_argument( "--publish", default="__PUBLISH__", help="the publish magic word, default: __PUBLISH__", ) - p.add_argument( + parser.add_argument( "--publish-opt-in", default=False, action="store_true", help="ensure `--publish` is honoured instead of `--nopublish`", ) + return parser - args = p.parse_args(args) - - raw_ext = ".raw.txt" - if args.no_raw_ext: - raw_ext = "" - - info = loadpadinfo(args.padinfo) - data = {} - data['apikey'] = info['apikey'] +async def get_padids(args, info, data, session): if args.padid: padids = args.padid elif args.glob: - padids = getjson( - info['localapiurl'] + 'listAllPads?' + urlencode(data) - )['data']['padIDs'] + url = info['localapiurl'] + 'listAllPads?' + urlencode(data) + padids = await agetjson(session, url) + padids = padids['data']['padIDs'] padids = [x for x in padids if fnmatch(x, args.glob)] else: - padids = getjson( - info['localapiurl'] + 'listAllPads?' + urlencode(data) - )['data']['padIDs'] + url = info['localapiurl'] + 'listAllPads?' + urlencode(data) + padids = await agetjson(session, url) + padids = padids['data']['padIDs'] + padids.sort() - numpads = len(padids) - # maxmsglen = 0 - count = 0 + return padids - progress_kwargs = {} - if not istty(): - progress_kwargs.update(dict(disable=True)) - progress_pads = tqdm(iterable=padids, total=len(padids), **progress_kwargs) - for i, padid in enumerate(progress_pads): - if args.skip != None and i < args.skip: - continue +async def handle_pad(args, padid, data, info, session): + raw_ext = ".raw.txt" + if args.no_raw_ext: + raw_ext = "" - data['padID'] = padid - p = padpath(padid, args.pub, args.group, args.fix_names) - if args.folder: - p = os.path.join(p, padid) + data['padID'] = padid + p = padpath(padid, args.pub, args.group, args.fix_names) + if args.folder: + p = os.path.join(p, padid) - metapath = p + ".meta.json" - revisions = None - tries = 1 - skip = False - padurlbase = re.sub(r"api/1.2.9/$", "p/", info["apiurl"]) - meta = {} + metapath = p + ".meta.json" + revisions = None + tries = 1 + skip = False + padurlbase = re.sub(r"api/1.2.9/$", "p/", info["apiurl"]) + meta = {} - while True: - try: - if os.path.exists(metapath): - with open(metapath) as f: - meta.update(json.load(f)) - revisions = getjson( - info['localapiurl'] - + 'getRevisionsCount?' - + urlencode(data) - )['data']['revisions'] - if meta['revisions'] == revisions and not args.force: - skip = True - break - - meta['padid'] = padid - versions = meta["versions"] = [] - versions.append( - { - "url": padurlbase + quote(padid), - "type": "pad", - "code": 200, - } + while True: + try: + if os.path.exists(metapath): + with open(metapath) as f: + meta.update(json.load(f)) + url = ( + info['localapiurl'] + 'getRevisionsCount?' + urlencode(data) ) - - if revisions == None: - meta['revisions'] = getjson( - info['localapiurl'] - + 'getRevisionsCount?' 
- + urlencode(data) - )['data']['revisions'] - else: - meta['revisions'] = revisions - - if (meta['revisions'] == 0) and (not args.zerorevs): + response = await agetjson(session, url) + revisions = response['data']['revisions'] + if meta['revisions'] == revisions and not args.force: skip = True break - # todo: load more metadata! - meta['group'], meta['pad'] = splitpadname(padid) - meta['pathbase'] = p - meta['lastedited_raw'] = int( - getjson( - info['localapiurl'] + 'getLastEdited?' + urlencode(data) - )['data']['lastEdited'] + meta['padid'] = padid + versions = meta["versions"] = [] + versions.append( + {"url": padurlbase + quote(padid), "type": "pad", "code": 200,} + ) + + if revisions is None: + url = ( + info['localapiurl'] + 'getRevisionsCount?' + urlencode(data) ) - meta['lastedited_iso'] = datetime.fromtimestamp( - int(meta['lastedited_raw']) / 1000 - ).isoformat() - meta['author_ids'] = getjson( - info['localapiurl'] + 'listAuthorsOfPad?' + urlencode(data) - )['data']['authorIDs'] + response = await agetjson(session, url) + meta['revisions'] = response['data']['revisions'] + else: + meta['revisions'] = revisions + + if (meta['revisions'] == 0) and (not args.zerorevs): + skip = True break - except HTTPError as e: - tries += 1 - if tries > 3: - print( - "Too many failures ({0}), skipping".format(padid), - file=sys.stderr, - ) - skip = True - break - else: - sleep(3) - except TypeError as e: + + # todo: load more metadata! + meta['group'], meta['pad'] = splitpadname(padid) + meta['pathbase'] = p + + url = info['localapiurl'] + 'getLastEdited?' + urlencode(data) + response = await agetjson(session, url) + meta['lastedited_raw'] = int(response['data']['lastEdited']) + + meta['lastedited_iso'] = datetime.fromtimestamp( + int(meta['lastedited_raw']) / 1000 + ).isoformat() + + url = info['localapiurl'] + 'listAuthorsOfPad?' + urlencode(data) + response = await agetjson(session, url) + meta['author_ids'] = response['data']['authorIDs'] + + break + except HTTPError as e: + tries += 1 + if tries > 3: print( - "Type Error loading pad {0} (phantom pad?), skipping".format( - padid - ), + "Too many failures ({0}), skipping".format(padid), file=sys.stderr, ) skip = True break + else: + await trio.sleep(3) + except TypeError as e: + print( + "Type Error loading pad {0} (phantom pad?), skipping".format( + padid + ), + file=sys.stderr, + ) + skip = True + break - if skip: - continue + if skip: + return - count += 1 + if args.output: + print(padid) - if args.output: - print(padid) + if args.all or (args.meta or args.text or args.html or args.dhtml): + try: + os.makedirs(os.path.split(metapath)[0]) + except OSError: + pass - if args.all or (args.meta or args.text or args.html or args.dhtml): - try: - os.makedirs(os.path.split(metapath)[0]) - except OSError: - pass - - if args.all or args.text: - text = getjson(info['localapiurl'] + 'getText?' + urlencode(data)) - ver = {"type": "text"} - versions.append(ver) - ver["code"] = text["_code"] - if text["_code"] == 200: - text = text['data']['text'] - - ########################################## - ## ENFORCE __NOPUBLISH__ MAGIC WORD - ########################################## - if args.nopublish and args.nopublish in text: - # NEED TO PURGE ANY EXISTING DOCS - try_deleting( - ( - p + raw_ext, - p + ".raw.html", - p + ".diff.html", - p + ".meta.json", - ) + if args.all or args.text: + url = info['localapiurl'] + 'getText?' 
+ urlencode(data) + text = await agetjson(session, url) + ver = {"type": "text"} + versions.append(ver) + ver["code"] = text["_code"] + if text["_code"] == 200: + text = text['data']['text'] + + ########################################## + ## ENFORCE __NOPUBLISH__ MAGIC WORD + ########################################## + if args.nopublish and args.nopublish in text: + # NEED TO PURGE ANY EXISTING DOCS + try_deleting( + ( + p + raw_ext, + p + ".raw.html", + p + ".diff.html", + p + ".meta.json", ) - continue - - ########################################## - ## ENFORCE __PUBLISH__ MAGIC WORD - ########################################## - if args.publish_opt_in and args.publish not in text: - try_deleting( - ( - p + raw_ext, - p + ".raw.html", - p + ".diff.html", - p + ".meta.json", - ) + ) + return + + ########################################## + ## ENFORCE __PUBLISH__ MAGIC WORD + ########################################## + if args.publish_opt_in and args.publish not in text: + try_deleting( + ( + p + raw_ext, + p + ".raw.html", + p + ".diff.html", + p + ".meta.json", ) - continue - - ver["path"] = p + raw_ext - ver["url"] = quote(ver["path"]) - with open(ver["path"], "w") as f: - f.write(text) - # once the content is settled, compute a hash - # and link it in the metadata! - - links = [] - if args.css: - links.append({"href": args.css, "rel": "stylesheet"}) - # todo, make this process reflect which files actually were made - versionbaseurl = quote(padid) + ) + return + + ver["path"] = p + raw_ext + ver["url"] = quote(ver["path"]) + with open(ver["path"], "w") as f: + f.write(text) + # once the content is settled, compute a hash + # and link it in the metadata! + + links = [] + if args.css: + links.append({"href": args.css, "rel": "stylesheet"}) + # todo, make this process reflect which files actually were made + versionbaseurl = quote(padid) + links.append( + { + "href": versions[0]["url"], + "rel": "alternate", + "type": "text/html", + "title": "Etherpad", + } + ) + if args.all or args.text: + links.append( + { + "href": versionbaseurl + raw_ext, + "rel": "alternate", + "type": "text/plain", + "title": "Plain text", + } + ) + if args.all or args.html: links.append( { - "href": versions[0]["url"], + "href": versionbaseurl + ".raw.html", "rel": "alternate", "type": "text/html", - "title": "Etherpad", + "title": "HTML", + } + ) + if args.all or args.dhtml: + links.append( + { + "href": versionbaseurl + ".diff.html", + "rel": "alternate", + "type": "text/html", + "title": "HTML with author colors", + } + ) + if args.all or args.meta: + links.append( + { + "href": versionbaseurl + ".meta.json", + "rel": "alternate", + "type": "application/json", + "title": "Meta data", } ) - if args.all or args.text: - links.append( - { - "href": versionbaseurl + raw_ext, - "rel": "alternate", - "type": "text/plain", - "title": "Plain text", - } - ) - if args.all or args.html: - links.append( - { - "href": versionbaseurl + ".raw.html", - "rel": "alternate", - "type": "text/html", - "title": "HTML", - } - ) - if args.all or args.dhtml: - links.append( - { - "href": versionbaseurl + ".diff.html", - "rel": "alternate", - "type": "text/html", - "title": "HTML with author colors", - } - ) - if args.all or args.meta: - links.append( - { - "href": versionbaseurl + ".meta.json", - "rel": "alternate", - "type": "application/json", - "title": "Meta data", - } - ) - - # links.append({"href":"/", "rel":"search", "type":"text/html", "title":"Index"}) - if args.all or args.dhtml: - data['startRev'] = "0" - html = getjson( - 
info['localapiurl'] + 'createDiffHTML?' + urlencode(data) - ) - ver = {"type": "diffhtml"} - versions.append(ver) - ver["code"] = html["_code"] - if html["_code"] == 200: - try: - html = html['data']['html'] - ver["path"] = p + ".diff.html" - ver["url"] = quote(ver["path"]) - # doc = html5lib.parse(html, treebuilder="etree", override_encoding="utf-8", namespaceHTMLElements=False) - doc = html5lib.parse( - html, treebuilder="etree", namespaceHTMLElements=False - ) - html5tidy( - doc, - indent=True, - title=padid, - scripts=args.script, - links=links, - ) - with open(ver["path"], "w") as f: - print( - ET.tostring(doc, method="html", encoding="unicode"), - file=f, - ) - except TypeError: - # Malformed / incomplete response, record the message (such as "internal error") in the metadata and write NO file! - ver["message"] = html["message"] - # with open(ver["path"], "w") as f: - # print ("""
<pre>{0}</pre>
""".format(json.dumps(html, indent=2)), file=f) - - # Process text, html, dhtml, all options - if args.all or args.html: - html = getjson(info['localapiurl'] + 'getHTML?' + urlencode(data)) - ver = {"type": "html"} - versions.append(ver) - ver["code"] = html["_code"] - if html["_code"] == 200: + if args.all or args.dhtml: + data['startRev'] = "0" + url = info['localapiurl'] + 'createDiffHTML?' + urlencode(data) + html = await agetjson(session, url) + ver = {"type": "diffhtml"} + versions.append(ver) + ver["code"] = html["_code"] + if html["_code"] == 200: + try: html = html['data']['html'] - ver["path"] = p + ".raw.html" + ver["path"] = p + ".diff.html" ver["url"] = quote(ver["path"]) doc = html5lib.parse( html, treebuilder="etree", namespaceHTMLElements=False @@ -456,12 +410,68 @@ def main(args): ET.tostring(doc, method="html", encoding="unicode"), file=f, ) + except TypeError: + ver["message"] = html["message"] + + # Process text, html, dhtml, all options + if args.all or args.html: + url = info['localapiurl'] + 'getHTML?' + urlencode(data) + html = await agetjson(session, url) + ver = {"type": "html"} + versions.append(ver) + ver["code"] = html["_code"] + if html["_code"] == 200: + html = html['data']['html'] + ver["path"] = p + ".raw.html" + ver["url"] = quote(ver["path"]) + doc = html5lib.parse( + html, treebuilder="etree", namespaceHTMLElements=False + ) + html5tidy( + doc, indent=True, title=padid, scripts=args.script, links=links, + ) + with open(ver["path"], "w") as f: + print( + ET.tostring(doc, method="html", encoding="unicode"), file=f, + ) + + # output meta + if args.all or args.meta: + ver = {"type": "meta"} + versions.append(ver) + ver["path"] = metapath + ver["url"] = quote(metapath) + with open(metapath, "w") as f: + json.dump(meta, f, indent=2) - # output meta - if args.all or args.meta: - ver = {"type": "meta"} - versions.append(ver) - ver["path"] = metapath - ver["url"] = quote(metapath) - with open(metapath, "w") as f: - json.dump(meta, f, indent=2) + +async def handle_pad_chunk(args, padids, data, info, session): + progress_kwargs = {} + if not istty(): + progress_kwargs.update(dict(disable=True)) + + padids = tqdm(iterable=padids, total=len(padids), **progress_kwargs,) + for padid in padids: + await handle_pad(args, padid, data, info, session) + + +async def handle_pads(args): + session = asks.Session(connections=args.connection) + info = loadpadinfo(args.padinfo) + data = {'apikey': info['apikey']} + + padids = await get_padids(args, info, data, session) + if args.skip: + padids = padids[args.skip : len(padids)] + CHUNK_SIZE = math.ceil(len(padids) / 3) + + async with trio.open_nursery() as nursery: + for padids in chunks(padids, CHUNK_SIZE): + _args = (args, padids, data, info, session) + nursery.start_soon(handle_pad_chunk, *_args) + + +def main(args): + p = build_argument_parser(args) + args = p.parse_args(args) + trio.run(handle_pads, args) diff --git a/setup.py b/setup.py index fec08ae..e7aec8b 100644 --- a/setup.py +++ b/setup.py @@ -43,13 +43,16 @@ setup( long_description=long_description, long_description_content_type='text/markdown', install_requires=[ + "asks", "html5lib", "jinja2", - "python-dateutil", "pypandoc", - "tqdm", + "python-dateutil", "requests", + "tqdm", + "trio", ], + python_requires=">=3.5", classifiers=[ 'Programming Language :: Python :: 3', 'Environment :: Console',