@@ -1,18 +1,20 @@
 """ Check for pads that have changed since last sync (according to .meta.json) """
 
 import json
+import math
 import os
 import re
 import sys
 from argparse import ArgumentParser
 from datetime import datetime
 from fnmatch import fnmatch
-from time import sleep
 from urllib.parse import quote, urlencode
 from urllib.request import HTTPError
 from xml.etree import ElementTree as ET
 
+import asks
 import html5lib
+import trio
 from tqdm import tqdm
 
 from etherpump.commands.common import *  # noqa
@@ -36,410 +38,362 @@ def try_deleting(files):
             pass
 
 
-def main(args):
-    p = ArgumentParser(
+def build_argument_parser(args):
+    parser = ArgumentParser(
         "Check for pads that have changed since last sync (according to .meta.json)"
     )
-    p.add_argument("padid", nargs="*", default=[])
-    p.add_argument(
+    parser.add_argument("padid", nargs="*", default=[])
+    parser.add_argument(
         "--glob", default=False, help="download pads matching a glob pattern"
     )
-    p.add_argument(
+    parser.add_argument(
         "--padinfo",
         default=".etherpump/settings.json",
         help="settings, default: .etherpump/settings.json",
     )
-    p.add_argument(
+    parser.add_argument(
         "--zerorevs",
         default=False,
         action="store_true",
         help="include pads with zero revisions, default: False (i.e. pads with no revisions are skipped)",
     )
-    p.add_argument(
+    parser.add_argument(
         "--pub",
         default="p",
         help="folder to store files for public pads, default: p",
     )
-    p.add_argument(
+    parser.add_argument(
         "--group",
         default="g",
         help="folder to store files for group pads, default: g",
     )
-    p.add_argument(
+    parser.add_argument(
         "--skip",
         default=None,
         type=int,
         help="skip this many items, default: None",
     )
-    p.add_argument(
+    parser.add_argument(
+        "--connection",
+        default=5,
+        type=int,
+        help="number of connections to run concurrently",
+    )
+    parser.add_argument(
         "--meta",
         default=False,
         action="store_true",
         help="download meta to PADID.meta.json, default: False",
     )
-    p.add_argument(
+    parser.add_argument(
         "--text",
         default=False,
         action="store_true",
         help="download text to PADID.txt, default: False",
     )
-    p.add_argument(
+    parser.add_argument(
         "--html",
         default=False,
         action="store_true",
         help="download html to PADID.html, default: False",
     )
-    p.add_argument(
+    parser.add_argument(
         "--dhtml",
         default=False,
         action="store_true",
         help="download dhtml to PADID.diff.html, default: False",
     )
-    p.add_argument(
+    parser.add_argument(
         "--all",
         default=False,
         action="store_true",
         help="download all files (meta, text, html, dhtml), default: False",
     )
-    p.add_argument(
+    parser.add_argument(
         "--folder",
         default=False,
         action="store_true",
         help="dump files in a folder named PADID (meta, text, html, dhtml), default: False",
     )
-    p.add_argument(
+    parser.add_argument(
         "--output",
         default=False,
         action="store_true",
         help="output changed padids on stdout",
     )
-    p.add_argument(
+    parser.add_argument(
         "--force",
         default=False,
         action="store_true",
         help="reload, even if revisions count matches previous",
     )
-    p.add_argument(
+    parser.add_argument(
         "--no-raw-ext",
         default=False,
         action="store_true",
         help="save plain text as padname with no (additional) extension",
     )
-    p.add_argument(
+    parser.add_argument(
         "--fix-names",
         default=False,
         action="store_true",
         help="normalize padid's (no spaces, special control chars) for use in file names",
     )
-    p.add_argument(
+    parser.add_argument(
         "--filter-ext", default=None, help="filter pads by extension"
     )
-    p.add_argument(
+    parser.add_argument(
         "--css",
         default="/styles.css",
         help="add css url to output pages, default: /styles.css",
     )
-    p.add_argument(
+    parser.add_argument(
         "--script",
         default="/versions.js",
         help="add script url to output pages, default: /versions.js",
     )
-    p.add_argument(
+    parser.add_argument(
         "--nopublish",
         default="__NOPUBLISH__",
         help="no publish magic word, default: __NOPUBLISH__",
     )
-    p.add_argument(
+    parser.add_argument(
         "--publish",
         default="__PUBLISH__",
         help="the publish magic word, default: __PUBLISH__",
     )
-    p.add_argument(
+    parser.add_argument(
         "--publish-opt-in",
         default=False,
        action="store_true",
         help="ensure `--publish` is honoured instead of `--nopublish`",
     )
+    return parser
 
-    args = p.parse_args(args)
-    raw_ext = ".raw.txt"
-    if args.no_raw_ext:
-        raw_ext = ""
-    info = loadpadinfo(args.padinfo)
-    data = {}
-    data['apikey'] = info['apikey']
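+# Resolve which pads to pull: an explicit padid list, a --glob filter, or
+# every pad on the server. agetjson is presumably the async counterpart of
+# getjson, coming in via the star import from etherpump.commands.common.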
+async def get_padids(args, info, data, session):
     if args.padid:
         padids = args.padid
     elif args.glob:
-        padids = getjson(
-            info['localapiurl'] + 'listAllPads?' + urlencode(data)
-        )['data']['padIDs']
+        url = info['localapiurl'] + 'listAllPads?' + urlencode(data)
+        padids = await agetjson(session, url)
+        padids = padids['data']['padIDs']
         padids = [x for x in padids if fnmatch(x, args.glob)]
     else:
-        padids = getjson(
-            info['localapiurl'] + 'listAllPads?' + urlencode(data)
-        )['data']['padIDs']
+        url = info['localapiurl'] + 'listAllPads?' + urlencode(data)
+        padids = await agetjson(session, url)
+        padids = padids['data']['padIDs']
 
     padids.sort()
-    numpads = len(padids)
-    # maxmsglen = 0
-    count = 0
+    return padids
 
-    progress_kwargs = {}
-    if not istty():
-        progress_kwargs.update(dict(disable=True))
-    progress_pads = tqdm(iterable=padids, total=len(padids), **progress_kwargs)
-    for i, padid in enumerate(progress_pads):
-        if args.skip != None and i < args.skip:
-            continue
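+# Fetch and write a single pad (text, html, diff-html, meta); this carries
+# over the body of the old per-pad for-loop, so one task handles one pad.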
+async def handle_pad(args, padid, data, info, session):
+    raw_ext = ".raw.txt"
+    if args.no_raw_ext:
+        raw_ext = ""
 
-        data['padID'] = padid
-        p = padpath(padid, args.pub, args.group, args.fix_names)
-        if args.folder:
-            p = os.path.join(p, padid)
+    data['padID'] = padid
+    p = padpath(padid, args.pub, args.group, args.fix_names)
+    if args.folder:
+        p = os.path.join(p, padid)
 
-        metapath = p + ".meta.json"
-        revisions = None
-        tries = 1
-        skip = False
-        padurlbase = re.sub(r"api/1.2.9/$", "p/", info["apiurl"])
-        meta = {}
+    metapath = p + ".meta.json"
+    revisions = None
+    tries = 1
+    skip = False
+    padurlbase = re.sub(r"api/1.2.9/$", "p/", info["apiurl"])
+    meta = {}
 
-        while True:
-            try:
-                if os.path.exists(metapath):
-                    with open(metapath) as f:
-                        meta.update(json.load(f))
-                    revisions = getjson(
-                        info['localapiurl']
-                        + 'getRevisionsCount?'
-                        + urlencode(data)
-                    )['data']['revisions']
-                    if meta['revisions'] == revisions and not args.force:
-                        skip = True
-                        break
-                meta['padid'] = padid
-                versions = meta["versions"] = []
-                versions.append(
-                    {
-                        "url": padurlbase + quote(padid),
-                        "type": "pad",
-                        "code": 200,
-                    }
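+    # Retry loop: metadata requests are retried on HTTPError, up to 3 tries
+    # with a 3-second sleep between attempts, before the pad is skipped.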
+    while True:
+        try:
+            if os.path.exists(metapath):
+                with open(metapath) as f:
+                    meta.update(json.load(f))
+                url = (
+                    info['localapiurl'] + 'getRevisionsCount?' + urlencode(data)
                 )
-                if revisions == None:
-                    meta['revisions'] = getjson(
-                        info['localapiurl']
-                        + 'getRevisionsCount?'
-                        + urlencode(data)
-                    )['data']['revisions']
-                else:
-                    meta['revisions'] = revisions
-                if (meta['revisions'] == 0) and (not args.zerorevs):
+                response = await agetjson(session, url)
+                revisions = response['data']['revisions']
+                if meta['revisions'] == revisions and not args.force:
+                    skip = True
+                    break
 
-                # todo: load more metadata!
-                meta['group'], meta['pad'] = splitpadname(padid)
-                meta['pathbase'] = p
-                meta['lastedited_raw'] = int(
-                    getjson(
-                        info['localapiurl'] + 'getLastEdited?' + urlencode(data)
-                    )['data']['lastEdited']
+            meta['padid'] = padid
+            versions = meta["versions"] = []
+            versions.append(
+                {"url": padurlbase + quote(padid), "type": "pad", "code": 200,}
             )
 
+            if revisions is None:
+                url = (
+                    info['localapiurl'] + 'getRevisionsCount?' + urlencode(data)
+                )
-                meta['lastedited_iso'] = datetime.fromtimestamp(
-                    int(meta['lastedited_raw']) / 1000
-                ).isoformat()
-                meta['author_ids'] = getjson(
-                    info['localapiurl'] + 'listAuthorsOfPad?' + urlencode(data)
-                )['data']['authorIDs']
+                response = await agetjson(session, url)
+                meta['revisions'] = response['data']['revisions']
+            else:
+                meta['revisions'] = revisions
+
+            if (meta['revisions'] == 0) and (not args.zerorevs):
+                skip = True
+                break
-            except HTTPError as e:
-                tries += 1
-                if tries > 3:
-                    print(
-                        "Too many failures ({0}), skipping".format(padid),
-                        file=sys.stderr,
-                    )
-                    skip = True
-                    break
-                else:
-                    sleep(3)
-            except TypeError as e:
+            # todo: load more metadata!
+            meta['group'], meta['pad'] = splitpadname(padid)
+            meta['pathbase'] = p
+            url = info['localapiurl'] + 'getLastEdited?' + urlencode(data)
+            response = await agetjson(session, url)
+            meta['lastedited_raw'] = int(response['data']['lastEdited'])
+            meta['lastedited_iso'] = datetime.fromtimestamp(
+                int(meta['lastedited_raw']) / 1000
+            ).isoformat()
+            url = info['localapiurl'] + 'listAuthorsOfPad?' + urlencode(data)
+            response = await agetjson(session, url)
+            meta['author_ids'] = response['data']['authorIDs']
+            break
+        except HTTPError as e:
+            tries += 1
+            if tries > 3:
                 print(
-                    "Type Error loading pad {0} (phantom pad?), skipping".format(
-                        padid
-                    ),
+                    "Too many failures ({0}), skipping".format(padid),
                     file=sys.stderr,
                 )
                 skip = True
                 break
+            else:
+                await trio.sleep(3)
+        except TypeError as e:
+            print(
+                "Type Error loading pad {0} (phantom pad?), skipping".format(
+                    padid
+                ),
+                file=sys.stderr,
+            )
+            skip = True
+            break
 
-        if skip:
-            continue
+    if skip:
+        return
 
-        count += 1
-        if args.output:
-            print(padid)
+    if args.output:
+        print(padid)
 
-        if args.all or (args.meta or args.text or args.html or args.dhtml):
-            try:
-                os.makedirs(os.path.split(metapath)[0])
-            except OSError:
-                pass
+    if args.all or (args.meta or args.text or args.html or args.dhtml):
+        try:
+            os.makedirs(os.path.split(metapath)[0])
+        except OSError:
+            pass
 
-        if args.all or args.text:
-            text = getjson(info['localapiurl'] + 'getText?' + urlencode(data))
-            ver = {"type": "text"}
-            versions.append(ver)
-            ver["code"] = text["_code"]
-            if text["_code"] == 200:
-                text = text['data']['text']
-
-                ##########################################
-                ## ENFORCE __NOPUBLISH__ MAGIC WORD
-                ##########################################
-                if args.nopublish and args.nopublish in text:
-                    # NEED TO PURGE ANY EXISTING DOCS
-                    try_deleting(
-                        (
-                            p + raw_ext,
-                            p + ".raw.html",
-                            p + ".diff.html",
-                            p + ".meta.json",
-                        )
+    if args.all or args.text:
+        url = info['localapiurl'] + 'getText?' + urlencode(data)
+        text = await agetjson(session, url)
+        ver = {"type": "text"}
+        versions.append(ver)
+        ver["code"] = text["_code"]
+        if text["_code"] == 200:
+            text = text['data']['text']
+
+            ##########################################
+            ## ENFORCE __NOPUBLISH__ MAGIC WORD
+            ##########################################
+            if args.nopublish and args.nopublish in text:
+                # NEED TO PURGE ANY EXISTING DOCS
+                try_deleting(
+                    (
+                        p + raw_ext,
+                        p + ".raw.html",
+                        p + ".diff.html",
+                        p + ".meta.json",
+                    )
-                    continue
-
-                ##########################################
-                ## ENFORCE __PUBLISH__ MAGIC WORD
-                ##########################################
-                if args.publish_opt_in and args.publish not in text:
-                    try_deleting(
-                        (
-                            p + raw_ext,
-                            p + ".raw.html",
-                            p + ".diff.html",
-                            p + ".meta.json",
-                        )
-                    )
+                )
+                return
+
+            ##########################################
+            ## ENFORCE __PUBLISH__ MAGIC WORD
+            ##########################################
+            if args.publish_opt_in and args.publish not in text:
+                try_deleting(
+                    (
+                        p + raw_ext,
+                        p + ".raw.html",
+                        p + ".diff.html",
+                        p + ".meta.json",
+                    )
-                    continue
-
-                ver["path"] = p + raw_ext
-                ver["url"] = quote(ver["path"])
-                with open(ver["path"], "w") as f:
-                    f.write(text)
-                # once the content is settled, compute a hash
-                # and link it in the metadata!
-        links = []
-        if args.css:
-            links.append({"href": args.css, "rel": "stylesheet"})
-        # todo, make this process reflect which files actually were made
-        versionbaseurl = quote(padid)
+                )
+                return
ver [ " path " ] = p + raw_ext
ver [ " url " ] = quote ( ver [ " path " ] )
with open ( ver [ " path " ] , " w " ) as f :
f . write ( text )
# once the content is settled, compute a hash
# and link it in the metadata!
links = [ ]
if args . css :
links . append ( { " href " : args . css , " rel " : " stylesheet " } )
# todo, make this process reflect which files actually were made
versionbaseurl = quote ( padid )
links . append (
{
" href " : versions [ 0 ] [ " url " ] ,
" rel " : " alternate " ,
" type " : " text/html " ,
" title " : " Etherpad " ,
}
)
+    if args.all or args.text:
+        links.append(
+            {
+                "href": versionbaseurl + raw_ext,
+                "rel": "alternate",
+                "type": "text/plain",
+                "title": "Plain text",
+            }
+        )
+    if args.all or args.html:
         links.append(
             {
-                "href": versions[0]["url"],
+                "href": versionbaseurl + ".raw.html",
                 "rel": "alternate",
                 "type": "text/html",
-                "title": "Etherpad",
+                "title": "HTML",
             }
         )
+    if args.all or args.dhtml:
+        links.append(
+            {
+                "href": versionbaseurl + ".diff.html",
+                "rel": "alternate",
+                "type": "text/html",
+                "title": "HTML with author colors",
+            }
+        )
+    if args.all or args.meta:
+        links.append(
+            {
+                "href": versionbaseurl + ".meta.json",
+                "rel": "alternate",
+                "type": "application/json",
+                "title": "Meta data",
+            }
+        )
-        if args.all or args.text:
-            links.append(
-                {
-                    "href": versionbaseurl + raw_ext,
-                    "rel": "alternate",
-                    "type": "text/plain",
-                    "title": "Plain text",
-                }
-            )
-        if args.all or args.html:
-            links.append(
-                {
-                    "href": versionbaseurl + ".raw.html",
-                    "rel": "alternate",
-                    "type": "text/html",
-                    "title": "HTML",
-                }
-            )
-        if args.all or args.dhtml:
-            links.append(
-                {
-                    "href": versionbaseurl + ".diff.html",
-                    "rel": "alternate",
-                    "type": "text/html",
-                    "title": "HTML with author colors",
-                }
-            )
-        if args.all or args.meta:
-            links.append(
-                {
-                    "href": versionbaseurl + ".meta.json",
-                    "rel": "alternate",
-                    "type": "application/json",
-                    "title": "Meta data",
-                }
-            )
 
     # links.append({"href":"/", "rel":"search", "type":"text/html", "title":"Index"})
 
-        if args.all or args.dhtml:
-            data['startRev'] = "0"
-            html = getjson(
-                info['localapiurl'] + 'createDiffHTML?' + urlencode(data)
-            )
-            ver = {"type": "diffhtml"}
-            versions.append(ver)
-            ver["code"] = html["_code"]
-            if html["_code"] == 200:
-                try:
-                    html = html['data']['html']
-                    ver["path"] = p + ".diff.html"
-                    ver["url"] = quote(ver["path"])
-                    # doc = html5lib.parse(html, treebuilder="etree", override_encoding="utf-8", namespaceHTMLElements=False)
-                    doc = html5lib.parse(
-                        html, treebuilder="etree", namespaceHTMLElements=False
-                    )
-                    html5tidy(
-                        doc,
-                        indent=True,
-                        title=padid,
-                        scripts=args.script,
-                        links=links,
-                    )
-                    with open(ver["path"], "w") as f:
-                        print(
-                            ET.tostring(doc, method="html", encoding="unicode"),
-                            file=f,
-                        )
-                except TypeError:
-                    # Malformed / incomplete response, record the message (such as "internal error") in the metadata and write NO file!
-                    ver["message"] = html["message"]
-                    # with open(ver["path"], "w") as f:
-                    #     print ("""<pre>{0}</pre>""".format(json.dumps(html, indent=2)), file=f)
-
-        # Process text, html, dhtml, all options
-        if args.all or args.html:
-            html = getjson(info['localapiurl'] + 'getHTML?' + urlencode(data))
-            ver = {"type": "html"}
-            versions.append(ver)
-            ver["code"] = html["_code"]
-            if html["_code"] == 200:
+    if args.all or args.dhtml:
+        data['startRev'] = "0"
+        url = info['localapiurl'] + 'createDiffHTML?' + urlencode(data)
+        html = await agetjson(session, url)
+        ver = {"type": "diffhtml"}
+        versions.append(ver)
+        ver["code"] = html["_code"]
+        if html["_code"] == 200:
+            try:
+                html = html['data']['html']
-                ver["path"] = p + ".raw.html"
+                ver["path"] = p + ".diff.html"
                 ver["url"] = quote(ver["path"])
                 doc = html5lib.parse(
                     html, treebuilder="etree", namespaceHTMLElements=False
@@ -456,12 +410,68 @@ def main(args):
                     ET.tostring(doc, method="html", encoding="unicode"),
                     file=f,
                 )
+            except TypeError:
+                ver["message"] = html["message"]
 
+    # Process text, html, dhtml, all options
+    if args.all or args.html:
+        url = info['localapiurl'] + 'getHTML?' + urlencode(data)
+        html = await agetjson(session, url)
+        ver = {"type": "html"}
+        versions.append(ver)
+        ver["code"] = html["_code"]
+        if html["_code"] == 200:
+            html = html['data']['html']
+            ver["path"] = p + ".raw.html"
+            ver["url"] = quote(ver["path"])
+            doc = html5lib.parse(
+                html, treebuilder="etree", namespaceHTMLElements=False
+            )
+            html5tidy(
+                doc, indent=True, title=padid, scripts=args.script, links=links,
+            )
+            with open(ver["path"], "w") as f:
+                print(
+                    ET.tostring(doc, method="html", encoding="unicode"), file=f,
+                )
 
-        # output meta
-        if args.all or args.meta:
-            ver = {"type": "meta"}
-            versions.append(ver)
-            ver["path"] = metapath
-            ver["url"] = quote(metapath)
-            with open(metapath, "w") as f:
-                json.dump(meta, f, indent=2)
+    # output meta
+    if args.all or args.meta:
+        ver = {"type": "meta"}
+        versions.append(ver)
+        ver["path"] = metapath
+        ver["url"] = quote(metapath)
+        with open(metapath, "w") as f:
+            json.dump(meta, f, indent=2)
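+
+
+# Process one chunk of pad ids sequentially, with a tqdm progress bar that is
+# disabled when stdout is not a terminal.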
+async def handle_pad_chunk(args, padids, data, info, session):
+    progress_kwargs = {}
+    if not istty():
+        progress_kwargs.update(dict(disable=True))
+    padids = tqdm(iterable=padids, total=len(padids), **progress_kwargs,)
+    for padid in padids:
+        await handle_pad(args, padid, data, info, session)
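+
+
+# Top-level async entry point: one shared asks session whose connection pool
+# is capped by --connection, with the pad list split into three chunks that
+# are processed concurrently.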
+async def handle_pads(args):
+    session = asks.Session(connections=args.connection)
+    info = loadpadinfo(args.padinfo)
+    data = {'apikey': info['apikey']}
+    padids = await get_padids(args, info, data, session)
+    if args.skip:
+        padids = padids[args.skip:len(padids)]
+    CHUNK_SIZE = math.ceil(len(padids) / 3)
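+    # chunks() (presumably from the common star import) yields successive
+    # CHUNK_SIZE slices; the nursery waits for every chunk task to finish.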
+    async with trio.open_nursery() as nursery:
+        for padids in chunks(padids, CHUNK_SIZE):
+            _args = (args, padids, data, info, session)
+            nursery.start_soon(handle_pad_chunk, *_args)
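+
+
+# Synchronous CLI entry point; trio.run drives the async pipeline.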
+def main(args):
+    p = build_argument_parser(args)
+    args = p.parse_args(args)
+    trio.run(handle_pads, args)