Add async support for pull.py #8

Merged
decentral1se merged 6 commits from :asyncpads into master 5 years ago
  1. 8
      README.md
  2. 33
      etherpump/commands/common.py
  3. 218
      etherpump/commands/pull.py
  4. 7
      setup.py

8
README.md

@ -30,6 +30,14 @@ After installing etherpump on the Varia server, we collectively decided to not w
Change log / notes
==================
** January 2020**
Added experimental [Trio](trio.readthedocs.io) support for the `pull` command
which enables pads to be processed concurrently. The default `--connection`
option is set to 20 which may overpower the target server. If in doubt, set
this to a lower number (like 5). This functionality is experimental, be
cautious and please report bugs!
**October 2019**
Improve `etherpump --help` handling to make it easier for new users.

33
etherpump/commands/common.py

@ -7,6 +7,8 @@ from time import sleep
from urllib.parse import quote_plus, unquote_plus
from urllib.request import HTTPError, urlopen
import trio
groupnamepat = re.compile(r"^g\.(\w+)\$")
@ -73,6 +75,27 @@ def getjson(url, max_retry=3, retry_sleep_time=3):
return ret
async def agetjson(session, url):
"""The asynchronous version of getjson."""
RETRY = 20
TIMEOUT = 10
ret = {}
ret["_retries"] = 0
try:
response = await session.get(url, timeout=TIMEOUT, retries=RETRY)
rurl = response.url
ret.update(response.json())
ret["_code"] = response.status_code
if rurl != url:
ret["_url"] = rurl
return ret
except Exception as e:
print('Failed to download {}, saw {}'.format(url, str(e)))
return
def loadpadinfo(p):
with open(p) as f:
info = json.load(f)
@ -81,7 +104,10 @@ def loadpadinfo(p):
return info
# Python developer Fredrik Lundh (author of elementtree, among other things) has such a function on his website, which works with decimal, hex and named entities:
# Python developer Fredrik Lundh (author of elementtree, among other things)
# has such a function on his website, which works with decimal, hex and named
# entities:
##
# Removes HTML or XML character references and entities from a text string.
#
@ -112,3 +138,8 @@ def unescape(text):
def istty():
return sys.stdout.isatty() and os.environ.get('TERM') != 'dumb'
def chunks(lst, n):
for i in range(0, len(lst), n):
yield lst[i : i + n]

218
etherpump/commands/pull.py

@ -1,18 +1,20 @@
"""Check for pads that have changed since last sync (according to .meta.json)"""
import json
import math
import os
import re
import sys
from argparse import ArgumentParser
from datetime import datetime
from fnmatch import fnmatch
from time import sleep
from urllib.parse import quote, urlencode
from urllib.request import HTTPError
from xml.etree import ElementTree as ET
import asks
import html5lib
import trio
from tqdm import tqdm
from etherpump.commands.common import * # noqa
@ -36,170 +38,160 @@ def try_deleting(files):
pass
def main(args):
p = ArgumentParser(
def build_argument_parser(args):
parser = ArgumentParser(
"Check for pads that have changed since last sync (according to .meta.json)"
)
p.add_argument("padid", nargs="*", default=[])
p.add_argument(
parser.add_argument("padid", nargs="*", default=[])
parser.add_argument(
"--glob", default=False, help="download pads matching a glob pattern"
)
p.add_argument(
parser.add_argument(
"--padinfo",
default=".etherpump/settings.json",
help="settings, default: .etherpump/settings.json",
)
p.add_argument(
parser.add_argument(
"--zerorevs",
default=False,
action="store_true",
help="include pads with zero revisions, default: False (i.e. pads with no revisions are skipped)",
)
p.add_argument(
parser.add_argument(
"--pub",
default="p",
help="folder to store files for public pads, default: p",
)
p.add_argument(
parser.add_argument(
"--group",
default="g",
help="folder to store files for group pads, default: g",
)
p.add_argument(
parser.add_argument(
"--skip",
default=None,
type=int,
help="skip this many items, default: None",
)
p.add_argument(
parser.add_argument(
"--connection",
default=5,
type=int,
help="number of connections to run concurrently",
)
parser.add_argument(
"--meta",
default=False,
action="store_true",
help="download meta to PADID.meta.json, default: False",
)
p.add_argument(
parser.add_argument(
"--text",
default=False,
action="store_true",
help="download text to PADID.txt, default: False",
)
p.add_argument(
parser.add_argument(
"--html",
default=False,
action="store_true",
help="download html to PADID.html, default: False",
)
p.add_argument(
parser.add_argument(
"--dhtml",
default=False,
action="store_true",
help="download dhtml to PADID.diff.html, default: False",
)
p.add_argument(
parser.add_argument(
"--all",
default=False,
action="store_true",
help="download all files (meta, text, html, dhtml), default: False",
)
p.add_argument(
parser.add_argument(
"--folder",
default=False,
action="store_true",
help="dump files in a folder named PADID (meta, text, html, dhtml), default: False",
)
p.add_argument(
parser.add_argument(
"--output",
default=False,
action="store_true",
help="output changed padids on stdout",
)
p.add_argument(
parser.add_argument(
"--force",
default=False,
action="store_true",
help="reload, even if revisions count matches previous",
)
p.add_argument(
parser.add_argument(
"--no-raw-ext",
default=False,
action="store_true",
help="save plain text as padname with no (additional) extension",
)
p.add_argument(
parser.add_argument(
"--fix-names",
default=False,
action="store_true",
help="normalize padid's (no spaces, special control chars) for use in file names",
)
p.add_argument(
parser.add_argument(
"--filter-ext", default=None, help="filter pads by extension"
)
p.add_argument(
parser.add_argument(
"--css",
default="/styles.css",
help="add css url to output pages, default: /styles.css",
)
p.add_argument(
parser.add_argument(
"--script",
default="/versions.js",
help="add script url to output pages, default: /versions.js",
)
p.add_argument(
parser.add_argument(
"--nopublish",
default="__NOPUBLISH__",
help="no publish magic word, default: __NOPUBLISH__",
)
p.add_argument(
parser.add_argument(
"--publish",
default="__PUBLISH__",
help="the publish magic word, default: __PUBLISH__",
)
p.add_argument(
parser.add_argument(
"--publish-opt-in",
default=False,
action="store_true",
help="ensure `--publish` is honoured instead of `--nopublish`",
)
return parser
args = p.parse_args(args)
raw_ext = ".raw.txt"
if args.no_raw_ext:
raw_ext = ""
info = loadpadinfo(args.padinfo)
data = {}
data['apikey'] = info['apikey']
async def get_padids(args, info, data, session):
if args.padid:
padids = args.padid
elif args.glob:
padids = getjson(
info['localapiurl'] + 'listAllPads?' + urlencode(data)
)['data']['padIDs']
url = info['localapiurl'] + 'listAllPads?' + urlencode(data)
padids = await agetjson(session, url)
padids = padids['data']['padIDs']
padids = [x for x in padids if fnmatch(x, args.glob)]
else:
padids = getjson(
info['localapiurl'] + 'listAllPads?' + urlencode(data)
)['data']['padIDs']
url = info['localapiurl'] + 'listAllPads?' + urlencode(data)
padids = await agetjson(session, url)
padids = padids['data']['padIDs']
padids.sort()
numpads = len(padids)
# maxmsglen = 0
count = 0
return padids
progress_kwargs = {}
if not istty():
progress_kwargs.update(dict(disable=True))
progress_pads = tqdm(iterable=padids, total=len(padids), **progress_kwargs)
for i, padid in enumerate(progress_pads):
if args.skip != None and i < args.skip:
continue
async def handle_pad(args, padid, data, info, session):
raw_ext = ".raw.txt"
if args.no_raw_ext:
raw_ext = ""
data['padID'] = padid
p = padpath(padid, args.pub, args.group, args.fix_names)
@ -218,11 +210,11 @@ def main(args):
if os.path.exists(metapath):
with open(metapath) as f:
meta.update(json.load(f))
revisions = getjson(
info['localapiurl']
+ 'getRevisionsCount?'
+ urlencode(data)
)['data']['revisions']
url = (
info['localapiurl'] + 'getRevisionsCount?' + urlencode(data)
)
response = await agetjson(session, url)
revisions = response['data']['revisions']
if meta['revisions'] == revisions and not args.force:
skip = True
break
@ -230,19 +222,15 @@ def main(args):
meta['padid'] = padid
versions = meta["versions"] = []
versions.append(
{
"url": padurlbase + quote(padid),
"type": "pad",
"code": 200,
}
{"url": padurlbase + quote(padid), "type": "pad", "code": 200,}
)
if revisions == None:
meta['revisions'] = getjson(
info['localapiurl']
+ 'getRevisionsCount?'
+ urlencode(data)
)['data']['revisions']
if revisions is None:
url = (
info['localapiurl'] + 'getRevisionsCount?' + urlencode(data)
)
response = await agetjson(session, url)
meta['revisions'] = response['data']['revisions']
else:
meta['revisions'] = revisions
@ -253,17 +241,19 @@ def main(args):
# todo: load more metadata!
meta['group'], meta['pad'] = splitpadname(padid)
meta['pathbase'] = p
meta['lastedited_raw'] = int(
getjson(
info['localapiurl'] + 'getLastEdited?' + urlencode(data)
)['data']['lastEdited']
)
url = info['localapiurl'] + 'getLastEdited?' + urlencode(data)
response = await agetjson(session, url)
meta['lastedited_raw'] = int(response['data']['lastEdited'])
meta['lastedited_iso'] = datetime.fromtimestamp(
int(meta['lastedited_raw']) / 1000
).isoformat()
meta['author_ids'] = getjson(
info['localapiurl'] + 'listAuthorsOfPad?' + urlencode(data)
)['data']['authorIDs']
url = info['localapiurl'] + 'listAuthorsOfPad?' + urlencode(data)
response = await agetjson(session, url)
meta['author_ids'] = response['data']['authorIDs']
break
except HTTPError as e:
tries += 1
@ -275,7 +265,7 @@ def main(args):
skip = True
break
else:
sleep(3)
await trio.sleep(3)
except TypeError as e:
print(
"Type Error loading pad {0} (phantom pad?), skipping".format(
@ -287,9 +277,7 @@ def main(args):
break
if skip:
continue
count += 1
return
if args.output:
print(padid)
@ -301,7 +289,8 @@ def main(args):
pass
if args.all or args.text:
text = getjson(info['localapiurl'] + 'getText?' + urlencode(data))
url = info['localapiurl'] + 'getText?' + urlencode(data)
text = await agetjson(session, url)
ver = {"type": "text"}
versions.append(ver)
ver["code"] = text["_code"]
@ -321,7 +310,7 @@ def main(args):
p + ".meta.json",
)
)
continue
return
##########################################
## ENFORCE __PUBLISH__ MAGIC WORD
@ -335,7 +324,7 @@ def main(args):
p + ".meta.json",
)
)
continue
return
ver["path"] = p + raw_ext
ver["url"] = quote(ver["path"])
@ -394,13 +383,10 @@ def main(args):
}
)
# links.append({"href":"/", "rel":"search", "type":"text/html", "title":"Index"})
if args.all or args.dhtml:
data['startRev'] = "0"
html = getjson(
info['localapiurl'] + 'createDiffHTML?' + urlencode(data)
)
url = info['localapiurl'] + 'createDiffHTML?' + urlencode(data)
html = await agetjson(session, url)
ver = {"type": "diffhtml"}
versions.append(ver)
ver["code"] = html["_code"]
@ -409,7 +395,6 @@ def main(args):
html = html['data']['html']
ver["path"] = p + ".diff.html"
ver["url"] = quote(ver["path"])
# doc = html5lib.parse(html, treebuilder="etree", override_encoding="utf-8", namespaceHTMLElements=False)
doc = html5lib.parse(
html, treebuilder="etree", namespaceHTMLElements=False
)
@ -426,14 +411,12 @@ def main(args):
file=f,
)
except TypeError:
# Malformed / incomplete response, record the message (such as "internal error") in the metadata and write NO file!
ver["message"] = html["message"]
# with open(ver["path"], "w") as f:
# print ("""<pre>{0}</pre>""".format(json.dumps(html, indent=2)), file=f)
# Process text, html, dhtml, all options
if args.all or args.html:
html = getjson(info['localapiurl'] + 'getHTML?' + urlencode(data))
url = info['localapiurl'] + 'getHTML?' + urlencode(data)
html = await agetjson(session, url)
ver = {"type": "html"}
versions.append(ver)
ver["code"] = html["_code"]
@ -445,16 +428,11 @@ def main(args):
html, treebuilder="etree", namespaceHTMLElements=False
)
html5tidy(
doc,
indent=True,
title=padid,
scripts=args.script,
links=links,
doc, indent=True, title=padid, scripts=args.script, links=links,
)
with open(ver["path"], "w") as f:
print(
ET.tostring(doc, method="html", encoding="unicode"),
file=f,
ET.tostring(doc, method="html", encoding="unicode"), file=f,
)
# output meta
@ -465,3 +443,35 @@ def main(args):
ver["url"] = quote(metapath)
with open(metapath, "w") as f:
json.dump(meta, f, indent=2)
async def handle_pad_chunk(args, padids, data, info, session):
progress_kwargs = {}
if not istty():
progress_kwargs.update(dict(disable=True))
padids = tqdm(iterable=padids, total=len(padids), **progress_kwargs,)
for padid in padids:
await handle_pad(args, padid, data, info, session)
async def handle_pads(args):
session = asks.Session(connections=args.connection)
info = loadpadinfo(args.padinfo)
data = {'apikey': info['apikey']}
padids = await get_padids(args, info, data, session)
if args.skip:
padids = padids[args.skip : len(padids)]
CHUNK_SIZE = math.ceil(len(padids) / 3)
async with trio.open_nursery() as nursery:
for padids in chunks(padids, CHUNK_SIZE):
_args = (args, padids, data, info, session)
nursery.start_soon(handle_pad_chunk, *_args)
def main(args):
p = build_argument_parser(args)
args = p.parse_args(args)
trio.run(handle_pads, args)

7
setup.py

@ -43,13 +43,16 @@ setup(
long_description=long_description,
long_description_content_type='text/markdown',
install_requires=[
"asks",
"html5lib",
"jinja2",
"python-dateutil",
"pypandoc",
"tqdm",
"python-dateutil",
"requests",
"tqdm",
"trio",
],
python_requires=">=3.5",
classifiers=[
'Programming Language :: Python :: 3',
'Environment :: Console',

Loading…
Cancel
Save