Merge branch 'asyncpads' of decentral1se/etherpump into master
This commit is contained in:
commit
05875dd046
@ -30,6 +30,14 @@ After installing etherpump on the Varia server, we collectively decided to not w
|
||||
Change log / notes
|
||||
==================
|
||||
|
||||
** January 2020**
|
||||
|
||||
Added experimental [Trio](trio.readthedocs.io) support for the `pull` command
|
||||
which enables pads to be processed concurrently. The default `--connection`
|
||||
option is set to 20 which may overpower the target server. If in doubt, set
|
||||
this to a lower number (like 5). This functionality is experimental, be
|
||||
cautious and please report bugs!
|
||||
|
||||
**October 2019**
|
||||
|
||||
Improve `etherpump --help` handling to make it easier for new users.
|
||||
|
@ -7,6 +7,8 @@ from time import sleep
|
||||
from urllib.parse import quote_plus, unquote_plus
|
||||
from urllib.request import HTTPError, urlopen
|
||||
|
||||
import trio
|
||||
|
||||
groupnamepat = re.compile(r"^g\.(\w+)\$")
|
||||
|
||||
|
||||
@ -73,6 +75,27 @@ def getjson(url, max_retry=3, retry_sleep_time=3):
|
||||
return ret
|
||||
|
||||
|
||||
async def agetjson(session, url):
|
||||
"""The asynchronous version of getjson."""
|
||||
RETRY = 20
|
||||
TIMEOUT = 10
|
||||
|
||||
ret = {}
|
||||
ret["_retries"] = 0
|
||||
|
||||
try:
|
||||
response = await session.get(url, timeout=TIMEOUT, retries=RETRY)
|
||||
rurl = response.url
|
||||
ret.update(response.json())
|
||||
ret["_code"] = response.status_code
|
||||
if rurl != url:
|
||||
ret["_url"] = rurl
|
||||
return ret
|
||||
except Exception as e:
|
||||
print('Failed to download {}, saw {}'.format(url, str(e)))
|
||||
return
|
||||
|
||||
|
||||
def loadpadinfo(p):
|
||||
with open(p) as f:
|
||||
info = json.load(f)
|
||||
@ -81,7 +104,10 @@ def loadpadinfo(p):
|
||||
return info
|
||||
|
||||
|
||||
# Python developer Fredrik Lundh (author of elementtree, among other things) has such a function on his website, which works with decimal, hex and named entities:
|
||||
# Python developer Fredrik Lundh (author of elementtree, among other things)
|
||||
# has such a function on his website, which works with decimal, hex and named
|
||||
# entities:
|
||||
|
||||
##
|
||||
# Removes HTML or XML character references and entities from a text string.
|
||||
#
|
||||
@ -112,3 +138,8 @@ def unescape(text):
|
||||
|
||||
def istty():
|
||||
return sys.stdout.isatty() and os.environ.get('TERM') != 'dumb'
|
||||
|
||||
|
||||
def chunks(lst, n):
|
||||
for i in range(0, len(lst), n):
|
||||
yield lst[i : i + n]
|
||||
|
@ -1,18 +1,20 @@
|
||||
"""Check for pads that have changed since last sync (according to .meta.json)"""
|
||||
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from argparse import ArgumentParser
|
||||
from datetime import datetime
|
||||
from fnmatch import fnmatch
|
||||
from time import sleep
|
||||
from urllib.parse import quote, urlencode
|
||||
from urllib.request import HTTPError
|
||||
from xml.etree import ElementTree as ET
|
||||
|
||||
import asks
|
||||
import html5lib
|
||||
import trio
|
||||
from tqdm import tqdm
|
||||
|
||||
from etherpump.commands.common import * # noqa
|
||||
@ -36,410 +38,362 @@ def try_deleting(files):
|
||||
pass
|
||||
|
||||
|
||||
def main(args):
|
||||
p = ArgumentParser(
|
||||
def build_argument_parser(args):
|
||||
parser = ArgumentParser(
|
||||
"Check for pads that have changed since last sync (according to .meta.json)"
|
||||
)
|
||||
|
||||
p.add_argument("padid", nargs="*", default=[])
|
||||
p.add_argument(
|
||||
parser.add_argument("padid", nargs="*", default=[])
|
||||
parser.add_argument(
|
||||
"--glob", default=False, help="download pads matching a glob pattern"
|
||||
)
|
||||
|
||||
p.add_argument(
|
||||
parser.add_argument(
|
||||
"--padinfo",
|
||||
default=".etherpump/settings.json",
|
||||
help="settings, default: .etherpump/settings.json",
|
||||
)
|
||||
p.add_argument(
|
||||
parser.add_argument(
|
||||
"--zerorevs",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="include pads with zero revisions, default: False (i.e. pads with no revisions are skipped)",
|
||||
)
|
||||
p.add_argument(
|
||||
parser.add_argument(
|
||||
"--pub",
|
||||
default="p",
|
||||
help="folder to store files for public pads, default: p",
|
||||
)
|
||||
p.add_argument(
|
||||
parser.add_argument(
|
||||
"--group",
|
||||
default="g",
|
||||
help="folder to store files for group pads, default: g",
|
||||
)
|
||||
p.add_argument(
|
||||
parser.add_argument(
|
||||
"--skip",
|
||||
default=None,
|
||||
type=int,
|
||||
help="skip this many items, default: None",
|
||||
)
|
||||
p.add_argument(
|
||||
parser.add_argument(
|
||||
"--connection",
|
||||
default=5,
|
||||
type=int,
|
||||
help="number of connections to run concurrently",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--meta",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="download meta to PADID.meta.json, default: False",
|
||||
)
|
||||
p.add_argument(
|
||||
parser.add_argument(
|
||||
"--text",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="download text to PADID.txt, default: False",
|
||||
)
|
||||
p.add_argument(
|
||||
parser.add_argument(
|
||||
"--html",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="download html to PADID.html, default: False",
|
||||
)
|
||||
p.add_argument(
|
||||
parser.add_argument(
|
||||
"--dhtml",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="download dhtml to PADID.diff.html, default: False",
|
||||
)
|
||||
p.add_argument(
|
||||
parser.add_argument(
|
||||
"--all",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="download all files (meta, text, html, dhtml), default: False",
|
||||
)
|
||||
p.add_argument(
|
||||
parser.add_argument(
|
||||
"--folder",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="dump files in a folder named PADID (meta, text, html, dhtml), default: False",
|
||||
)
|
||||
p.add_argument(
|
||||
parser.add_argument(
|
||||
"--output",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="output changed padids on stdout",
|
||||
)
|
||||
p.add_argument(
|
||||
parser.add_argument(
|
||||
"--force",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="reload, even if revisions count matches previous",
|
||||
)
|
||||
p.add_argument(
|
||||
parser.add_argument(
|
||||
"--no-raw-ext",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="save plain text as padname with no (additional) extension",
|
||||
)
|
||||
p.add_argument(
|
||||
parser.add_argument(
|
||||
"--fix-names",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="normalize padid's (no spaces, special control chars) for use in file names",
|
||||
)
|
||||
|
||||
p.add_argument(
|
||||
parser.add_argument(
|
||||
"--filter-ext", default=None, help="filter pads by extension"
|
||||
)
|
||||
|
||||
p.add_argument(
|
||||
parser.add_argument(
|
||||
"--css",
|
||||
default="/styles.css",
|
||||
help="add css url to output pages, default: /styles.css",
|
||||
)
|
||||
p.add_argument(
|
||||
parser.add_argument(
|
||||
"--script",
|
||||
default="/versions.js",
|
||||
help="add script url to output pages, default: /versions.js",
|
||||
)
|
||||
|
||||
p.add_argument(
|
||||
parser.add_argument(
|
||||
"--nopublish",
|
||||
default="__NOPUBLISH__",
|
||||
help="no publish magic word, default: __NOPUBLISH__",
|
||||
)
|
||||
p.add_argument(
|
||||
parser.add_argument(
|
||||
"--publish",
|
||||
default="__PUBLISH__",
|
||||
help="the publish magic word, default: __PUBLISH__",
|
||||
)
|
||||
p.add_argument(
|
||||
parser.add_argument(
|
||||
"--publish-opt-in",
|
||||
default=False,
|
||||
action="store_true",
|
||||
help="ensure `--publish` is honoured instead of `--nopublish`",
|
||||
)
|
||||
return parser
|
||||
|
||||
args = p.parse_args(args)
|
||||
|
||||
async def get_padids(args, info, data, session):
|
||||
if args.padid:
|
||||
padids = args.padid
|
||||
elif args.glob:
|
||||
url = info['localapiurl'] + 'listAllPads?' + urlencode(data)
|
||||
padids = await agetjson(session, url)
|
||||
padids = padids['data']['padIDs']
|
||||
padids = [x for x in padids if fnmatch(x, args.glob)]
|
||||
else:
|
||||
url = info['localapiurl'] + 'listAllPads?' + urlencode(data)
|
||||
padids = await agetjson(session, url)
|
||||
padids = padids['data']['padIDs']
|
||||
|
||||
padids.sort()
|
||||
return padids
|
||||
|
||||
|
||||
async def handle_pad(args, padid, data, info, session):
|
||||
raw_ext = ".raw.txt"
|
||||
if args.no_raw_ext:
|
||||
raw_ext = ""
|
||||
|
||||
info = loadpadinfo(args.padinfo)
|
||||
data = {}
|
||||
data['apikey'] = info['apikey']
|
||||
data['padID'] = padid
|
||||
p = padpath(padid, args.pub, args.group, args.fix_names)
|
||||
if args.folder:
|
||||
p = os.path.join(p, padid)
|
||||
|
||||
if args.padid:
|
||||
padids = args.padid
|
||||
elif args.glob:
|
||||
padids = getjson(
|
||||
info['localapiurl'] + 'listAllPads?' + urlencode(data)
|
||||
)['data']['padIDs']
|
||||
padids = [x for x in padids if fnmatch(x, args.glob)]
|
||||
else:
|
||||
padids = getjson(
|
||||
info['localapiurl'] + 'listAllPads?' + urlencode(data)
|
||||
)['data']['padIDs']
|
||||
padids.sort()
|
||||
numpads = len(padids)
|
||||
# maxmsglen = 0
|
||||
count = 0
|
||||
metapath = p + ".meta.json"
|
||||
revisions = None
|
||||
tries = 1
|
||||
skip = False
|
||||
padurlbase = re.sub(r"api/1.2.9/$", "p/", info["apiurl"])
|
||||
meta = {}
|
||||
|
||||
progress_kwargs = {}
|
||||
if not istty():
|
||||
progress_kwargs.update(dict(disable=True))
|
||||
progress_pads = tqdm(iterable=padids, total=len(padids), **progress_kwargs)
|
||||
|
||||
for i, padid in enumerate(progress_pads):
|
||||
if args.skip != None and i < args.skip:
|
||||
continue
|
||||
|
||||
data['padID'] = padid
|
||||
p = padpath(padid, args.pub, args.group, args.fix_names)
|
||||
if args.folder:
|
||||
p = os.path.join(p, padid)
|
||||
|
||||
metapath = p + ".meta.json"
|
||||
revisions = None
|
||||
tries = 1
|
||||
skip = False
|
||||
padurlbase = re.sub(r"api/1.2.9/$", "p/", info["apiurl"])
|
||||
meta = {}
|
||||
|
||||
while True:
|
||||
try:
|
||||
if os.path.exists(metapath):
|
||||
with open(metapath) as f:
|
||||
meta.update(json.load(f))
|
||||
revisions = getjson(
|
||||
info['localapiurl']
|
||||
+ 'getRevisionsCount?'
|
||||
+ urlencode(data)
|
||||
)['data']['revisions']
|
||||
if meta['revisions'] == revisions and not args.force:
|
||||
skip = True
|
||||
break
|
||||
|
||||
meta['padid'] = padid
|
||||
versions = meta["versions"] = []
|
||||
versions.append(
|
||||
{
|
||||
"url": padurlbase + quote(padid),
|
||||
"type": "pad",
|
||||
"code": 200,
|
||||
}
|
||||
while True:
|
||||
try:
|
||||
if os.path.exists(metapath):
|
||||
with open(metapath) as f:
|
||||
meta.update(json.load(f))
|
||||
url = (
|
||||
info['localapiurl'] + 'getRevisionsCount?' + urlencode(data)
|
||||
)
|
||||
|
||||
if revisions == None:
|
||||
meta['revisions'] = getjson(
|
||||
info['localapiurl']
|
||||
+ 'getRevisionsCount?'
|
||||
+ urlencode(data)
|
||||
)['data']['revisions']
|
||||
else:
|
||||
meta['revisions'] = revisions
|
||||
|
||||
if (meta['revisions'] == 0) and (not args.zerorevs):
|
||||
response = await agetjson(session, url)
|
||||
revisions = response['data']['revisions']
|
||||
if meta['revisions'] == revisions and not args.force:
|
||||
skip = True
|
||||
break
|
||||
|
||||
# todo: load more metadata!
|
||||
meta['group'], meta['pad'] = splitpadname(padid)
|
||||
meta['pathbase'] = p
|
||||
meta['lastedited_raw'] = int(
|
||||
getjson(
|
||||
info['localapiurl'] + 'getLastEdited?' + urlencode(data)
|
||||
)['data']['lastEdited']
|
||||
meta['padid'] = padid
|
||||
versions = meta["versions"] = []
|
||||
versions.append(
|
||||
{"url": padurlbase + quote(padid), "type": "pad", "code": 200,}
|
||||
)
|
||||
|
||||
if revisions is None:
|
||||
url = (
|
||||
info['localapiurl'] + 'getRevisionsCount?' + urlencode(data)
|
||||
)
|
||||
meta['lastedited_iso'] = datetime.fromtimestamp(
|
||||
int(meta['lastedited_raw']) / 1000
|
||||
).isoformat()
|
||||
meta['author_ids'] = getjson(
|
||||
info['localapiurl'] + 'listAuthorsOfPad?' + urlencode(data)
|
||||
)['data']['authorIDs']
|
||||
response = await agetjson(session, url)
|
||||
meta['revisions'] = response['data']['revisions']
|
||||
else:
|
||||
meta['revisions'] = revisions
|
||||
|
||||
if (meta['revisions'] == 0) and (not args.zerorevs):
|
||||
skip = True
|
||||
break
|
||||
except HTTPError as e:
|
||||
tries += 1
|
||||
if tries > 3:
|
||||
print(
|
||||
"Too many failures ({0}), skipping".format(padid),
|
||||
file=sys.stderr,
|
||||
)
|
||||
skip = True
|
||||
break
|
||||
else:
|
||||
sleep(3)
|
||||
except TypeError as e:
|
||||
|
||||
# todo: load more metadata!
|
||||
meta['group'], meta['pad'] = splitpadname(padid)
|
||||
meta['pathbase'] = p
|
||||
|
||||
url = info['localapiurl'] + 'getLastEdited?' + urlencode(data)
|
||||
response = await agetjson(session, url)
|
||||
meta['lastedited_raw'] = int(response['data']['lastEdited'])
|
||||
|
||||
meta['lastedited_iso'] = datetime.fromtimestamp(
|
||||
int(meta['lastedited_raw']) / 1000
|
||||
).isoformat()
|
||||
|
||||
url = info['localapiurl'] + 'listAuthorsOfPad?' + urlencode(data)
|
||||
response = await agetjson(session, url)
|
||||
meta['author_ids'] = response['data']['authorIDs']
|
||||
|
||||
break
|
||||
except HTTPError as e:
|
||||
tries += 1
|
||||
if tries > 3:
|
||||
print(
|
||||
"Type Error loading pad {0} (phantom pad?), skipping".format(
|
||||
padid
|
||||
),
|
||||
"Too many failures ({0}), skipping".format(padid),
|
||||
file=sys.stderr,
|
||||
)
|
||||
skip = True
|
||||
break
|
||||
else:
|
||||
await trio.sleep(3)
|
||||
except TypeError as e:
|
||||
print(
|
||||
"Type Error loading pad {0} (phantom pad?), skipping".format(
|
||||
padid
|
||||
),
|
||||
file=sys.stderr,
|
||||
)
|
||||
skip = True
|
||||
break
|
||||
|
||||
if skip:
|
||||
continue
|
||||
if skip:
|
||||
return
|
||||
|
||||
count += 1
|
||||
if args.output:
|
||||
print(padid)
|
||||
|
||||
if args.output:
|
||||
print(padid)
|
||||
if args.all or (args.meta or args.text or args.html or args.dhtml):
|
||||
try:
|
||||
os.makedirs(os.path.split(metapath)[0])
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
if args.all or (args.meta or args.text or args.html or args.dhtml):
|
||||
try:
|
||||
os.makedirs(os.path.split(metapath)[0])
|
||||
except OSError:
|
||||
pass
|
||||
if args.all or args.text:
|
||||
url = info['localapiurl'] + 'getText?' + urlencode(data)
|
||||
text = await agetjson(session, url)
|
||||
ver = {"type": "text"}
|
||||
versions.append(ver)
|
||||
ver["code"] = text["_code"]
|
||||
if text["_code"] == 200:
|
||||
text = text['data']['text']
|
||||
|
||||
if args.all or args.text:
|
||||
text = getjson(info['localapiurl'] + 'getText?' + urlencode(data))
|
||||
ver = {"type": "text"}
|
||||
versions.append(ver)
|
||||
ver["code"] = text["_code"]
|
||||
if text["_code"] == 200:
|
||||
text = text['data']['text']
|
||||
|
||||
##########################################
|
||||
## ENFORCE __NOPUBLISH__ MAGIC WORD
|
||||
##########################################
|
||||
if args.nopublish and args.nopublish in text:
|
||||
# NEED TO PURGE ANY EXISTING DOCS
|
||||
try_deleting(
|
||||
(
|
||||
p + raw_ext,
|
||||
p + ".raw.html",
|
||||
p + ".diff.html",
|
||||
p + ".meta.json",
|
||||
)
|
||||
##########################################
|
||||
## ENFORCE __NOPUBLISH__ MAGIC WORD
|
||||
##########################################
|
||||
if args.nopublish and args.nopublish in text:
|
||||
# NEED TO PURGE ANY EXISTING DOCS
|
||||
try_deleting(
|
||||
(
|
||||
p + raw_ext,
|
||||
p + ".raw.html",
|
||||
p + ".diff.html",
|
||||
p + ".meta.json",
|
||||
)
|
||||
continue
|
||||
)
|
||||
return
|
||||
|
||||
##########################################
|
||||
## ENFORCE __PUBLISH__ MAGIC WORD
|
||||
##########################################
|
||||
if args.publish_opt_in and args.publish not in text:
|
||||
try_deleting(
|
||||
(
|
||||
p + raw_ext,
|
||||
p + ".raw.html",
|
||||
p + ".diff.html",
|
||||
p + ".meta.json",
|
||||
)
|
||||
##########################################
|
||||
## ENFORCE __PUBLISH__ MAGIC WORD
|
||||
##########################################
|
||||
if args.publish_opt_in and args.publish not in text:
|
||||
try_deleting(
|
||||
(
|
||||
p + raw_ext,
|
||||
p + ".raw.html",
|
||||
p + ".diff.html",
|
||||
p + ".meta.json",
|
||||
)
|
||||
continue
|
||||
)
|
||||
return
|
||||
|
||||
ver["path"] = p + raw_ext
|
||||
ver["url"] = quote(ver["path"])
|
||||
with open(ver["path"], "w") as f:
|
||||
f.write(text)
|
||||
# once the content is settled, compute a hash
|
||||
# and link it in the metadata!
|
||||
ver["path"] = p + raw_ext
|
||||
ver["url"] = quote(ver["path"])
|
||||
with open(ver["path"], "w") as f:
|
||||
f.write(text)
|
||||
# once the content is settled, compute a hash
|
||||
# and link it in the metadata!
|
||||
|
||||
links = []
|
||||
if args.css:
|
||||
links.append({"href": args.css, "rel": "stylesheet"})
|
||||
# todo, make this process reflect which files actually were made
|
||||
versionbaseurl = quote(padid)
|
||||
links = []
|
||||
if args.css:
|
||||
links.append({"href": args.css, "rel": "stylesheet"})
|
||||
# todo, make this process reflect which files actually were made
|
||||
versionbaseurl = quote(padid)
|
||||
links.append(
|
||||
{
|
||||
"href": versions[0]["url"],
|
||||
"rel": "alternate",
|
||||
"type": "text/html",
|
||||
"title": "Etherpad",
|
||||
}
|
||||
)
|
||||
if args.all or args.text:
|
||||
links.append(
|
||||
{
|
||||
"href": versions[0]["url"],
|
||||
"href": versionbaseurl + raw_ext,
|
||||
"rel": "alternate",
|
||||
"type": "text/html",
|
||||
"title": "Etherpad",
|
||||
"type": "text/plain",
|
||||
"title": "Plain text",
|
||||
}
|
||||
)
|
||||
if args.all or args.html:
|
||||
links.append(
|
||||
{
|
||||
"href": versionbaseurl + ".raw.html",
|
||||
"rel": "alternate",
|
||||
"type": "text/html",
|
||||
"title": "HTML",
|
||||
}
|
||||
)
|
||||
if args.all or args.dhtml:
|
||||
links.append(
|
||||
{
|
||||
"href": versionbaseurl + ".diff.html",
|
||||
"rel": "alternate",
|
||||
"type": "text/html",
|
||||
"title": "HTML with author colors",
|
||||
}
|
||||
)
|
||||
if args.all or args.meta:
|
||||
links.append(
|
||||
{
|
||||
"href": versionbaseurl + ".meta.json",
|
||||
"rel": "alternate",
|
||||
"type": "application/json",
|
||||
"title": "Meta data",
|
||||
}
|
||||
)
|
||||
if args.all or args.text:
|
||||
links.append(
|
||||
{
|
||||
"href": versionbaseurl + raw_ext,
|
||||
"rel": "alternate",
|
||||
"type": "text/plain",
|
||||
"title": "Plain text",
|
||||
}
|
||||
)
|
||||
if args.all or args.html:
|
||||
links.append(
|
||||
{
|
||||
"href": versionbaseurl + ".raw.html",
|
||||
"rel": "alternate",
|
||||
"type": "text/html",
|
||||
"title": "HTML",
|
||||
}
|
||||
)
|
||||
if args.all or args.dhtml:
|
||||
links.append(
|
||||
{
|
||||
"href": versionbaseurl + ".diff.html",
|
||||
"rel": "alternate",
|
||||
"type": "text/html",
|
||||
"title": "HTML with author colors",
|
||||
}
|
||||
)
|
||||
if args.all or args.meta:
|
||||
links.append(
|
||||
{
|
||||
"href": versionbaseurl + ".meta.json",
|
||||
"rel": "alternate",
|
||||
"type": "application/json",
|
||||
"title": "Meta data",
|
||||
}
|
||||
)
|
||||
|
||||
# links.append({"href":"/", "rel":"search", "type":"text/html", "title":"Index"})
|
||||
|
||||
if args.all or args.dhtml:
|
||||
data['startRev'] = "0"
|
||||
html = getjson(
|
||||
info['localapiurl'] + 'createDiffHTML?' + urlencode(data)
|
||||
)
|
||||
ver = {"type": "diffhtml"}
|
||||
versions.append(ver)
|
||||
ver["code"] = html["_code"]
|
||||
if html["_code"] == 200:
|
||||
try:
|
||||
html = html['data']['html']
|
||||
ver["path"] = p + ".diff.html"
|
||||
ver["url"] = quote(ver["path"])
|
||||
# doc = html5lib.parse(html, treebuilder="etree", override_encoding="utf-8", namespaceHTMLElements=False)
|
||||
doc = html5lib.parse(
|
||||
html, treebuilder="etree", namespaceHTMLElements=False
|
||||
)
|
||||
html5tidy(
|
||||
doc,
|
||||
indent=True,
|
||||
title=padid,
|
||||
scripts=args.script,
|
||||
links=links,
|
||||
)
|
||||
with open(ver["path"], "w") as f:
|
||||
print(
|
||||
ET.tostring(doc, method="html", encoding="unicode"),
|
||||
file=f,
|
||||
)
|
||||
except TypeError:
|
||||
# Malformed / incomplete response, record the message (such as "internal error") in the metadata and write NO file!
|
||||
ver["message"] = html["message"]
|
||||
# with open(ver["path"], "w") as f:
|
||||
# print ("""<pre>{0}</pre>""".format(json.dumps(html, indent=2)), file=f)
|
||||
|
||||
# Process text, html, dhtml, all options
|
||||
if args.all or args.html:
|
||||
html = getjson(info['localapiurl'] + 'getHTML?' + urlencode(data))
|
||||
ver = {"type": "html"}
|
||||
versions.append(ver)
|
||||
ver["code"] = html["_code"]
|
||||
if html["_code"] == 200:
|
||||
if args.all or args.dhtml:
|
||||
data['startRev'] = "0"
|
||||
url = info['localapiurl'] + 'createDiffHTML?' + urlencode(data)
|
||||
html = await agetjson(session, url)
|
||||
ver = {"type": "diffhtml"}
|
||||
versions.append(ver)
|
||||
ver["code"] = html["_code"]
|
||||
if html["_code"] == 200:
|
||||
try:
|
||||
html = html['data']['html']
|
||||
ver["path"] = p + ".raw.html"
|
||||
ver["path"] = p + ".diff.html"
|
||||
ver["url"] = quote(ver["path"])
|
||||
doc = html5lib.parse(
|
||||
html, treebuilder="etree", namespaceHTMLElements=False
|
||||
@ -456,12 +410,68 @@ def main(args):
|
||||
ET.tostring(doc, method="html", encoding="unicode"),
|
||||
file=f,
|
||||
)
|
||||
except TypeError:
|
||||
ver["message"] = html["message"]
|
||||
|
||||
# output meta
|
||||
if args.all or args.meta:
|
||||
ver = {"type": "meta"}
|
||||
versions.append(ver)
|
||||
ver["path"] = metapath
|
||||
ver["url"] = quote(metapath)
|
||||
with open(metapath, "w") as f:
|
||||
json.dump(meta, f, indent=2)
|
||||
# Process text, html, dhtml, all options
|
||||
if args.all or args.html:
|
||||
url = info['localapiurl'] + 'getHTML?' + urlencode(data)
|
||||
html = await agetjson(session, url)
|
||||
ver = {"type": "html"}
|
||||
versions.append(ver)
|
||||
ver["code"] = html["_code"]
|
||||
if html["_code"] == 200:
|
||||
html = html['data']['html']
|
||||
ver["path"] = p + ".raw.html"
|
||||
ver["url"] = quote(ver["path"])
|
||||
doc = html5lib.parse(
|
||||
html, treebuilder="etree", namespaceHTMLElements=False
|
||||
)
|
||||
html5tidy(
|
||||
doc, indent=True, title=padid, scripts=args.script, links=links,
|
||||
)
|
||||
with open(ver["path"], "w") as f:
|
||||
print(
|
||||
ET.tostring(doc, method="html", encoding="unicode"), file=f,
|
||||
)
|
||||
|
||||
# output meta
|
||||
if args.all or args.meta:
|
||||
ver = {"type": "meta"}
|
||||
versions.append(ver)
|
||||
ver["path"] = metapath
|
||||
ver["url"] = quote(metapath)
|
||||
with open(metapath, "w") as f:
|
||||
json.dump(meta, f, indent=2)
|
||||
|
||||
|
||||
async def handle_pad_chunk(args, padids, data, info, session):
|
||||
progress_kwargs = {}
|
||||
if not istty():
|
||||
progress_kwargs.update(dict(disable=True))
|
||||
|
||||
padids = tqdm(iterable=padids, total=len(padids), **progress_kwargs,)
|
||||
for padid in padids:
|
||||
await handle_pad(args, padid, data, info, session)
|
||||
|
||||
|
||||
async def handle_pads(args):
|
||||
session = asks.Session(connections=args.connection)
|
||||
info = loadpadinfo(args.padinfo)
|
||||
data = {'apikey': info['apikey']}
|
||||
|
||||
padids = await get_padids(args, info, data, session)
|
||||
if args.skip:
|
||||
padids = padids[args.skip : len(padids)]
|
||||
CHUNK_SIZE = math.ceil(len(padids) / 3)
|
||||
|
||||
async with trio.open_nursery() as nursery:
|
||||
for padids in chunks(padids, CHUNK_SIZE):
|
||||
_args = (args, padids, data, info, session)
|
||||
nursery.start_soon(handle_pad_chunk, *_args)
|
||||
|
||||
|
||||
def main(args):
|
||||
p = build_argument_parser(args)
|
||||
args = p.parse_args(args)
|
||||
trio.run(handle_pads, args)
|
||||
|
7
setup.py
7
setup.py
@ -43,13 +43,16 @@ setup(
|
||||
long_description=long_description,
|
||||
long_description_content_type='text/markdown',
|
||||
install_requires=[
|
||||
"asks",
|
||||
"html5lib",
|
||||
"jinja2",
|
||||
"python-dateutil",
|
||||
"pypandoc",
|
||||
"tqdm",
|
||||
"python-dateutil",
|
||||
"requests",
|
||||
"tqdm",
|
||||
"trio",
|
||||
],
|
||||
python_requires=">=3.5",
|
||||
classifiers=[
|
||||
'Programming Language :: Python :: 3',
|
||||
'Environment :: Console',
|
||||
|
Loading…
Reference in New Issue
Block a user