
Publish new 0.1.18 version

Branch: main
Committed by manetta, 3 years ago
Commit: ecc26c971d

Changed files:

  1. etherpump/__init__.py (2 lines changed)
  2. etherpump/commands/pull.py (943 lines changed)
  3. pyproject.toml (2 lines changed)

2
etherpump/__init__.py

@@ -4,7 +4,7 @@ import sys
 DATAPATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")
-__VERSION__ = "0.0.17"
+__VERSION__ = "0.0.18"
 def subcommands():

etherpump/commands/pull.py (943 lines changed)

@@ -20,8 +20,8 @@ from etherpump.commands.html5tidy import html5tidy

"""
pull(meta):
    Update meta data files for those that have changed.
    Check for changed pads by looking at revisions & comparing to existing
todo...
use/prefer public interfaces ? (export functions)
"""
@@ -31,487 +31,498 @@ skipped, saved = 0, 0
async def try_deleting(files):
    for f in files:
        try:
            path = trio.Path(f)
            if os.path.exists(path):
                await path.rmdir()
        except Exception as exception:
            print("PANIC: {}".format(exception))
def build_argument_parser(args):
    parser = ArgumentParser(
        "Check for pads that have changed since last sync (according to .meta.json)"
    )
    parser.add_argument("padid", nargs="*", default=[])
    parser.add_argument(
        "--glob", default=False, help="download pads matching a glob pattern"
    )
    parser.add_argument(
        "--padinfo",
        default=".etherpump/settings.json",
        help="settings, default: .etherpump/settings.json",
    )
    parser.add_argument(
        "--zerorevs",
        default=False,
        action="store_true",
        help="include pads with zero revisions, default: False (i.e. pads with no revisions are skipped)",
    )
    parser.add_argument(
        "--pub",
        default="p",
        help="folder to store files for public pads, default: p",
    )
    parser.add_argument(
        "--group",
        default="g",
        help="folder to store files for group pads, default: g",
    )
    parser.add_argument(
        "--skip",
        default=None,
        type=int,
        help="skip this many items, default: None",
    )
    parser.add_argument(
        "--connection",
        default=200,
        type=int,
        help="number of connections to run concurrently",
    )
    parser.add_argument(
        "--meta",
        default=False,
        action="store_true",
        help="download meta to PADID.meta.json, default: False",
    )
    parser.add_argument(
        "--text",
        default=False,
        action="store_true",
        help="download text to PADID.txt, default: False",
    )
    parser.add_argument(
        "--html",
        default=False,
        action="store_true",
        help="download html to PADID.html, default: False",
    )
    parser.add_argument(
        "--dhtml",
        default=False,
        action="store_true",
        help="download dhtml to PADID.diff.html, default: False",
    )
    parser.add_argument(
        "--all",
        default=False,
        action="store_true",
        help="download all files (meta, text, html, dhtml), default: False",
    )
    parser.add_argument(
        "--folder",
        default=False,
        action="store_true",
        help="dump files in a folder named PADID (meta, text, html, dhtml), default: False",
    )
    parser.add_argument(
        "--output",
        default=False,
        action="store_true",
        help="output changed padids on stdout",
    )
    parser.add_argument(
        "--force",
        default=False,
        action="store_true",
        help="reload, even if revisions count matches previous",
    )
    parser.add_argument(
        "--no-raw-ext",
        default=False,
        action="store_true",
        help="save plain text as padname with no (additional) extension",
    )
    parser.add_argument(
        "--fix-names",
        default=False,
        action="store_true",
        help="normalize padid's (no spaces, special control chars) for use in file names",
    )
    parser.add_argument(
        "--filter-ext", default=None, help="filter pads by extension"
    )
    parser.add_argument(
        "--css",
        default="/styles.css",
        help="add css url to output pages, default: /styles.css",
    )
    parser.add_argument(
        "--script",
        default="/versions.js",
        help="add script url to output pages, default: /versions.js",
    )
    parser.add_argument(
        "--nopublish",
        default="__NOPUBLISH__",
        help="no publish magic word, default: __NOPUBLISH__",
    )
    parser.add_argument(
        "--publish",
        default="__PUBLISH__",
        help="the publish magic word, default: __PUBLISH__",
    )
    parser.add_argument(
        "--publish-opt-in",
        default=False,
        action="store_true",
        help="ensure `--publish` is honoured instead of `--nopublish`",
    )
    return parser
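A short usage sketch (an assumption, not part of the commit) showing how the flags defined above are consumed, mirroring what main() does at the bottom of this file:

argv = ["--text", "--meta", "--connection", "50", "mypad"]  # hypothetical arguments
parser = build_argument_parser(argv)
options = parser.parse_args(argv)
# options.padid == ["mypad"], options.text is True, options.connection == 50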
async def get_padids(args, info, data, session):
    if args.padid:
        padids = args.padid
    elif args.glob:
        url = info["localapiurl"] + "listAllPads?" + urlencode(data)
        padids = await agetjson(session, url)
        padids = padids["data"]["padIDs"]
        padids = [x for x in padids if fnmatch(x, args.glob)]
    else:
        url = info["localapiurl"] + "listAllPads?" + urlencode(data)
        padids = await agetjson(session, url)
        padids = padids["data"]["padIDs"]
    padids.sort()
    return padids
async def handle_pad(args, padid, data, info, session):
    global skipped, saved

    raw_ext = ".raw.txt"
    if args.no_raw_ext:
        raw_ext = ""

    data["padID"] = padid
    p = padpath(padid, args.pub, args.group, args.fix_names)
    if args.folder:
        p = os.path.join(p, padid)

    metapath = p + ".meta.json"
    revisions = None
    tries = 1
    skip = False
    padurlbase = re.sub(r"api/1.2.9/$", "p/", info["apiurl"])
    meta = {}

    while True:
        try:
            if os.path.exists(metapath):
                async with await trio.open_file(metapath) as f:
                    contents = await f.read()
                    meta.update(json.loads(contents))
                url = (
                    info["localapiurl"] + "getRevisionsCount?" + urlencode(data)
                )
                response = await agetjson(session, url)
                revisions = response["data"]["revisions"]
                if meta["revisions"] == revisions and not args.force:
                    skip = True
                    reason = "No new revisions, we already have the latest local copy"
                    break

            meta["padid"] = padid
            versions = meta["versions"] = []
            versions.append(
                {"url": padurlbase + quote(padid), "type": "pad", "code": 200,}
            )

            if revisions is None:
                url = (
                    info["localapiurl"] + "getRevisionsCount?" + urlencode(data)
                )
                response = await agetjson(session, url)
                meta["revisions"] = response["data"]["revisions"]
            else:
                meta["revisions"] = revisions

            if (meta["revisions"] == 0) and (not args.zerorevs):
                skip = True
                reason = "0 revisions, this pad was never edited"
                break

            # todo: load more metadata!
            meta["group"], meta["pad"] = splitpadname(padid)
            meta["pathbase"] = p

            url = info["localapiurl"] + "getLastEdited?" + urlencode(data)
            response = await agetjson(session, url)
            meta["lastedited_raw"] = int(response["data"]["lastEdited"])
            meta["lastedited_iso"] = datetime.fromtimestamp(
                int(meta["lastedited_raw"]) / 1000
            ).isoformat()

            url = info["localapiurl"] + "listAuthorsOfPad?" + urlencode(data)
            response = await agetjson(session, url)
            meta["author_ids"] = response["data"]["authorIDs"]
            break
        except HTTPError as e:
            tries += 1
            if tries > 3:
                print(
                    "Too many failures ({0}), skipping".format(padid),
                    file=sys.stderr,
                )
                skip = True
                reason = "PANIC, couldn't download the pad contents"
                break
            else:
                await trio.sleep(1)
        except TypeError as e:
            print(
                "Type Error loading pad {0} (phantom pad?), skipping".format(
                    padid
                ),
                file=sys.stderr,
            )
            skip = True
            reason = "PANIC, couldn't download the pad contents"
            break

    # Note(decentral1se): cannot track this bug down but basically the `data`
    # and `padid` are getting out of sync and it is ending up that the same pad
    # over and over again is downloaded. This resets things in a way that it
    # works. This is a hack and one day TM I will find out how to fix it proper
    data["padID"] = padid

    if skip:
        print("[ ] {} (skipped, reason: {})".format(padid, reason))
        skipped += 1
        return

    if args.output:
        print(padid)

    if args.all or (args.meta or args.text or args.html or args.dhtml):
        try:
            path = trio.Path(os.path.split(metapath)[0])
            if not os.path.exists(path):
                await path.mkdir()
        except OSError:
            # Note(decentral1se): the path already exists
            pass

    if args.all or args.text:
        url = info["localapiurl"] + "getText?" + urlencode(data)
        text = await agetjson(session, url)
        ver = {"type": "text"}
        versions.append(ver)
        ver["code"] = text["_code"]

        if text["_code"] == 200:
            text = text["data"]["text"]

            ##########################################
            ## ENFORCE __NOPUBLISH__ MAGIC WORD
            ##########################################
            if args.nopublish in text:
                await try_deleting(
                    (
                        p + raw_ext,
                        p + ".raw.html",
                        p + ".diff.html",
                        p + ".meta.json",
                    )
                )
                print(
                    "[ ] {} (deleted, reason: explicit __NOPUBLISH__)".format(
                        padid
                    )
                )
                skipped += 1
                return False

            ##########################################
            ## ENFORCE __PUBLISH__ MAGIC WORD
            ##########################################
            if args.publish_opt_in and args.publish not in text:
                await try_deleting(
                    (
                        p + raw_ext,
                        p + ".raw.html",
                        p + ".diff.html",
                        p + ".meta.json",
                    )
                )
                print("[ ] {} (deleted, reason: publish opt-out)".format(padid))
                skipped += 1
                return False

            ver["path"] = p + raw_ext
            ver["url"] = quote(ver["path"])

            async with await trio.open_file(ver["path"], "w") as f:
                try:
                    # Note(decentral1se): unicode handling...
                    safe_text = text.encode("utf-8", "replace").decode()
                    await f.write(safe_text)
                except Exception as exception:
                    print("PANIC: {}".format(exception))

            # once the content is settled, compute a hash
            # and link it in the metadata!

    links = []
    if args.css:
        links.append({"href": args.css, "rel": "stylesheet"})
    # todo, make this process reflect which files actually were made
    versionbaseurl = quote(padid)
    links.append(
        {
            "href": versions[0]["url"],
            "rel": "alternate",
            "type": "text/html",
            "title": "Etherpad",
        }
    )
    if args.all or args.text:
        links.append(
            {
                "href": versionbaseurl + raw_ext,
                "rel": "alternate",
                "type": "text/plain",
                "title": "Plain text",
            }
        )
    if args.all or args.html:
        links.append(
            {
                "href": versionbaseurl + ".raw.html",
                "rel": "alternate",
                "type": "text/html",
                "title": "HTML",
            }
        )
    if args.all or args.dhtml:
        links.append(
            {
                "href": versionbaseurl + ".diff.html",
                "rel": "alternate",
                "type": "text/html",
                "title": "HTML with author colors",
            }
        )
    if args.all or args.meta:
        links.append(
            {
                "href": versionbaseurl + ".meta.json",
                "rel": "alternate",
                "type": "application/json",
                "title": "Meta data",
            }
        )

    if args.all or args.dhtml:
        data["startRev"] = "0"
        url = info["localapiurl"] + "createDiffHTML?" + urlencode(data)
        html = await agetjson(session, url)
        ver = {"type": "diffhtml"}
        versions.append(ver)
        ver["code"] = html["_code"]

        if html["_code"] == 200:
            try:
                html = html["data"]["html"]
                ver["path"] = p + ".diff.html"
                ver["url"] = quote(ver["path"])

                doc = html5lib.parse(
                    html, treebuilder="etree", namespaceHTMLElements=False
                )
                html5tidy(
                    doc,
                    indent=True,
                    title=padid,
                    scripts=args.script,
                    links=links,
                )
                async with await trio.open_file(ver["path"], "w") as f:
                    output = ET.tostring(doc, method="html", encoding="unicode")
                    await f.write(output)
            except TypeError:
                ver["message"] = html["message"]

    # Process text, html, dhtml, all options
    if args.all or args.html:
        # mb: line causing the error of not writing the correct HTML content to the correct HTML file:
        # url = info["localapiurl"] + "getHTML?" + urlencode(data)
        # mb: warning, HACK! Catching the error by writing the API request url manually ...
        url = info["localapiurl"] + "getHTML?" + "padID=" + padid + "&" + 'apikey=' + data["apikey"] + '&startRev=0'
        # print(url)
        html = await agetjson(session, url)
        ver = {"type": "html"}
        versions.append(ver)

        # mb: warning, HACK! Added a Try and Except here, as it sometimes bumped into an error, stopping the pull.
        try:
            ver["code"] = html["_code"]
            if html["_code"] == 200:
                try:
                    html = html["data"]["html"]
                    ver["path"] = p + ".raw.html"
                    ver["url"] = quote(ver["path"])

                    doc = html5lib.parse(
                        html, treebuilder="etree", namespaceHTMLElements=False
                    )
                    html5tidy(
                        doc, indent=True, title=padid, scripts=args.script, links=links,
                    )
                    async with await trio.open_file(ver["path"], "w") as f:
                        output = ET.tostring(doc, method="html", encoding="unicode")
                        await f.write(output)
                except TypeError:
                    ver["message"] = html["message"]
        except Exception as exception:
            print("PANIC: {}".format(exception))

    # output meta
    if args.all or args.meta:
        ver = {"type": "meta"}
        versions.append(ver)
        ver["path"] = metapath
        ver["url"] = quote(metapath)

        async with await trio.open_file(metapath, "w") as f:
            await f.write(json.dumps(meta))

    print("[x] {} (saved)".format(padid))
    saved += 1
    return
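The mb: hack above sidesteps urlencode(data) because data and padid were observed to drift out of sync (see the Note(decentral1se) comment), but the hand-built query string also skips URL escaping of the pad name. A minimal sketch (an assumption, not part of the commit) of an equivalent request URL built from a fresh dict per call, so the pad id is still escaped:

from urllib.parse import urlencode

def build_gethtml_url(localapiurl, apikey, padid):
    # hypothetical helper, not part of the commit: a fresh dict per call avoids
    # the shared-`data` drift, while urlencode still escapes special characters
    # (spaces, slashes, unicode) in the pad name
    query = urlencode({"padID": padid, "apikey": apikey, "startRev": "0"})
    return localapiurl + "getHTML?" + query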
async def handle_pads(args):
    global skipped, saved

    session = asks.Session(connections=args.connection)
    info = loadpadinfo(args.padinfo)
    data = {"apikey": info["apikey"]}

    padids = await get_padids(args, info, data, session)
    if args.skip:
        padids = padids[args.skip : len(padids)]

    print("=" * 79)
    print("Etherpump is warming up the engines ...")
    print("=" * 79)

    start = time.time()
    async with trio.open_nursery() as nursery:
        for padid in padids:
            nursery.start_soon(handle_pad, args, padid, data, info, session)
    end = time.time()
    timeit = round(end - start, 2)

    print("=" * 79)
    print(
        "Processed {} :: Skipped {} :: Saved {} :: Time {}s".format(
            len(padids), skipped, saved, timeit
        )
    )
    print("=" * 79)
def main(args):
    p = build_argument_parser(args)
    args = p.parse_args(args)
    trio.run(handle_pads, args)

pyproject.toml (2 lines changed)

@@ -4,7 +4,7 @@ build-backend = "poetry.masonry.api"
 [tool.poetry]
 name = "etherpump"
-version = "0.0.17"
+version = "0.0.18"
 description = "Pumping text from etherpads into publications"
 authors = ["Varia, Center for Everyday Technology"]
 maintainers = ["Varia, Center for Everyday Technology <info@varia.zone>"]
