diff --git a/etherpump/api/__init__.py b/etherpump/api/__init__.py
index 8bcca11..262d0fe 100644
--- a/etherpump/api/__init__.py
+++ b/etherpump/api/__init__.py
@@ -11,7 +11,7 @@ from etherpump.commands.gethtml import main as gethtml # noqa
from etherpump.commands.gettext import main as gettext # noqa
from etherpump.commands.index import main as index # noqa
from etherpump.commands.init import main # noqa
-from etherpump.commands.init import main as init # noqa
+from etherpump.commands.init import main as init # noqa
from etherpump.commands.list import main as list # noqa
from etherpump.commands.listauthors import main as listauthors # noqa
from etherpump.commands.publication import main as publication # noqa
diff --git a/etherpump/commands/pull.py b/etherpump/commands/pull.py
index d1e571f..adbd32a 100644
--- a/etherpump/commands/pull.py
+++ b/etherpump/commands/pull.py
@@ -31,549 +31,578 @@ skipped, saved = 0, 0
async def try_deleting(files):
- for f in files:
- try:
- path = trio.Path(f)
- if os.path.exists(path):
- await path.rmdir()
- except Exception as exception:
- print("PANIC: {}".format(exception))
+ for f in files:
+ try:
+ path = trio.Path(f)
+ if os.path.exists(path):
+ await path.rmdir()
+ except Exception as exception:
+ print("PANIC: {}".format(exception))
def build_argument_parser(args):
- parser = ArgumentParser(
- "Check for pads that have changed since last sync (according to .meta.json)"
- )
- parser.add_argument("padid", nargs="*", default=[])
- parser.add_argument(
- "--glob", default=False, help="download pads matching a glob pattern"
- )
- parser.add_argument(
- "--padinfo",
- default=".etherpump/settings.json",
- help="settings, default: .etherpump/settings.json",
- )
- parser.add_argument(
- "--zerorevs",
- default=False,
- action="store_true",
- help="include pads with zero revisions, default: False (i.e. pads with no revisions are skipped)",
- )
- parser.add_argument(
- "--pub",
- default="p",
- help="folder to store files for public pads, default: p",
- )
- parser.add_argument(
- "--group",
- default="g",
- help="folder to store files for group pads, default: g",
- )
- parser.add_argument(
- "--skip",
- default=None,
- type=int,
- help="skip this many items, default: None",
- )
- parser.add_argument(
- "--connection",
- default=200,
- type=int,
- help="number of connections to run concurrently",
- )
- parser.add_argument(
- "--meta",
- default=False,
- action="store_true",
- help="download meta to PADID.meta.json, default: False",
- )
- parser.add_argument(
- "--text",
- default=False,
- action="store_true",
- help="download text to PADID.txt, default: False",
- )
- parser.add_argument(
- "--html",
- default=False,
- action="store_true",
- help="download html to PADID.html, default: False",
- )
- parser.add_argument(
- "--dhtml",
- default=False,
- action="store_true",
- help="download dhtml to PADID.diff.html, default: False",
- )
- parser.add_argument(
- "--all",
- default=False,
- action="store_true",
- help="download all files (meta, text, html, dhtml), default: False",
- )
- parser.add_argument(
- "--folder",
- default=False,
- action="store_true",
- help="dump files in a folder named PADID (meta, text, html, dhtml), default: False",
- )
- parser.add_argument(
- "--output",
- default=False,
- action="store_true",
- help="output changed padids on stdout",
- )
- parser.add_argument(
- "--force",
- default=False,
- action="store_true",
- help="reload, even if revisions count matches previous",
- )
- parser.add_argument(
- "--no-raw-ext",
- default=False,
- action="store_true",
- help="save plain text as padname with no (additional) extension",
- )
- parser.add_argument(
- "--fix-names",
- default=False,
- action="store_true",
- help="normalize padid's (no spaces, special control chars) for use in file names",
- )
- parser.add_argument(
- "--filter-ext", default=None, help="filter pads by extension"
- )
- parser.add_argument(
- "--css",
- default="/styles.css",
- help="add css url to output pages, default: /styles.css",
- )
- parser.add_argument(
- "--script",
- default="/versions.js",
- help="add script url to output pages, default: /versions.js",
- )
- parser.add_argument(
- "--nopublish",
- default="__NOPUBLISH__",
- help="no publish magic word, default: __NOPUBLISH__",
- )
- parser.add_argument(
- "--publish",
- default="__PUBLISH__",
- help="the publish magic word, default: __PUBLISH__",
- )
- parser.add_argument(
- "--publish-opt-in",
- default=False,
- action="store_true",
- help="ensure `--publish` is honoured instead of `--nopublish`",
- )
- parser.add_argument(
- "--magicwords",
- default=False,
- action="store_true",
- help="store all magic words used in a page in the meta.json file",
- )
- return parser
+ parser = ArgumentParser(
+ "Check for pads that have changed since last sync (according to .meta.json)"
+ )
+ parser.add_argument("padid", nargs="*", default=[])
+ parser.add_argument(
+ "--glob", default=False, help="download pads matching a glob pattern"
+ )
+ parser.add_argument(
+ "--padinfo",
+ default=".etherpump/settings.json",
+ help="settings, default: .etherpump/settings.json",
+ )
+ parser.add_argument(
+ "--zerorevs",
+ default=False,
+ action="store_true",
+ help="include pads with zero revisions, default: False (i.e. pads with no revisions are skipped)",
+ )
+ parser.add_argument(
+ "--pub",
+ default="p",
+ help="folder to store files for public pads, default: p",
+ )
+ parser.add_argument(
+ "--group",
+ default="g",
+ help="folder to store files for group pads, default: g",
+ )
+ parser.add_argument(
+ "--skip",
+ default=None,
+ type=int,
+ help="skip this many items, default: None",
+ )
+ parser.add_argument(
+ "--connection",
+ default=200,
+ type=int,
+ help="number of connections to run concurrently",
+ )
+ parser.add_argument(
+ "--meta",
+ default=False,
+ action="store_true",
+ help="download meta to PADID.meta.json, default: False",
+ )
+ parser.add_argument(
+ "--text",
+ default=False,
+ action="store_true",
+ help="download text to PADID.txt, default: False",
+ )
+ parser.add_argument(
+ "--html",
+ default=False,
+ action="store_true",
+ help="download html to PADID.html, default: False",
+ )
+ parser.add_argument(
+ "--dhtml",
+ default=False,
+ action="store_true",
+ help="download dhtml to PADID.diff.html, default: False",
+ )
+ parser.add_argument(
+ "--all",
+ default=False,
+ action="store_true",
+ help="download all files (meta, text, html, dhtml), default: False",
+ )
+ parser.add_argument(
+ "--folder",
+ default=False,
+ action="store_true",
+ help="dump files in a folder named PADID (meta, text, html, dhtml), default: False",
+ )
+ parser.add_argument(
+ "--output",
+ default=False,
+ action="store_true",
+ help="output changed padids on stdout",
+ )
+ parser.add_argument(
+ "--force",
+ default=False,
+ action="store_true",
+ help="reload, even if revisions count matches previous",
+ )
+ parser.add_argument(
+ "--no-raw-ext",
+ default=False,
+ action="store_true",
+ help="save plain text as padname with no (additional) extension",
+ )
+ parser.add_argument(
+ "--fix-names",
+ default=False,
+ action="store_true",
+ help="normalize padid's (no spaces, special control chars) for use in file names",
+ )
+ parser.add_argument(
+ "--filter-ext", default=None, help="filter pads by extension"
+ )
+ parser.add_argument(
+ "--css",
+ default="/styles.css",
+ help="add css url to output pages, default: /styles.css",
+ )
+ parser.add_argument(
+ "--script",
+ default="/versions.js",
+ help="add script url to output pages, default: /versions.js",
+ )
+ parser.add_argument(
+ "--nopublish",
+ default="__NOPUBLISH__",
+ help="no publish magic word, default: __NOPUBLISH__",
+ )
+ parser.add_argument(
+ "--publish",
+ default="__PUBLISH__",
+ help="the publish magic word, default: __PUBLISH__",
+ )
+ parser.add_argument(
+ "--publish-opt-in",
+ default=False,
+ action="store_true",
+ help="ensure `--publish` is honoured instead of `--nopublish`",
+ )
+ parser.add_argument(
+ "--magicwords",
+ default=False,
+ action="store_true",
+ help="store all magic words used in a page in the meta.json file",
+ )
+ return parser
async def get_padids(args, info, data, session):
- if args.padid:
- padids = args.padid
- elif args.glob:
- url = info["localapiurl"] + "listAllPads?" + urlencode(data)
- padids = await agetjson(session, url)
- padids = padids["data"]["padIDs"]
- padids = [x for x in padids if fnmatch(x, args.glob)]
- else:
- url = info["localapiurl"] + "listAllPads?" + urlencode(data)
- padids = await agetjson(session, url)
- padids = padids["data"]["padIDs"]
-
- padids.sort()
- return padids
+ if args.padid:
+ padids = args.padid
+ elif args.glob:
+ url = info["localapiurl"] + "listAllPads?" + urlencode(data)
+ padids = await agetjson(session, url)
+ padids = padids["data"]["padIDs"]
+ padids = [x for x in padids if fnmatch(x, args.glob)]
+ else:
+ url = info["localapiurl"] + "listAllPads?" + urlencode(data)
+ padids = await agetjson(session, url)
+ padids = padids["data"]["padIDs"]
+
+ padids.sort()
+ return padids
async def handle_pad(args, padid, data, info, session):
- global skipped, saved
-
- raw_ext = ".raw.txt"
- if args.no_raw_ext:
- raw_ext = ""
-
- data["padID"] = padid
- p = padpath(padid, args.pub, args.group, args.fix_names)
- if args.folder:
- p = os.path.join(p, padid)
-
- metapath = p + ".meta.json"
- revisions = None
- tries = 1
- skip = False
- padurlbase = re.sub(r"api/1.2.9/$", "p/", info["apiurl"])
- meta = {}
-
- while True:
- try:
- if os.path.exists(metapath):
- async with await trio.open_file(metapath) as f:
- contents = await f.read()
- meta.update(json.loads(contents))
- url = (
- info["localapiurl"] + "getRevisionsCount?" + urlencode(data)
- )
- response = await agetjson(session, url)
- revisions = response["data"]["revisions"]
- if meta["revisions"] == revisions and not args.force:
- skip = True
- reason = "No new revisions, we already have the latest local copy"
- break
-
- meta["padid"] = padid
- versions = meta["versions"] = []
- versions.append(
- {"url": padurlbase + quote(padid), "type": "pad", "code": 200,}
- )
-
- if revisions is None:
- url = (
- info["localapiurl"] + "getRevisionsCount?" + urlencode(data)
- )
- response = await agetjson(session, url)
- meta["revisions"] = response["data"]["revisions"]
- else:
- meta["revisions"] = revisions
-
- if (meta["revisions"] == 0) and (not args.zerorevs):
- skip = True
- reason = "0 revisions, this pad was never edited"
- break
-
- # todo: load more metadata!
- meta["group"], meta["pad"] = splitpadname(padid)
- meta["pathbase"] = p
-
- url = info["localapiurl"] + "getLastEdited?" + urlencode(data)
- response = await agetjson(session, url)
- meta["lastedited_raw"] = int(response["data"]["lastEdited"])
-
- meta["lastedited_iso"] = datetime.fromtimestamp(
- int(meta["lastedited_raw"]) / 1000
- ).isoformat()
-
- url = info["localapiurl"] + "listAuthorsOfPad?" + urlencode(data)
- response = await agetjson(session, url)
- meta["author_ids"] = response["data"]["authorIDs"]
-
- break
- except HTTPError as e:
- tries += 1
- if tries > 3:
- print(
- "Too many failures ({0}), skipping".format(padid),
- file=sys.stderr,
- )
- skip = True
- reason = "PANIC, couldn't download the pad contents"
- break
- else:
- await trio.sleep(1)
- except TypeError as e:
- print(
- "Type Error loading pad {0} (phantom pad?), skipping".format(
- padid
- ),
- file=sys.stderr,
- )
- skip = True
- reason = "PANIC, couldn't download the pad contents"
- break
-
- # Note(decentral1se): cannot track this bug down but basically the `data`
- # and `padid` are getting out of sync and it is ending up that the same pad
- # over and over again is downloaded. This resets things in a way that it
- # works. This is a hack and one day TM I will find out how to fix it proper
- data["padID"] = padid
-
- if skip:
- print("[ ] {} (skipped, reason: {})".format(padid, reason))
- skipped += 1
- return
-
- if args.output:
- print(padid)
-
- if args.all or (args.meta or args.text or args.html or args.dhtml):
- try:
- path = trio.Path(os.path.split(metapath)[0])
- if not os.path.exists(path):
- await path.mkdir()
- except OSError:
- # Note(decentral1se): the path already exists
- pass
-
- if args.all or args.text:
- url = info["localapiurl"] + "getText?" + urlencode(data)
- text = await agetjson(session, url)
- ver = {"type": "text"}
- versions.append(ver)
- ver["code"] = text["_code"]
-
- if text["_code"] == 200:
- text = text["data"]["text"]
-
- ##########################################
- ## ENFORCE __NOPUBLISH__ MAGIC WORD
- ##########################################
- if args.nopublish in text:
- await try_deleting(
- (
- p + raw_ext,
- p + ".raw.html",
- p + ".diff.html",
- p + ".meta.json",
- )
- )
- print(
- "[ ] {} (deleted, reason: explicit __NOPUBLISH__)".format(
- padid
- )
- )
- skipped += 1
- return False
-
- ##########################################
- ## ENFORCE __PUBLISH__ MAGIC WORD
- ##########################################
- if args.publish_opt_in and args.publish not in text:
- await try_deleting(
- (
- p + raw_ext,
- p + ".raw.html",
- p + ".diff.html",
- p + ".meta.json",
- )
- )
- print("[ ] {} (deleted, reason: publish opt-out)".format(padid))
- skipped += 1
- return False
-
- ver["path"] = p + raw_ext
- ver["url"] = quote(ver["path"])
- async with await trio.open_file(ver["path"], "w") as f:
- try:
- # Note(decentral1se): unicode handling...
- safe_text = text.encode("utf-8", "replace").decode()
- await f.write(safe_text)
- except Exception as exception:
- print("PANIC: {}".format(exception))
-
- # once the content is settled, compute a hash
- # and link it in the metadata!
-
-
- # include magic words
- if args.magicwords:
-
- ##########################################
- ## INCLUDE __XXX__ MAGIC WORDS
- ##########################################
- pattern = r'__[a-zA-Z0-9]+?__'
- magic_words = re.findall(pattern, text)
- magic_words = list(set(magic_words))
- if magic_words:
- meta["magicwords"] = magic_words
- print('FOUND MAGIC WORD(s): {} in {}'.format(magic_words, padid))
-
- links = []
- if args.css:
- links.append({"href": args.css, "rel": "stylesheet"})
- # todo, make this process reflect which files actually were made
- versionbaseurl = quote(padid)
- links.append(
- {
- "href": versions[0]["url"],
- "rel": "alternate",
- "type": "text/html",
- "title": "Etherpad",
- }
- )
- if args.all or args.text:
- links.append(
- {
- "href": versionbaseurl + raw_ext,
- "rel": "alternate",
- "type": "text/plain",
- "title": "Plain text",
- }
- )
- if args.all or args.html:
- links.append(
- {
- "href": versionbaseurl + ".raw.html",
- "rel": "alternate",
- "type": "text/html",
- "title": "HTML",
- }
- )
- if args.all or args.dhtml:
- links.append(
- {
- "href": versionbaseurl + ".diff.html",
- "rel": "alternate",
- "type": "text/html",
- "title": "HTML with author colors",
- }
- )
- if args.all or args.meta:
- links.append(
- {
- "href": versionbaseurl + ".meta.json",
- "rel": "alternate",
- "type": "application/json",
- "title": "Meta data",
- }
- )
-
- if args.all or args.dhtml:
- data["startRev"] = "0"
- url = info["localapiurl"] + "createDiffHTML?" + urlencode(data)
- html = await agetjson(session, url)
- ver = {"type": "diffhtml"}
- versions.append(ver)
- ver["code"] = html["_code"]
- if html["_code"] == 200:
- try:
- html = html["data"]["html"]
- ver["path"] = p + ".diff.html"
- ver["url"] = quote(ver["path"])
- doc = html5lib.parse(
- html, treebuilder="etree", namespaceHTMLElements=False
- )
- html5tidy(
- doc,
- indent=True,
- title=padid,
- scripts=args.script,
- links=links,
- )
- async with await trio.open_file(ver["path"], "w") as f:
- output = ET.tostring(doc, method="html", encoding="unicode")
- await f.write(output)
- except TypeError:
- ver["message"] = html["message"]
-
- # Process text, html, dhtml, all options
- if args.all or args.html:
- # mb: line causing the error of not writing the correct HTML content to the correct HTML file:
- # url = info["localapiurl"] + "getHTML?" + urlencode(data)
- # mb: warning, HACK! Catching the error by writing the API request url manually ...
- url = info["localapiurl"] + "getHTML?" + "padID=" + padid + "&" + 'apikey=' + data["apikey"]
- # print(url)
- html = await agetjson(session, url)
- ver = {"type": "html"}
- versions.append(ver)
- # mb: warning, HACK! Added a Try and Except here, as it sometimes bumped into an error, stopping the pull.
- # try:
- ver["code"] = html["_code"]
- if html["_code"] == 200:
- try:
- html = html["data"]["html"]
- ver["path"] = p + ".raw.html"
- ver["url"] = quote(ver["path"])
- doc = html5lib.parse(
- html, treebuilder="etree", namespaceHTMLElements=False
- )
- html5tidy(
- doc, indent=True, title=padid, scripts=args.script, links=links,
- )
- async with await trio.open_file(ver["path"], "w") as f:
- output = ET.tostring(doc, method="html", encoding="unicode")
- await f.write(output)
- except TypeError:
- ver["message"] = html["message"]
- # except Exception as exception:
- # print("PANIC: {}".format(exception))
-
- if args.all or args.magicwords:
- url = info["localapiurl"] + "getHTML?" + "padID=" + padid + "&" + 'apikey=' + data["apikey"]
- # print(url)
- html = await agetjson(session, url)
- ver = {"type": "magicwords"}
- versions.append(ver)
- # mb: warning, HACK! Added a Try and Except here, as it sometimes bumped into an error, stopping the pull.
- # try:
- ver["code"] = html["_code"]
- if html["_code"] == 200:
- try:
- html = html["data"]["html"]
- ver["path"] = p + ".magicwords.html"
- ver["url"] = quote(ver["path"])
- for magic_word in magic_words:
- replace_word = ""+magic_word+""
- if magic_word in html:
- html = html.replace(magic_word,replace_word)
- doc = html5lib.parse(
- html, treebuilder="etree", namespaceHTMLElements=False
- )
- # INSERT MAGIC WORDS HIGHLIGHTING STUFF HERE!!!
- html5tidy(
- doc, indent=True, title=padid, scripts=args.script, links=links,
- )
- async with await trio.open_file(ver["path"], "w") as f:
- output = ET.tostring(doc, method="html", encoding="unicode")
- await f.write(output)
- except TypeError:
- ver["message"] = html["message"]
-
- # output meta
- if args.all or args.meta:
- ver = {"type": "meta"}
- versions.append(ver)
- ver["path"] = metapath
- ver["url"] = quote(metapath)
- async with await trio.open_file(metapath, "w") as f:
- await f.write(json.dumps(meta))
-
- print("[x] {} (saved)".format(padid))
- saved += 1
- return
+ global skipped, saved
+
+ raw_ext = ".raw.txt"
+ if args.no_raw_ext:
+ raw_ext = ""
+
+ data["padID"] = padid
+ p = padpath(padid, args.pub, args.group, args.fix_names)
+ if args.folder:
+ p = os.path.join(p, padid)
+
+ metapath = p + ".meta.json"
+ revisions = None
+ tries = 1
+ skip = False
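+    # derive the public pad URL base (".../p/") from the configured API URL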
+ padurlbase = re.sub(r"api/1.2.9/$", "p/", info["apiurl"])
+    meta = {}
+    # defaults: magic_words and links are referenced further down even when
+    # the --text code path that normally fills them does not run
+    magic_words = []
+    links = []
+
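+    # metadata fetch loop: retry on HTTP errors and give up on this pad
+    # after three failed attempts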
+ while True:
+ try:
+ if os.path.exists(metapath):
+ async with await trio.open_file(metapath) as f:
+ contents = await f.read()
+ meta.update(json.loads(contents))
+ url = (
+ info["localapiurl"] + "getRevisionsCount?" + urlencode(data)
+ )
+ response = await agetjson(session, url)
+ revisions = response["data"]["revisions"]
+ if meta["revisions"] == revisions and not args.force:
+ skip = True
+ reason = "No new revisions, we already have the latest local copy"
+ break
+
+ meta["padid"] = padid
+ versions = meta["versions"] = []
+ versions.append(
+ {"url": padurlbase + quote(padid), "type": "pad", "code": 200,}
+ )
+
+ if revisions is None:
+ url = (
+ info["localapiurl"] + "getRevisionsCount?" + urlencode(data)
+ )
+ response = await agetjson(session, url)
+ meta["revisions"] = response["data"]["revisions"]
+ else:
+ meta["revisions"] = revisions
+
+ if (meta["revisions"] == 0) and (not args.zerorevs):
+ skip = True
+ reason = "0 revisions, this pad was never edited"
+ break
+
+ # todo: load more metadata!
+ meta["group"], meta["pad"] = splitpadname(padid)
+ meta["pathbase"] = p
+
+ url = info["localapiurl"] + "getLastEdited?" + urlencode(data)
+ response = await agetjson(session, url)
+ meta["lastedited_raw"] = int(response["data"]["lastEdited"])
+
+ meta["lastedited_iso"] = datetime.fromtimestamp(
+ int(meta["lastedited_raw"]) / 1000
+ ).isoformat()
+
+ url = info["localapiurl"] + "listAuthorsOfPad?" + urlencode(data)
+ response = await agetjson(session, url)
+ meta["author_ids"] = response["data"]["authorIDs"]
+
+ break
+ except HTTPError as e:
+ tries += 1
+ if tries > 3:
+ print(
+ "Too many failures ({0}), skipping".format(padid),
+ file=sys.stderr,
+ )
+ skip = True
+ reason = "PANIC, couldn't download the pad contents"
+ break
+ else:
+ await trio.sleep(1)
+ except TypeError as e:
+ print(
+ "Type Error loading pad {0} (phantom pad?), skipping".format(
+ padid
+ ),
+ file=sys.stderr,
+ )
+ skip = True
+ reason = "PANIC, couldn't download the pad contents"
+ break
+
+ # Note(decentral1se): cannot track this bug down but basically the `data`
+ # and `padid` are getting out of sync and it is ending up that the same pad
+ # over and over again is downloaded. This resets things in a way that it
+ # works. This is a hack and one day TM I will find out how to fix it proper
+ data["padID"] = padid
+
+ if skip:
+ print("[ ] {} (skipped, reason: {})".format(padid, reason))
+ skipped += 1
+ return
+
+ if args.output:
+ print(padid)
+
+ if args.all or (args.meta or args.text or args.html or args.dhtml):
+ try:
+ path = trio.Path(os.path.split(metapath)[0])
+ if not os.path.exists(path):
+ await path.mkdir()
+ except OSError:
+ # Note(decentral1se): the path already exists
+ pass
+
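+    # the plain text version is fetched first so the __NOPUBLISH__ /
+    # __PUBLISH__ magic words can be enforced before anything else is written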
+ if args.all or args.text:
+ url = info["localapiurl"] + "getText?" + urlencode(data)
+ text = await agetjson(session, url)
+ ver = {"type": "text"}
+ versions.append(ver)
+ ver["code"] = text["_code"]
+
+ if text["_code"] == 200:
+ text = text["data"]["text"]
+
+ ##########################################
+ ## ENFORCE __NOPUBLISH__ MAGIC WORD
+ ##########################################
+ if args.nopublish in text:
+ await try_deleting(
+ (
+ p + raw_ext,
+ p + ".raw.html",
+ p + ".diff.html",
+ p + ".meta.json",
+ )
+ )
+ print(
+ "[ ] {} (deleted, reason: explicit __NOPUBLISH__)".format(
+ padid
+ )
+ )
+ skipped += 1
+ return False
+
+ ##########################################
+ ## ENFORCE __PUBLISH__ MAGIC WORD
+ ##########################################
+ if args.publish_opt_in and args.publish not in text:
+ await try_deleting(
+ (
+ p + raw_ext,
+ p + ".raw.html",
+ p + ".diff.html",
+ p + ".meta.json",
+ )
+ )
+ print("[ ] {} (deleted, reason: publish opt-out)".format(padid))
+ skipped += 1
+ return False
+
+ ver["path"] = p + raw_ext
+ ver["url"] = quote(ver["path"])
+ async with await trio.open_file(ver["path"], "w") as f:
+ try:
+ # Note(decentral1se): unicode handling...
+ safe_text = text.encode("utf-8", "replace").decode()
+ await f.write(safe_text)
+ except Exception as exception:
+ print("PANIC: {}".format(exception))
+
+ # once the content is settled, compute a hash
+ # and link it in the metadata!
+
+ # include magic words
+ if args.magicwords:
+
+ ##########################################
+ ## INCLUDE __XXX__ MAGIC WORDS
+ ##########################################
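+            # non-greedy match for words wrapped in double underscores,
+            # e.g. __PUBLISH__ or __NOPUBLISH__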
+ pattern = r"__[a-zA-Z0-9]+?__"
+ magic_words = re.findall(pattern, text)
+ magic_words = list(set(magic_words))
+ if magic_words:
+ meta["magicwords"] = magic_words
+ print(
+ "FOUND MAGIC WORD(s): {} in {}".format(
+ magic_words, padid
+ )
+ )
+
+ links = []
+ if args.css:
+ links.append({"href": args.css, "rel": "stylesheet"})
+ # todo, make this process reflect which files actually were made
+ versionbaseurl = quote(padid)
+ links.append(
+ {
+ "href": versions[0]["url"],
+ "rel": "alternate",
+ "type": "text/html",
+ "title": "Etherpad",
+ }
+ )
+ if args.all or args.text:
+ links.append(
+ {
+ "href": versionbaseurl + raw_ext,
+ "rel": "alternate",
+ "type": "text/plain",
+ "title": "Plain text",
+ }
+ )
+ if args.all or args.html:
+ links.append(
+ {
+ "href": versionbaseurl + ".raw.html",
+ "rel": "alternate",
+ "type": "text/html",
+ "title": "HTML",
+ }
+ )
+ if args.all or args.dhtml:
+ links.append(
+ {
+ "href": versionbaseurl + ".diff.html",
+ "rel": "alternate",
+ "type": "text/html",
+ "title": "HTML with author colors",
+ }
+ )
+ if args.all or args.meta:
+ links.append(
+ {
+ "href": versionbaseurl + ".meta.json",
+ "rel": "alternate",
+ "type": "application/json",
+ "title": "Meta data",
+ }
+ )
+
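+    # fetch the author-colored diff HTML (createDiffHTML) and tidy it before
+    # writing PADID.diff.html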
+ if args.all or args.dhtml:
+ data["startRev"] = "0"
+ url = info["localapiurl"] + "createDiffHTML?" + urlencode(data)
+ html = await agetjson(session, url)
+ ver = {"type": "diffhtml"}
+ versions.append(ver)
+ ver["code"] = html["_code"]
+ if html["_code"] == 200:
+ try:
+ html = html["data"]["html"]
+ ver["path"] = p + ".diff.html"
+ ver["url"] = quote(ver["path"])
+ doc = html5lib.parse(
+ html, treebuilder="etree", namespaceHTMLElements=False
+ )
+ html5tidy(
+ doc,
+ indent=True,
+ title=padid,
+ scripts=args.script,
+ links=links,
+ )
+ async with await trio.open_file(ver["path"], "w") as f:
+ output = ET.tostring(doc, method="html", encoding="unicode")
+ await f.write(output)
+ except TypeError:
+ ver["message"] = html["message"]
+
+ # Process text, html, dhtml, all options
+ if args.all or args.html:
+        # mb: this line caused the wrong HTML content to be written to the wrong HTML file:
+        # url = info["localapiurl"] + "getHTML?" + urlencode(data)
+        # mb: warning, HACK! Work around it by building the API request URL manually ...
+ url = (
+ info["localapiurl"]
+ + "getHTML?"
+ + "padID="
+ + padid
+ + "&"
+ + "apikey="
+ + data["apikey"]
+ )
+ # print(url)
+ html = await agetjson(session, url)
+ ver = {"type": "html"}
+ versions.append(ver)
+ # mb: warning, HACK! Added a Try and Except here, as it sometimes bumped into an error, stopping the pull.
+ # try:
+ ver["code"] = html["_code"]
+ if html["_code"] == 200:
+ try:
+ html = html["data"]["html"]
+ ver["path"] = p + ".raw.html"
+ ver["url"] = quote(ver["path"])
+ doc = html5lib.parse(
+ html, treebuilder="etree", namespaceHTMLElements=False
+ )
+ html5tidy(
+ doc,
+ indent=True,
+ title=padid,
+ scripts=args.script,
+ links=links,
+ )
+ async with await trio.open_file(ver["path"], "w") as f:
+ output = ET.tostring(doc, method="html", encoding="unicode")
+ await f.write(output)
+ except TypeError:
+ ver["message"] = html["message"]
+ # except Exception as exception:
+ # print("PANIC: {}".format(exception))
+
+ if args.all or args.magicwords:
+ url = (
+ info["localapiurl"]
+ + "getHTML?"
+ + "padID="
+ + padid
+ + "&"
+ + "apikey="
+ + data["apikey"]
+ )
+ # print(url)
+ html = await agetjson(session, url)
+ ver = {"type": "magicwords"}
+ versions.append(ver)
+ # mb: warning, HACK! Added a Try and Except here, as it sometimes bumped into an error, stopping the pull.
+ # try:
+ ver["code"] = html["_code"]
+ if html["_code"] == 200:
+ try:
+ html = html["data"]["html"]
+ ver["path"] = p + ".magicwords.html"
+ ver["url"] = quote(ver["path"])
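+                # the strings wrapped around each magic word are empty, so
+                # this replace is currently a no-op (highlighting markup
+                # would go here)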
+ for magic_word in magic_words:
+ replace_word = (
+ "" + magic_word + ""
+ )
+ if magic_word in html:
+ html = html.replace(magic_word, replace_word)
+ doc = html5lib.parse(
+ html, treebuilder="etree", namespaceHTMLElements=False
+ )
+ # INSERT MAGIC WORDS HIGHLIGHTING STUFF HERE!!!
+ html5tidy(
+ doc,
+ indent=True,
+ title=padid,
+ scripts=args.script,
+ links=links,
+ )
+ async with await trio.open_file(ver["path"], "w") as f:
+ output = ET.tostring(doc, method="html", encoding="unicode")
+ await f.write(output)
+ except TypeError:
+ ver["message"] = html["message"]
+
+ # output meta
+ if args.all or args.meta:
+ ver = {"type": "meta"}
+ versions.append(ver)
+ ver["path"] = metapath
+ ver["url"] = quote(metapath)
+ async with await trio.open_file(metapath, "w") as f:
+ await f.write(json.dumps(meta))
+
+ print("[x] {} (saved)".format(padid))
+ saved += 1
+ return
async def handle_pads(args):
- global skipped, saved
+ global skipped, saved
- session = asks.Session(connections=args.connection)
- info = loadpadinfo(args.padinfo)
- data = {"apikey": info["apikey"]}
+ session = asks.Session(connections=args.connection)
+ info = loadpadinfo(args.padinfo)
+ data = {"apikey": info["apikey"]}
- padids = await get_padids(args, info, data, session)
- if args.skip:
- padids = padids[args.skip : len(padids)]
+ padids = await get_padids(args, info, data, session)
+ if args.skip:
+ padids = padids[args.skip : len(padids)]
- print("=" * 79)
- print("Etherpump is warming up the engines ...")
- print("=" * 79)
+ print("=" * 79)
+ print("Etherpump is warming up the engines ...")
+ print("=" * 79)
- start = time.time()
- async with trio.open_nursery() as nursery:
- for padid in padids:
- nursery.start_soon(handle_pad, args, padid, data, info, session)
- end = time.time()
- timeit = round(end - start, 2)
+ start = time.time()
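+    # one task per pad; the asks session caps concurrent HTTP connections at
+    # the --connection limit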
+ async with trio.open_nursery() as nursery:
+ for padid in padids:
+ nursery.start_soon(handle_pad, args, padid, data, info, session)
+ end = time.time()
+ timeit = round(end - start, 2)
- print("=" * 79)
- print(
- "Processed {} :: Skipped {} :: Saved {} :: Time {}s".format(
- len(padids), skipped, saved, timeit
- )
- )
- print("=" * 79)
+ print("=" * 79)
+ print(
+ "Processed {} :: Skipped {} :: Saved {} :: Time {}s".format(
+ len(padids), skipped, saved, timeit
+ )
+ )
+ print("=" * 79)
def main(args):
- p = build_argument_parser(args)
- args = p.parse_args(args)
- trio.run(handle_pads, args)
+ p = build_argument_parser(args)
+ args = p.parse_args(args)
+ trio.run(handle_pads, args)