From 923cc11beb68dc75904cd20df47850a57d06f7bb Mon Sep 17 00:00:00 2001 From: manetta Date: Fri, 4 Dec 2020 14:22:23 +0100 Subject: [PATCH] work in progress around the magic words --- etherpump/commands/pull.py | 73 ++++++++++++++------------------------ 1 file changed, 26 insertions(+), 47 deletions(-) diff --git a/etherpump/commands/pull.py b/etherpump/commands/pull.py index ad44e66..98fce0e 100644 --- a/etherpump/commands/pull.py +++ b/etherpump/commands/pull.py @@ -173,6 +173,7 @@ def build_argument_parser(args): parser.add_argument( "--magic-words", default=False, + action="store_true", help="store all magic words used in a page in the meta.json file", ) return parser @@ -379,32 +380,10 @@ async def handle_pad(args, padid, data, info, session): ########################################## ## INCLUDE __XXX__ MAGIC WORDS ########################################## - pattern = r'[__\w+?__]' - magic_words = re.match(pattern, string) - magic_words = magic_words.groups() - print(magic_words) - if args.publish_opt_in and args.publish not in text: - await try_deleting( - ( - p + raw_ext, - p + ".raw.html", - p + ".diff.html", - p + ".meta.json", - ) - ) - print("[ ] {} (deleted, reason: publish opt-out)".format(padid)) - skipped += 1 - return False - - ver["path"] = p + raw_ext - ver["url"] = quote(ver["path"]) - async with await trio.open_file(ver["path"], "w") as f: - try: - # Note(decentral1se): unicode handling... - safe_text = text.encode("utf-8", "replace").decode() - await f.write(safe_text) - except Exception as exception: - print("PANIC: {}".format(exception)) + pattern = r'__[a-zA-Z0-9]+?__' + magic_words = re.findall(pattern, text) + if magic_words: + print('FOUND MAGIC WORD(s): {} in {}'.format(magic_words, padid)) links = [] if args.css: @@ -489,32 +468,32 @@ async def handle_pad(args, padid, data, info, session): # mb: line causing the error of not writing the correct HTML content to the correct HTML file: # url = info["localapiurl"] + "getHTML?" + urlencode(data) # mb: warning, HACK! Catching the error by writing the API request url manually ... - url = info["localapiurl"] + "getHTML?" + "padID=" + padid + "&" + 'apikey=' + data["apikey"] + '&startRev=0' + url = info["localapiurl"] + "getHTML?" + "padID=" + padid + "&" + 'apikey=' + data["apikey"] # print(url) html = await agetjson(session, url) ver = {"type": "html"} versions.append(ver) # mb: warning, HACK! Added a Try and Except here, as it sometimes bumped into an error, stopping the pull. - try: - ver["code"] = html["_code"] - if html["_code"] == 200: - try: - html = html["data"]["html"] - ver["path"] = p + ".raw.html" - ver["url"] = quote(ver["path"]) - doc = html5lib.parse( - html, treebuilder="etree", namespaceHTMLElements=False - ) - html5tidy( - doc, indent=True, title=padid, scripts=args.script, links=links, - ) - async with await trio.open_file(ver["path"], "w") as f: - output = ET.tostring(doc, method="html", encoding="unicode") - await f.write(output) - except TypeError: - ver["message"] = html["message"] - except Exception as exception: - print("PANIC: {}".format(exception)) + # try: + ver["code"] = html["_code"] + if html["_code"] == 200: + try: + html = html["data"]["html"] + ver["path"] = p + ".raw.html" + ver["url"] = quote(ver["path"]) + doc = html5lib.parse( + html, treebuilder="etree", namespaceHTMLElements=False + ) + html5tidy( + doc, indent=True, title=padid, scripts=args.script, links=links, + ) + async with await trio.open_file(ver["path"], "w") as f: + output = ET.tostring(doc, method="html", encoding="unicode") + await f.write(output) + except TypeError: + ver["message"] = html["message"] + # except Exception as exception: + # print("PANIC: {}".format(exception)) # output meta if args.all or args.meta: