From 8227d75d28144a4daaa4ceedd2c4c4ce9ea7d5e2 Mon Sep 17 00:00:00 2001 From: Luke Murphy Date: Sat, 12 Dec 2020 16:29:38 +0100 Subject: [PATCH] Try to solve padID bug and refactor WIP solutions By passing data.copy() I hope to stop the overwriting. I have removed the try/except work-arounds here because they no longer seem to be necessary. --- etherpump/commands/pull.py | 128 ++++++++++++++----------------------- 1 file changed, 47 insertions(+), 81 deletions(-) diff --git a/etherpump/commands/pull.py b/etherpump/commands/pull.py index 141af1c..3948089 100644 --- a/etherpump/commands/pull.py +++ b/etherpump/commands/pull.py @@ -295,12 +295,6 @@ async def handle_pad(args, padid, data, info, session): reason = "PANIC, couldn't download the pad contents" break - # Note(decentral1se): cannot track this bug down but basically the `data` - # and `padid` are getting out of sync and it is ending up that the same pad - # over and over again is downloaded. This resets things in a way that it - # works. This is a hack and one day TM I will find out how to fix it proper - data["padID"] = padid - if skip: print("[ ] {} (skipped, reason: {})".format(padid, reason)) skipped += 1 @@ -466,93 +460,63 @@ async def handle_pad(args, padid, data, info, session): ver["message"] = html["message"] # Process text, html, dhtml, magicwords and all options + downloaded_html = False if args.all or args.html: - # mb: line causing the error of not writing the correct HTML content to the correct HTML file: - # url = info["localapiurl"] + "getHTML?" + urlencode(data) - # mb: warning, HACK! Catching the error by writing the API request url manually ... - url = ( - info["localapiurl"] - + "getHTML?" - + "padID=" - + padid - + "&" - + "apikey=" - + data["apikey"] - ) - # print(url) + url = info["localapiurl"] + "getHTML?" + urlencode(data) html = await agetjson(session, url) ver = {"type": "html"} versions.append(ver) - # mb: warning, HACK! 
Added a Try and Except here, as it sometimes bumped into an error, stopping the pull. - # try: ver["code"] = html["_code"] + downloaded_html = True + if html["_code"] == 200: - try: - html = html["data"]["html"] - ver["path"] = p + ".raw.html" - ver["url"] = quote(ver["path"]) - doc = html5lib.parse( - html, treebuilder="etree", namespaceHTMLElements=False - ) - html5tidy( - doc, - indent=True, - title=padid, - scripts=args.script, - links=links, - ) - async with await trio.open_file(ver["path"], "w") as f: - output = ET.tostring(doc, method="html", encoding="unicode") - await f.write(output) - except TypeError: - ver["message"] = html["message"] - # except Exception as exception: - # print("PANIC: {}".format(exception)) + html_body = html["data"]["html"] + ver["path"] = p + ".raw.html" + ver["url"] = quote(ver["path"]) + doc = html5lib.parse( + html_body, treebuilder="etree", namespaceHTMLElements=False + ) + html5tidy( + doc, + indent=True, + title=padid, + scripts=args.script, + links=links, + ) + async with await trio.open_file(ver["path"], "w") as f: + output = ET.tostring(doc, method="html", encoding="unicode") + await f.write(output) if args.all or args.magicwords: - url = ( - info["localapiurl"] - + "getHTML?" - + "padID=" - + padid - + "&" - + "apikey=" - + data["apikey"] - ) - # print(url) - html = await agetjson(session, url) + if not downloaded_html: + html = await agetjson(session, url) ver = {"type": "magicwords"} versions.append(ver) - # mb: warning, HACK! Added a Try and Except here, as it sometimes bumped into an error, stopping the pull. 
- # try: ver["code"] = html["_code"] + if html["_code"] == 200: - try: - html = html["data"]["html"] - ver["path"] = p + ".magicwords.html" - ver["url"] = quote(ver["path"]) - for magic_word in magic_words: - replace_word = ( - "" + magic_word + "" - ) - if magic_word in html: - html = html.replace(magic_word, replace_word) - doc = html5lib.parse( - html, treebuilder="etree", namespaceHTMLElements=False - ) - # INSERT MAGIC WORDS HIGHLIGHTING STUFF HERE!!! - html5tidy( - doc, - indent=True, - title=padid, - scripts=args.script, - links=links, + html_body = html["data"]["html"] + ver["path"] = p + ".magicwords.html" + ver["url"] = quote(ver["path"]) + for magic_word in magic_words: + replace_word = ( + "" + magic_word + "" ) - async with await trio.open_file(ver["path"], "w") as f: - output = ET.tostring(doc, method="html", encoding="unicode") - await f.write(output) - except TypeError: - ver["message"] = html["message"] + if magic_word in html_body: + html_body = html_body.replace(magic_word, replace_word) + doc = html5lib.parse( + html_body, treebuilder="etree", namespaceHTMLElements=False + ) + html5tidy( + doc, + indent=True, + title=padid, + scripts=args.script, + links=links, + ) + async with await trio.open_file(ver["path"], "w") as f: + output = ET.tostring(doc, method="html", encoding="unicode") + await f.write(output) # output meta if args.all or args.meta: @@ -586,7 +550,9 @@ async def handle_pads(args): start = time.time() async with trio.open_nursery() as nursery: for padid in padids: - nursery.start_soon(handle_pad, args, padid, data, info, session) + nursery.start_soon( + handle_pad, args, padid, data.copy(), info, session + ) end = time.time() timeit = round(end - start, 2)