diff --git a/etherpump/commands/pull.py b/etherpump/commands/pull.py
index 141af1c..3948089 100644
--- a/etherpump/commands/pull.py
+++ b/etherpump/commands/pull.py
@@ -295,12 +295,6 @@ async def handle_pad(args, padid, data, info, session):
reason = "PANIC, couldn't download the pad contents"
break
- # Note(decentral1se): cannot track this bug down but basically the `data`
- # and `padid` are getting out of sync and it is ending up that the same pad
- # over and over again is downloaded. This resets things in a way that it
- # works. This is a hack and one day TM I will find out how to fix it proper
- data["padID"] = padid
-
if skip:
print("[ ] {} (skipped, reason: {})".format(padid, reason))
skipped += 1
@@ -466,93 +460,63 @@ async def handle_pad(args, padid, data, info, session):
ver["message"] = html["message"]
# Process text, html, dhtml, magicwords and all options
+ downloaded_html = False
if args.all or args.html:
- # mb: line causing the error of not writing the correct HTML content to the correct HTML file:
- # url = info["localapiurl"] + "getHTML?" + urlencode(data)
- # mb: warning, HACK! Catching the error by writing the API request url manually ...
- url = (
- info["localapiurl"]
- + "getHTML?"
- + "padID="
- + padid
- + "&"
- + "apikey="
- + data["apikey"]
- )
- # print(url)
+ url = info["localapiurl"] + "getHTML?" + urlencode(data)
html = await agetjson(session, url)
ver = {"type": "html"}
versions.append(ver)
- # mb: warning, HACK! Added a Try and Except here, as it sometimes bumped into an error, stopping the pull.
- # try:
ver["code"] = html["_code"]
+ downloaded_html = True
+
if html["_code"] == 200:
- try:
- html = html["data"]["html"]
- ver["path"] = p + ".raw.html"
- ver["url"] = quote(ver["path"])
- doc = html5lib.parse(
- html, treebuilder="etree", namespaceHTMLElements=False
- )
- html5tidy(
- doc,
- indent=True,
- title=padid,
- scripts=args.script,
- links=links,
- )
- async with await trio.open_file(ver["path"], "w") as f:
- output = ET.tostring(doc, method="html", encoding="unicode")
- await f.write(output)
- except TypeError:
- ver["message"] = html["message"]
- # except Exception as exception:
- # print("PANIC: {}".format(exception))
+ html_body = html["data"]["html"]
+ ver["path"] = p + ".raw.html"
+ ver["url"] = quote(ver["path"])
+ doc = html5lib.parse(
+ html_body, treebuilder="etree", namespaceHTMLElements=False
+ )
+ html5tidy(
+ doc,
+ indent=True,
+ title=padid,
+ scripts=args.script,
+ links=links,
+ )
+ async with await trio.open_file(ver["path"], "w") as f:
+ output = ET.tostring(doc, method="html", encoding="unicode")
+ await f.write(output)
if args.all or args.magicwords:
- url = (
- info["localapiurl"]
- + "getHTML?"
- + "padID="
- + padid
- + "&"
- + "apikey="
- + data["apikey"]
- )
- # print(url)
- html = await agetjson(session, url)
+ if not downloaded_html:  # html export was skipped: build the getHTML url here, `url` is not set by this branch
+ html = await agetjson(session, info["localapiurl"] + "getHTML?" + urlencode(data))
ver = {"type": "magicwords"}
versions.append(ver)
- # mb: warning, HACK! Added a Try and Except here, as it sometimes bumped into an error, stopping the pull.
- # try:
ver["code"] = html["_code"]
+
if html["_code"] == 200:
- try:
- html = html["data"]["html"]
- ver["path"] = p + ".magicwords.html"
- ver["url"] = quote(ver["path"])
- for magic_word in magic_words:
- replace_word = (
- "" + magic_word + ""
- )
- if magic_word in html:
- html = html.replace(magic_word, replace_word)
- doc = html5lib.parse(
- html, treebuilder="etree", namespaceHTMLElements=False
- )
- # INSERT MAGIC WORDS HIGHLIGHTING STUFF HERE!!!
- html5tidy(
- doc,
- indent=True,
- title=padid,
- scripts=args.script,
- links=links,
+ html_body = html["data"]["html"]
+ ver["path"] = p + ".magicwords.html"
+ ver["url"] = quote(ver["path"])
+ for magic_word in magic_words:
+ replace_word = (
+ "" + magic_word + ""
)
- async with await trio.open_file(ver["path"], "w") as f:
- output = ET.tostring(doc, method="html", encoding="unicode")
- await f.write(output)
- except TypeError:
- ver["message"] = html["message"]
+ if magic_word in html_body:
+ html_body = html_body.replace(magic_word, replace_word)
+ doc = html5lib.parse(
+ html_body, treebuilder="etree", namespaceHTMLElements=False
+ )
+ html5tidy(
+ doc,
+ indent=True,
+ title=padid,
+ scripts=args.script,
+ links=links,
+ )
+ async with await trio.open_file(ver["path"], "w") as f:
+ output = ET.tostring(doc, method="html", encoding="unicode")
+ await f.write(output)
# output meta
if args.all or args.meta:
@@ -586,7 +550,9 @@ async def handle_pads(args):
start = time.time()
async with trio.open_nursery() as nursery:
for padid in padids:
- nursery.start_soon(handle_pad, args, padid, data, info, session)
+ nursery.start_soon(
+ handle_pad, args, padid, data.copy(), info, session
+ )
end = time.time()
timeit = round(end - start, 2)