|
|
@@ -173,6 +173,7 @@ def build_argument_parser(args):

    parser.add_argument(
        "--magic-words",
        default=False,
        action="store_true",
        help="store all magic words used in a page in the meta.json file",
    )

    return parser
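
For context, a minimal standalone sketch of how this flag behaves (plain argparse, not etherpump's actual entry point):

    import argparse

    # action="store_true" makes --magic-words an opt-in boolean: absent means False.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--magic-words",
        default=False,
        action="store_true",
        help="store all magic words used in a page in the meta.json file",
    )

    print(parser.parse_args([]).magic_words)                 # False
    print(parser.parse_args(["--magic-words"]).magic_words)  # True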
|
|
@@ -379,32 +380,10 @@ async def handle_pad(args, padid, data, info, session):
|
|
|
    if args.publish_opt_in and args.publish not in text:
        await try_deleting(
            (
                p + raw_ext,
                p + ".raw.html",
                p + ".diff.html",
                p + ".meta.json",
            )
        )
        print("[ ] {} (deleted, reason: publish opt-out)".format(padid))
        skipped += 1
        return False

    ver["path"] = p + raw_ext
    ver["url"] = quote(ver["path"])
    async with await trio.open_file(ver["path"], "w") as f:
        try:
            # Note(decentral1se): unicode handling...
            safe_text = text.encode("utf-8", "replace").decode()
            await f.write(safe_text)
        except Exception as exception:
            print("PANIC: {}".format(exception))
|
|
|
    ##########################################
    ## INCLUDE __XXX__ MAGIC WORDS
    ##########################################
    pattern = r"__[a-zA-Z0-9]+?__"
    magic_words = re.findall(pattern, text)
    if magic_words:
        print("FOUND MAGIC WORD(s): {} in {}".format(magic_words, padid))

    links = []
    if args.css:
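
As a sanity check of the pattern above: the +? quantifier is non-greedy and the [a-zA-Z0-9] class excludes underscores, so each __word__ token in a pad is matched on its own. The magic words below are only illustrative values:

    >>> import re
    >>> re.findall(r"__[a-zA-Z0-9]+?__", "intro __PUBLISH__ body __NOPUBLISH__ outro")
    ['__PUBLISH__', '__NOPUBLISH__']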
|
|
@@ -489,32 +468,32 @@ async def handle_pad(args, padid, data, info, session):
|
|
|
    # mb: line causing the error of not writing the correct HTML content to the correct HTML file:
    # url = info["localapiurl"] + "getHTML?" + urlencode(data)
    # mb: warning, HACK! Catching the error by writing the API request url manually ...
    # url = info["localapiurl"] + "getHTML?" + "padID=" + padid + "&" + "apikey=" + data["apikey"] + "&startRev=0"
    url = info["localapiurl"] + "getHTML?" + "padID=" + padid + "&" + "apikey=" + data["apikey"]
    # print(url)
    html = await agetjson(session, url)
    ver = {"type": "html"}
    versions.append(ver)
    # mb: warning, HACK! Added a try/except here, as it sometimes bumped into an error, stopping the pull.
    try:
        ver["code"] = html["_code"]
        if html["_code"] == 200:
            try:
                html = html["data"]["html"]
                ver["path"] = p + ".raw.html"
                ver["url"] = quote(ver["path"])
                doc = html5lib.parse(
                    html, treebuilder="etree", namespaceHTMLElements=False
                )
                html5tidy(
                    doc, indent=True, title=padid, scripts=args.script, links=links,
                )
                async with await trio.open_file(ver["path"], "w") as f:
                    output = ET.tostring(doc, method="html", encoding="unicode")
                    await f.write(output)
            except TypeError:
                ver["message"] = html["message"]
    except Exception as exception:
        print("PANIC: {}".format(exception))
|
|
|
    # output meta
    if args.all or args.meta:
|
|
|