work in progress around the magic words

This commit is contained in:
manetta 2020-12-04 14:22:23 +01:00
parent 8fd2abf8f4
commit 923cc11beb

View File

@@ -173,6 +173,7 @@ def build_argument_parser(args):
parser.add_argument( parser.add_argument(
"--magic-words", "--magic-words",
default=False, default=False,
action="store_true",
help="store all magic words used in a page in the meta.json file", help="store all magic words used in a page in the meta.json file",
) )
return parser return parser
@@ -379,32 +380,10 @@ async def handle_pad(args, padid, data, info, session):
########################################## ##########################################
## INCLUDE __XXX__ MAGIC WORDS ## INCLUDE __XXX__ MAGIC WORDS
########################################## ##########################################
pattern = r'[__\w+?__]' pattern = r'__[a-zA-Z0-9]+?__'
magic_words = re.match(pattern, string) magic_words = re.findall(pattern, text)
magic_words = magic_words.groups() if magic_words:
print(magic_words) print('FOUND MAGIC WORD(s): {} in {}'.format(magic_words, padid))
if args.publish_opt_in and args.publish not in text:
await try_deleting(
(
p + raw_ext,
p + ".raw.html",
p + ".diff.html",
p + ".meta.json",
)
)
print("[ ] {} (deleted, reason: publish opt-out)".format(padid))
skipped += 1
return False
ver["path"] = p + raw_ext
ver["url"] = quote(ver["path"])
async with await trio.open_file(ver["path"], "w") as f:
try:
# Note(decentral1se): unicode handling...
safe_text = text.encode("utf-8", "replace").decode()
await f.write(safe_text)
except Exception as exception:
print("PANIC: {}".format(exception))
links = [] links = []
if args.css: if args.css:
@@ -489,32 +468,32 @@ async def handle_pad(args, padid, data, info, session):
# mb: line causing the error of not writing the correct HTML content to the correct HTML file: # mb: line causing the error of not writing the correct HTML content to the correct HTML file:
# url = info["localapiurl"] + "getHTML?" + urlencode(data) # url = info["localapiurl"] + "getHTML?" + urlencode(data)
# mb: warning, HACK! Catching the error by writing the API request url manually ... # mb: warning, HACK! Catching the error by writing the API request url manually ...
url = info["localapiurl"] + "getHTML?" + "padID=" + padid + "&" + 'apikey=' + data["apikey"] + '&startRev=0' url = info["localapiurl"] + "getHTML?" + "padID=" + padid + "&" + 'apikey=' + data["apikey"]
# print(url) # print(url)
html = await agetjson(session, url) html = await agetjson(session, url)
ver = {"type": "html"} ver = {"type": "html"}
versions.append(ver) versions.append(ver)
# mb: warning, HACK! Added a Try and Except here, as it sometimes bumped into an error, stopping the pull. # mb: warning, HACK! Added a Try and Except here, as it sometimes bumped into an error, stopping the pull.
try: # try:
ver["code"] = html["_code"] ver["code"] = html["_code"]
if html["_code"] == 200: if html["_code"] == 200:
try: try:
html = html["data"]["html"] html = html["data"]["html"]
ver["path"] = p + ".raw.html" ver["path"] = p + ".raw.html"
ver["url"] = quote(ver["path"]) ver["url"] = quote(ver["path"])
doc = html5lib.parse( doc = html5lib.parse(
html, treebuilder="etree", namespaceHTMLElements=False html, treebuilder="etree", namespaceHTMLElements=False
) )
html5tidy( html5tidy(
doc, indent=True, title=padid, scripts=args.script, links=links, doc, indent=True, title=padid, scripts=args.script, links=links,
) )
async with await trio.open_file(ver["path"], "w") as f: async with await trio.open_file(ver["path"], "w") as f:
output = ET.tostring(doc, method="html", encoding="unicode") output = ET.tostring(doc, method="html", encoding="unicode")
await f.write(output) await f.write(output)
except TypeError: except TypeError:
ver["message"] = html["message"] ver["message"] = html["message"]
except Exception as exception: # except Exception as exception:
print("PANIC: {}".format(exception)) # print("PANIC: {}".format(exception))
# output meta # output meta
if args.all or args.meta: if args.all or args.meta: