work in progress around the magic words

This commit is contained in:
manetta 2020-12-04 14:22:23 +01:00
parent 8fd2abf8f4
commit 923cc11beb

View File

@@ -173,6 +173,7 @@ def build_argument_parser(args):
parser.add_argument(
"--magic-words",
default=False,
action="store_true",
help="store all magic words used in a page in the meta.json file",
)
return parser
@@ -379,32 +380,10 @@ async def handle_pad(args, padid, data, info, session):
##########################################
## INCLUDE __XXX__ MAGIC WORDS
##########################################
pattern = r'[__\w+?__]'
magic_words = re.match(pattern, string)
magic_words = magic_words.groups()
print(magic_words)
if args.publish_opt_in and args.publish not in text:
await try_deleting(
(
p + raw_ext,
p + ".raw.html",
p + ".diff.html",
p + ".meta.json",
)
)
print("[ ] {} (deleted, reason: publish opt-out)".format(padid))
skipped += 1
return False
ver["path"] = p + raw_ext
ver["url"] = quote(ver["path"])
async with await trio.open_file(ver["path"], "w") as f:
try:
# Note(decentral1se): unicode handling...
safe_text = text.encode("utf-8", "replace").decode()
await f.write(safe_text)
except Exception as exception:
print("PANIC: {}".format(exception))
pattern = r'__[a-zA-Z0-9]+?__'
magic_words = re.findall(pattern, text)
if magic_words:
print('FOUND MAGIC WORD(s): {} in {}'.format(magic_words, padid))
links = []
if args.css:
@@ -489,32 +468,32 @@ async def handle_pad(args, padid, data, info, session):
# mb: line causing the error of not writing the correct HTML content to the correct HTML file:
# url = info["localapiurl"] + "getHTML?" + urlencode(data)
# mb: warning, HACK! Catching the error by writing the API request url manually ...
url = info["localapiurl"] + "getHTML?" + "padID=" + padid + "&" + 'apikey=' + data["apikey"] + '&startRev=0'
url = info["localapiurl"] + "getHTML?" + "padID=" + padid + "&" + 'apikey=' + data["apikey"]
# print(url)
html = await agetjson(session, url)
ver = {"type": "html"}
versions.append(ver)
# mb: warning, HACK! Added a Try and Except here, as it sometimes bumped into an error, stopping the pull.
try:
ver["code"] = html["_code"]
if html["_code"] == 200:
try:
html = html["data"]["html"]
ver["path"] = p + ".raw.html"
ver["url"] = quote(ver["path"])
doc = html5lib.parse(
html, treebuilder="etree", namespaceHTMLElements=False
)
html5tidy(
doc, indent=True, title=padid, scripts=args.script, links=links,
)
async with await trio.open_file(ver["path"], "w") as f:
output = ET.tostring(doc, method="html", encoding="unicode")
await f.write(output)
except TypeError:
ver["message"] = html["message"]
except Exception as exception:
print("PANIC: {}".format(exception))
# try:
ver["code"] = html["_code"]
if html["_code"] == 200:
try:
html = html["data"]["html"]
ver["path"] = p + ".raw.html"
ver["url"] = quote(ver["path"])
doc = html5lib.parse(
html, treebuilder="etree", namespaceHTMLElements=False
)
html5tidy(
doc, indent=True, title=padid, scripts=args.script, links=links,
)
async with await trio.open_file(ver["path"], "w") as f:
output = ET.tostring(doc, method="html", encoding="unicode")
await f.write(output)
except TypeError:
ver["message"] = html["message"]
# except Exception as exception:
# print("PANIC: {}".format(exception))
# output meta
if args.all or args.meta: