# Provenance: palanggana.py was introduced in commit
# aa26c949714d98b1e2bcb6699b924aff0464a6ea (author: crunk,
# Wed Dec 30 15:42:12 2020 +0100) with the message "first commit of the
# palanggana, get water(plaintext) from the (ether)pump". The same commit
# also added an empty README.md.
"""Palanggana is a small script to get plaintext from etherpump
based on the magic words used there.

The script downloads the etherpump index page, looks for the ``<div>`` whose
``id`` is the magic word (``__WORD__``), follows every link in that div that
points at a ``raw.txt`` document, and prints the lines of those documents
that contain the magic word.
"""

import argparse
import sys
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import requests as req

parser = argparse.ArgumentParser(description="Get some text from the pump.")
parser.add_argument(
    "-m",
    "--magic",
    required=True,
    help="Add your magic word here",
)
parser.add_argument(
    "-k",
    "--keep",
    help="keep the magic words",
    action="store_true",
)

# just a few variables that we will need
thepump = "https://etherpump.vvvvvvaria.org/"
# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 30

# Filled in by main() from the command line. findthewell() and extractmagic()
# read these module-level names, so they must be set before those are called.
magicword = None
keep = False


def extractlinks(thewell):
    """Extract the links to the plain text documents from etherpump.

    ``thewell`` is HTML markup (a string). Every ``<a href=...>`` whose href
    contains ``raw.txt`` is resolved against ``thepump`` and returned in
    document order as a list of URLs.
    """
    thewater = BeautifulSoup(thewell, "lxml")
    # urljoin handles relative, root-relative ("/p/x.raw.txt") and absolute
    # hrefs; plain concatenation only worked for the first kind.
    return [
        urljoin(thepump, link["href"])
        for link in thewater.find_all("a", href=True)
        if "raw.txt" in link["href"]
    ]


def extracttext(thegoods):
    """Download each plain text link and return its content split in lines.

    Returns a list with one list of lines per document that could be
    fetched. Documents that fail to download (connection error, timeout or
    HTTP error status) are reported on stderr and skipped, so one broken pad
    does not put error-page text in the results or abort the whole run.
    """
    plaintexts = []
    for link in thegoods:
        try:
            res = req.get(link, timeout=REQUEST_TIMEOUT)
            res.raise_for_status()
        except req.RequestException as err:
            print("Skipping {0}: {1}".format(link, err), file=sys.stderr)
            continue
        plaintexts.append(res.text.splitlines())
    return plaintexts


def extractmagic(plaintexts):
    """Extract the lines that contain the magic word, keep the magic or not.

    ``plaintexts`` is an iterable of text lines. Lines containing the
    module-level ``magicword`` are returned; unless ``keep`` is set, every
    occurrence of the magic word is removed from the returned lines.
    """
    if keep:
        return [line for line in plaintexts if magicword in line]
    return [
        line.replace(magicword, "") for line in plaintexts if magicword in line
    ]


def findthewell(thepump):
    """Does the well have the magic you are looking for?

    Fetches the page at ``thepump`` and returns the ``<div>`` element whose
    id equals the module-level ``magicword``, or ``None`` when there is no
    such div. Raises ``requests.RequestException`` when the page cannot be
    fetched or the server answers with an HTTP error status.
    """
    resp = req.get(thepump, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "lxml")
    return soup.find("div", id=magicword)


def main(argv=None):
    """Run the command line tool.

    ``argv`` is the list of command line arguments; ``None`` means
    ``sys.argv[1:]``. Exits with a non-zero status and a message on stderr
    when the pump cannot be reached or the magic word is not on the page.
    """
    global magicword, keep

    args = parser.parse_args(argv)
    magicword = "__{0}__".format(args.magic.upper())
    keep = args.keep

    try:
        thewell = findthewell(thepump)
    except req.RequestException as err:
        sys.exit("Could not reach {0}: {1}".format(thepump, err))

    if thewell is None:
        sys.exit("Magic word {0} not found".format(magicword))

    thegoods = extractlinks(thewell.prettify())

    magiclines = []
    for plaintext in extracttext(thegoods):
        magiclines.extend(extractmagic(plaintext))

    # all is done, these are the texts you are looking for
    print("\n".join(magiclines))


if __name__ == "__main__":
    main()