A web scraper that gets the magic from the etherpump. (Palanggana means washbasin in Filipino.)
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

121 lines
3.2 KiB

"""Palanggana is a small script to get plaintext from etherpump,
based on the magic words used there."""
from bs4 import BeautifulSoup
import requests as req
import argparse
# Command-line interface: only the magic word is mandatory.
parser = argparse.ArgumentParser(description="Get some text from the pump.")
parser.add_argument(
    "-m", "--magic", help="Add your magic word here", required=True
)
parser.add_argument(
    "-k", "--keep", action="store_true", help="keep the magic words"
)
parser.add_argument(
    "-p", "--pad", help="specify a pad name, get only that pad"
)
parser.add_argument(
    "-a",
    "--all",
    action="store_true",
    help="get all content from pad not just the lines containing the magic",
)
args = parser.parse_args()

# Module-level settings derived from the arguments.
thepump = "https://etherpump.vvvvvvaria.org/"
# The word is upper-cased and wrapped in double underscores: "foo" -> "__FOO__".
magicword = "__" + str(args.magic).upper() + "__"
# str() turns an omitted --pad (None) into the literal string "None".
padname = str(args.pad)
keep, wholepad = args.keep, args.all
def extractlinks(thewell):
    """Build the list of raw-text URLs referenced in *thewell*.

    *thewell* is HTML markup, parsed here with BeautifulSoup's "lxml" parser.
    Every ``<a>`` tag that has an ``href`` containing "raw.txt" contributes
    one entry: the module-level base URL ``thepump`` followed by that href.
    The entries are returned as a list, in document order.
    """
    anchors = BeautifulSoup(thewell, "lxml").find_all("a", href=True)
    return [
        f"{thepump}{anchor['href']}"
        for anchor in anchors
        if "raw.txt" in anchor["href"]
    ]
def extracttext(thegoods):
    """Download the plain-text documents behind *thegoods*, split into lines.

    Args:
        thegoods: iterable of URLs (strings) to download.

    Returns:
        A list with one entry per downloaded document; each entry is the
        list of lines of that document. When the module-level ``padname``
        names a pad, only the first link whose pad name matches it
        (case-insensitively) is downloaded, and the list is empty when no
        link matches.

    Raises:
        requests.exceptions.RequestException: if a download fails or the
            server does not answer within the timeout.
    """
    # Without a timeout, requests waits indefinitely on a stalled server.
    timeout = 30

    # "None" is what an omitted --pad looks like after string formatting;
    # both it and the empty string mean "no pad filter".
    wanted = padname.lower() if padname and padname != "None" else None

    if wanted is None:
        return [
            req.get(link, timeout=timeout).text.splitlines()
            for link in thegoods
        ]

    for link in thegoods:
        # Reduce the URL to the bare pad name before comparing.
        name = (
            link.replace(thepump, "")
            .replace(".raw.txt", "")
            .replace("p/", "")
        )
        if name.lower() == wanted:
            # Only the first matching pad is fetched.
            return [req.get(link, timeout=timeout).text.splitlines()]
    return []
def extractmagic(plaintexts, *, magic=None, keep_magic=None, whole_pad=None):
    """Select the lines of one document that carry the magic word.

    Args:
        plaintexts: iterable of lines (strings) of a single document.
        magic: marker to look for; defaults to the module-level ``magicword``.
        keep_magic: when true, leave the marker in the returned lines;
            defaults to the module-level ``keep``.
        whole_pad: when true, return every line instead of only the lines
            containing the marker; defaults to the module-level ``wholepad``.

    Returns:
        The selected lines as a list, in their original order. Unless
        *keep_magic* is true, every occurrence of the marker is removed
        from each returned line.
    """
    # Fall back to the command-line settings only for values not supplied,
    # so the function can be used without the module globals.
    if magic is None:
        magic = magicword
    if keep_magic is None:
        keep_magic = keep
    if whole_pad is None:
        whole_pad = wholepad

    selected = [line for line in plaintexts if whole_pad or magic in line]
    if keep_magic:
        return selected
    return [line.replace(magic, "") for line in selected]
def findthewell(thepump):
    """Download the page at *thepump* and look up the magic word's section.

    Args:
        thepump: URL of the page to download.

    Returns:
        The result of BeautifulSoup's ``find`` for a ``<div>`` whose ``id``
        equals the module-level ``magicword``; ``None`` when the page has
        no such element.

    Raises:
        requests.exceptions.RequestException: if the download fails or the
            server does not answer within the timeout.
    """
    # Without a timeout, requests waits indefinitely on a stalled server.
    resp = req.get(thepump, timeout=30)
    soup = BeautifulSoup(resp.text, "lxml")
    return soup.find("div", id=magicword)
# Look up the part of the page that belongs to the magic word.
thewell = findthewell(thepump)
if thewell is None:
    print("Magic word {0} not found".format(magicword))
    # quit() is a helper installed by the site module for interactive
    # sessions and exits with status 0; raise SystemExit directly so the
    # script always terminates here and reports failure to the caller.
    raise SystemExit(1)

# Collect the raw-text links from that section, then download them.
thewell = thewell.prettify()
thegoods = extractlinks(thewell)
plaintexts = extracttext(thegoods)

magiclines = []
for plaintext in plaintexts:
    magiclines.extend(extractmagic(plaintext))

# all is done, these are the texts you are looking for
print("\n".join(magiclines))