You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
121 lines
3.2 KiB
121 lines
3.2 KiB
"""Palanggana is a small script to get plaintext from etherpump
|
|
based on the magic words used there"""
|
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
import requests as req
|
|
import argparse
|
|
|
|
# Command-line interface: the magic word is mandatory; the other options
# narrow the search to one pad or widen the output to whole pads.
parser = argparse.ArgumentParser(description="Get some text from the pump.")
parser.add_argument(
    "-m",
    "--magic",
    required=True,
    help="Add your magic word here",
)
parser.add_argument(
    "-k",
    "--keep",
    help="keep the magic words",
    action="store_true",
)
parser.add_argument(
    "-p",
    "--pad",
    help="specify a pad name, get only that pad",
)
parser.add_argument(
    "-a",
    "--all",
    help="get all content from pad not just the lines containing the magic",
    action="store_true",
)

# just a few variables that we will need
thepump = "https://etherpump.vvvvvvaria.org/"
args = parser.parse_args()
# The magic word is used in upper-cased, double-underscore form,
# e.g. "-m foo" becomes "__FOO__".
magicword = "__{0}__".format(args.magic.upper())
# Stringified on purpose: when --pad is omitted this is the literal "None",
# which extracttext() checks for to mean "no pad selected".
padname = f"{args.pad}"
keep = args.keep
wholepad = args.all
|
|
|
|
|
|
def extractlinks(thewell):
    """Extract the links to the plain text documents from etherpump.

    Parameters:
        thewell: HTML markup to scan for anchors.

    Returns:
        A list of URLs, one for every ``<a>`` tag whose ``href`` contains
        "raw.txt". Each URL is the module-level ``thepump`` base URL
        followed by the href exactly as it appears in the markup.
    """
    thewater = BeautifulSoup(thewell, "lxml")
    return [
        "{0}{1}".format(thepump, link["href"])
        for link in thewater.find_all("a", href=True)
        if "raw.txt" in link["href"]
    ]
|
|
|
|
|
|
def extracttext(thegoods, timeout=30):
    """Extract the texts from the plain text links.

    Parameters:
        thegoods: iterable of URLs pointing at raw text exports.
        timeout: seconds to wait for each HTTP request before giving up;
            without it a stalled server would block the script forever.

    Returns:
        A list with one entry per downloaded pad, each entry being the list
        of lines of that pad. When the module-level ``padname`` selects a
        pad, only the first link whose name matches (case-insensitively) is
        downloaded; if nothing matches the list is empty.
    """
    # padname is the literal string "None" when --pad was not given.
    if padname and padname != "None":
        wanted = padname.lower()
        for link in thegoods:
            # Reduce the URL to the bare pad name before comparing.
            strippedlink = (
                link.replace(thepump, "")
                .replace(".raw.txt", "")
                .replace("p/", "")
            )
            if strippedlink.lower() == wanted:
                res = req.get(link, timeout=timeout)
                return [res.text.splitlines()]
        return []

    return [
        req.get(link, timeout=timeout).text.splitlines() for link in thegoods
    ]
|
|
|
|
|
|
def extractmagic(plaintexts):
    """Extract the lines that contain the magic word, keep the magic or not.

    Parameters:
        plaintexts: iterable of text lines belonging to a single pad.

    Returns:
        A list of lines. Unless the module-level ``wholepad`` flag is set,
        only lines containing ``magicword`` are included. Unless ``keep`` is
        set, every occurrence of ``magicword`` is removed from the returned
        lines.
    """
    magiclines = []
    for line in plaintexts:
        # Skip ordinary lines unless the whole pad was requested.
        if not wholepad and magicword not in line:
            continue
        if not keep:
            line = line.replace(magicword, "")
        magiclines.append(line)
    return magiclines
|
|
|
|
|
|
def findthewell(thepump, timeout=30):
    """Does the well have the magic you are looking for?

    Parameters:
        thepump: URL of the page to download and search.
        timeout: seconds to wait for the HTTP request before giving up;
            without it a stalled server would block the script forever.

    Returns:
        The first ``<div>`` element whose ``id`` equals the module-level
        ``magicword``, or ``None`` when the page has no such element.
    """
    resp = req.get(thepump, timeout=timeout)
    soup = BeautifulSoup(resp.text, "lxml")
    return soup.find("div", id=magicword)
|
|
|
|
|
|
# Look up the section of the pump page that carries the magic word.
thewell = findthewell(thepump)

if thewell is not None:
    thewell = thewell.prettify()
    thegoods = extractlinks(thewell)
else:
    print("Magic word {0} not found".format(magicword))
    # quit() is a helper the site module provides for interactive sessions;
    # raising SystemExit ends the script the same way (exit status 0)
    # without depending on it.
    raise SystemExit

if thegoods is not None:
    magiclines = []
    plaintexts = extracttext(thegoods)
    for plaintext in plaintexts:
        magiclines.extend(extractmagic(plaintext))

    # all is done, these are the texts you are looking for
    print("\n".join(magiclines))
|
|
|