description text extracted

This commit is contained in:
jules 2020-11-15 23:05:07 +01:00
parent 328793e6c1
commit 0d67d6e2ad

View File

@ -70,16 +70,16 @@ def home():
@app.route("/browsethearchive") @app.route("/browsethearchive")
def browsethearchive(): def browsethearchive():
sparql.setQuery(''' sparql.setQuery('''
SELECT ?work ?workLabel ?image ?date WHERE { SELECT ?work ?workLabel ?image ?date WHERE {
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". } SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
?work wdt:P1 wd:Q1. ?work wdt:P1 wd:Q1.
OPTIONAL { ?work p:P30 ?statement. OPTIONAL { ?work p:P30 ?statement.
?statement ps:P30 ?image; ?statement ps:P30 ?image;
pq:P54 wd:Q90.} pq:P54 wd:Q90.}
OPTIONAL { ?work wdt:P13 ?date. } OPTIONAL { ?work wdt:P13 ?date. }
FILTER(?work != wd:Q57) FILTER(?work != wd:Q57)
} }
ORDER BY (?workLabel) ORDER BY (?workLabel)
''') ''')
sparql.setReturnFormat(JSON) sparql.setReturnFormat(JSON)
results = sparql.query().convert() results = sparql.query().convert()
@ -295,7 +295,7 @@ def artwork():
''') ''')
sparql.setReturnFormat(JSON) sparql.setReturnFormat(JSON)
artworkpublisher = sparql.query().convert() artworkpublisher = sparql.query().convert()
print(artworkpublisher) # print(artworkpublisher)
#####right middle #####right middle
@ -316,6 +316,7 @@ def artwork():
sparql.setReturnFormat(JSON) sparql.setReturnFormat(JSON)
artworkdescriptiondata = sparql.query().convert() artworkdescriptiondata = sparql.query().convert()
# print("hello") # print("hello")
# print(artwork_id)
# print(artworkdescriptiondata) # print(artworkdescriptiondata)
# print("bye") # print("bye")
@ -323,38 +324,44 @@ def artwork():
# print(type(dictionary)) # print(type(dictionary))
# get the description text or say there isn't any # get the description text or say there isn't any
# artworkdescriptiontext = None artworkdescriptiontext = None
# for x in dictionary['results']['bindings']: for x in dictionary['results']['bindings']:
# if "accessURLdescriptionPage" in x: if "accessURLdescriptionPage" in x:
# print("url for description present") print("url for description present")
print(x["accessURLdescriptionPage"]["value"])
accessURLdescriptionUrl = x["accessURLdescriptionPage"]["value"]
desc_url = re.search(r':Q(.*)', accessURLdescriptionUrl, re.DOTALL)
# print(desc_url.group(1))
desc_id=desc_url.group(1)
# # get the description content from wiki # # get the description content from wiki
# artworkdescriptioncontenturl = "https://daap.bannerrepeater.org/w/index.php?title=Description:"+artwork_id+"&action=render" artworkdescriptioncontenturl = "https://daap.bannerrepeater.org/w/index.php?title=Description:Q"+desc_id+"&action=render"
# # Make a GET request to fetch the raw HTML content # # Make a GET request to fetch the raw HTML content
# html_content = requests.get(artworkdescriptioncontenturl).text html_content = requests.get(artworkdescriptioncontenturl).text
# # Parse the html content # # Parse the html content
# soup = BeautifulSoup(html_content, "lxml") soup = BeautifulSoup(html_content, "lxml")
# # print(soup.prettify()) # print the parsed data of html # # print(soup.prettify()) # print the parsed data of html
# # text=soup.find("div" , {"class" : "mw-parser-output"}) text=soup.find("div" , {"class" : "mw-parser-output"})
# text=soup.find_all("p") # text=soup.find_all("p")
# artworkdescriptiontext=Markup(text) artworkdescriptiontext=Markup(text)
# else: else:
# print("url for description absent") print("url for description absent")
# text="<p>Information not available</p>" text="<p>Information not available</p>"
# artworkdescriptiontext=Markup(text) artworkdescriptiontext=Markup(text)
artworkdescriptioncontenturl = "https://daap.bannerrepeater.org/w/index.php?title=Description:Q427&action=render" #description Q427 for testing purposes
# artworkdescriptioncontenturl = "https://daap.bannerrepeater.org/w/index.php?title=Description:Q427&action=render"
# Make a GET request to fetch the raw HTML content # Make a GET request to fetch the raw HTML content
html_content = requests.get(artworkdescriptioncontenturl).text # html_content = requests.get(artworkdescriptioncontenturl).text
# Parse the html content # Parse the html content
soup = BeautifulSoup(html_content, "lxml") # soup = BeautifulSoup(html_content, "lxml")
# print(soup.prettify()) # print the parsed data of html # print(soup.prettify()) # print the parsed data of html
text=soup.find("div" , {"class" : "mw-parser-output"}) # text=soup.find("div" , {"class" : "mw-parser-output"})
# text=soup.find_all("p") # text=soup.find_all("p")
artworkdescriptiontext=Markup(text) # artworkdescriptiontext=Markup(text)
print(artworkdescriptiontext) # print(artworkdescriptiontext)
############ right bottom LATER ############ right bottom LATER
# exhibitions + id to be changed # exhibitions + id to be changed