description text extracted

2020-11-15 23:05:07 +01:00 · 2020-11-15 23:05:07 +01:00 · 0d67d6e2ad
commit 0d67d6e2ad
parent 328793e6c1
1 changed files with 35 additions and 28 deletions
--- a/daapinterface.py
+++ b/daapinterface.py
@@ -70,16 +70,16 @@ def home():
@app.route("/browsethearchive")
 def browsethearchive():
    sparql.setQuery('''
-        SELECT ?work ?workLabel ?image ?date WHERE {
-  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
-  ?work wdt:P1 wd:Q1.
-  OPTIONAL { ?work p:P30 ?statement.
+    SELECT ?work ?workLabel ?image ?date WHERE {
+    SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
+    ?work wdt:P1 wd:Q1.
+    OPTIONAL { ?work p:P30 ?statement.
              ?statement ps:P30 ?image;
                         pq:P54 wd:Q90.}
-  OPTIONAL { ?work wdt:P13 ?date. }
-  FILTER(?work != wd:Q57)
-}
-ORDER BY (?workLabel)
+    OPTIONAL { ?work wdt:P13 ?date. }
+    FILTER(?work != wd:Q57)
+    }
+    ORDER BY (?workLabel)
    ''')
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
@@ -295,7 +295,7 @@ def artwork():
    ''')
    sparql.setReturnFormat(JSON)
    artworkpublisher = sparql.query().convert()
-    print(artworkpublisher)
+    # print(artworkpublisher)


 #####right middle
@@ -316,6 +316,7 @@ def artwork():
    sparql.setReturnFormat(JSON)
    artworkdescriptiondata = sparql.query().convert()
    # print("hello")
+    # print(artwork_id)
    # print(artworkdescriptiondata)
    # print("bye")

@@ -323,38 +324,44 @@ def artwork():
    # print(type(dictionary))

    # get the description text or say there isn't any
-    # artworkdescriptiontext = None
+    artworkdescriptiontext = None

-    # for x in dictionary['results']['bindings']:
-    #     if "accessURLdescriptionPage" in x:
-    #         print("url for description present")
+    for x in dictionary['results']['bindings']:
+        if "accessURLdescriptionPage" in x:
+            print("url for description present")
+            print(x["accessURLdescriptionPage"]["value"])
+            accessURLdescriptionUrl = x["accessURLdescriptionPage"]["value"]
+            desc_url = re.search(r':Q(.*)', accessURLdescriptionUrl, re.DOTALL)
+            # print(desc_url.group(1))
+            desc_id=desc_url.group(1)
    #         # get the description content from wiki 
-    #         artworkdescriptioncontenturl = "https://daap.bannerrepeater.org/w/index.php?title=Description:"+artwork_id+"&action=render"
+            artworkdescriptioncontenturl = "https://daap.bannerrepeater.org/w/index.php?title=Description:Q"+desc_id+"&action=render"
    #         # Make a GET request to fetch the raw HTML content
-    #         html_content = requests.get(artworkdescriptioncontenturl).text
+            html_content = requests.get(artworkdescriptioncontenturl).text
    #         # Parse the html content
-    #         soup = BeautifulSoup(html_content, "lxml")
+            soup = BeautifulSoup(html_content, "lxml")
    #         # print(soup.prettify()) # print the parsed data of html
-    #         # text=soup.find("div" , {"class" : "mw-parser-output"})
+            text=soup.find("div" , {"class" : "mw-parser-output"})
    #         text=soup.find_all("p")
-    #         artworkdescriptiontext=Markup(text)
-    #     else:
-    #         print("url for description absent")
-    #         text="<p>Information not available</p>"
-    #         artworkdescriptiontext=Markup(text)
+            artworkdescriptiontext=Markup(text)
+        else:
+            print("url for description absent")
+            text="<p>Information not available</p>"
+            artworkdescriptiontext=Markup(text)
    
-    artworkdescriptioncontenturl = "https://daap.bannerrepeater.org/w/index.php?title=Description:Q427&action=render"
+    #description Q427 for testing purposes
+    # artworkdescriptioncontenturl = "https://daap.bannerrepeater.org/w/index.php?title=Description:Q427&action=render"
    # Make a GET request to fetch the raw HTML content
-    html_content = requests.get(artworkdescriptioncontenturl).text
+    # html_content = requests.get(artworkdescriptioncontenturl).text
    # Parse the html content
-    soup = BeautifulSoup(html_content, "lxml")
+    # soup = BeautifulSoup(html_content, "lxml")
    # print(soup.prettify()) # print the parsed data of html
-    text=soup.find("div" , {"class" : "mw-parser-output"})
+    # text=soup.find("div" , {"class" : "mw-parser-output"})
    # text=soup.find_all("p")
-    artworkdescriptiontext=Markup(text)
+    # artworkdescriptiontext=Markup(text)


-    print(artworkdescriptiontext)
+    # print(artworkdescriptiontext)

 ############ right bottom LATER
 # exhibitions + id to be changed