# Provenance: payload of the git patch "added openreffine config file"
# (commit 3a525f8b539c4f2cde8843fe41b2879b17d30f11, author jules,
# Thu, 24 Sep 2020 12:42:32 +0200), which creates config-openrefine.py.
"""
This file defines a few constants which configure
which Wikibase instance and which property/item ids
should be used
"""

import re

import redis

# Endpoint of the MediaWiki API of the Wikibase instance
mediawiki_api_endpoint = 'https://daap.bannerrepeater.org/w/api.php'

# SPARQL endpoint
wikibase_sparql_endpoint = 'https://query.daap.bannerrepeater.org/'

# Wikibase namespace ID, used to search for items.
# For Wikidata this is 0, but by default Wikibase uses 120, which is the
# default Wikibase 'Item:' namespace. This instance uses 120.
wikibase_namespace_id = 120

# Namespace prefix of Wikibase items (including colon, e.g. 'Item:')
# NOTE(review): the namespace id above is 120 (the 'Item:' namespace) and
# qid_url_pattern below uses 'Item:', yet this prefix is empty. Confirm
# whether it should be 'Item:'.
wikibase_namespace_prefix = ''

# User agent to connect to the Wikibase APIs
user_agent = 'OpenRefine-Daap reconciliation interface'

# Regexes and group ids used to extract Qids and Pids from URLs.
# Each pattern accepts either a bare id or an entity/page URL (optionally
# wrapped in angle brackets); the id itself is captured by group 3.
# NOTE(review): the patterns previously stored here were truncated to '(?',
# which does not compile. They have been reconstructed from identifier_space,
# schema_space and qid_url_pattern -- confirm host and paths for this instance.
q_re = re.compile(
    r'(<?https?://daap\.bannerrepeater\.org/(entity/|wiki/Item:))?(Q[0-9]+)>?'
)
q_re_group_id = 3
p_re = re.compile(
    r'(<?https?://daap\.bannerrepeater\.org/(entity/|wiki/Property:))?(P[0-9]+)>?'
)
p_re_group_id = 3

# Identifier space and schema space exposed to OpenRefine.
# This should match the IRI prefixes used in RDF serialization.
# Note that you should be careful about using http or https there,
# because any variation will break comparisons at various places.
identifier_space = 'http://daap.bannerrepeater.org/entity/'
# (An earlier, disabled value for schema_space was
# 'http://www.wikidata.org/prop/direct/'.)
schema_space = 'http://daap.bannerrepeater.org/wiki/Property:'

# Pattern used to form the URL of a Qid.
# This is only used for viewing so it is fine to use any protocol
# (therefore, preferably HTTPS if supported)
qid_url_pattern = 'https://daap.bannerrepeater.org/wiki/Item:{{id}}'

# By default, filter out any items which are instance
# of a subclass of this class.
# For Wikidata, this is "Wikimedia internal stuff" (Q17442446), which
# filters out the disambiguation pages, categories, ...
# Set to None to disable this filter (as done here).
avoid_items_of_class = None

# Service name exposed at various places,
# mainly in the list of reconciliation services of users
service_name = 'DEV Daap'

# URL (without the trailing slash) where this server runs
this_host = 'http://116.203.73.138:8000'

# The default limit on the number of results returned by us
default_num_results = 25

# The maximum number of search results to retrieve from the search API
wd_api_max_search_results = 50  # need a bot account to get more

# The matching score above which we should automatically match an item
validation_threshold = 95

# Redis client used for caching at various places
redis_client = redis.Redis(host='redis', port=6379, db=0, decode_responses=True)

# Redis prefix to use in front of all keys
redis_key_prefix = 'openrefine_daap:'

# Headers for the HTTP requests made by the tool
headers = {
    'User-Agent': service_name + ' (OpenRefine-Daap reconciliation service)',
}

# Previewing settings

# Dimensions of the preview
zoom_ratio = 1.0
preview_height = 100
preview_width = 400

# Width which should be requested from Commons for the thumbnail
thumbnail_width = 130

# All properties to use to get an image
image_properties = [
    'P18',
    'P14',
    'P15',
    'P158',
    'P181',
    'P242',
    'P1766',
    'P1801',
    'P1846',
    'P2713',
    'P2716',
    'P2910',
    'P3311',
    'P3383',
    'P3451',
    'P1621',
    'P154',
]

# URL pattern to retrieve an image from its filename
image_download_pattern = 'https://upload.wikimedia.org/wikipedia/commons/thumb/%s/%s/%s/%dpx-%s'

# Fallback URL of the image to use when previewing an item with no image
fallback_image_url = this_host + '/static/wikidata.png'

# Alt text of the fallback image
fallback_image_alt = 'Daap'

# Autodescribe endpoint to use.
# This is used to generate automatic descriptions from item contents.
# (Disabled here; autodescribe_endpoint = None turns the feature off.)
autodescribe_endpoint = None

# Property proposal settings

# Default type: entity (Q35120 on Wikidata).
# NOTE(review): 'Q1' is presumably the equivalent item on this Wikibase -- confirm.
default_type_entity = 'Q1'

# Property path used to obtain the type of an item
type_property_path = 'P31'

# Property to follow to fetch properties for a given type
property_for_this_type_property = 'P1963'

# Optional prefix in front of properties in SPARQL-like property paths
wdt_prefix = 'wdt:'

# SPARQL query used to fetch all the subclasses of a given item.
# The '$qid' string will be replaced by the qid whose children should be fetched.
sparql_query_to_fetch_subclasses = """
SELECT ?child WHERE { ?child wdt:P279* wd:$qid }
"""

# SPARQL query used to fetch all the properties which store unique identifiers.
# NOTE(review): the class id Q19847637 is hard-coded; confirm it denotes the
# unique-identifier property class on this Wikibase.
sparql_query_to_fetch_unique_id_properties = """
SELECT ?pid WHERE { ?pid wdt:P31/wdt:P279* wd:Q19847637 }
"""

# SPARQL query used to propose properties to fetch for items of a given class.
# Placeholders: $base_type, $lang, $property_for_this_type and $limit.
sparql_query_to_propose_properties = """
SELECT ?prop ?propLabel ?depth WHERE {
SERVICE gas:service {
    gas:program gas:gasClass "com.bigdata.rdf.graph.analytics.BFS" .
    gas:program gas:in wd:$base_type .
    gas:program gas:out ?out .
    gas:program gas:out1 ?depth .
    gas:program gas:maxIterations 10 .
    gas:program gas:maxVisited 100 .
    gas:program gas:linkType wdt:P279 .
}
SERVICE wikibase:label { bd:serviceParam wikibase:language "$lang" }
?out wdt:$property_for_this_type ?prop .
}
ORDER BY ?depth
LIMIT $limit
"""