Short round of debugging this prototype, keeping the option to use multiple languages (add [EN], [NL], [FR] or any other tag you want to the filename of a document to let the tool search only within that language)

5 years ago · 1fda8c7925
4 changed files with 27 additions and 12 deletions
--- a/cross-reader.tfidf/readings.py
+++ b/cross-reader.tfidf/readings.py
@@ -119,11 +119,11 @@ def request_results(query):
 	# 	}
 	# }

-	# First, sort the matching documents on TF-IDF values
+	# First, check which documents use the query 
 	order = []
 	for document, _ in index.items():
 		for key in index[document]['tfidf'].keys():
-			if query == key.lower():
+			if query == key.strip().lower():
 				print('Query match:', query)
 				match = (index[document]['tfidf'][key.lower()], document) # lowercased! (!important)
 				order.append(match)
--- a/cross-reader.tfidf/start.py
+++ b/cross-reader.tfidf/start.py
@@ -49,7 +49,7 @@ def index():
 @APP.route('/document/<name>', methods=['GET', 'POST'])
 def open_document(name):
 	""" 
-		Open document.
+		Read document.
 	"""
 	index = readings.load_index()
 	suggestions = open('words.txt', 'r').readlines()
@@ -94,4 +94,4 @@ if __name__ == '__main__':
 	if not 'index.json' in os.listdir('.'):
 		tfidf.create_index()
 	APP.debug=True
-	APP.run()
+	APP.run(port=5001)
--- a/cross-reader.tfidf/tfidf.py
+++ b/cross-reader.tfidf/tfidf.py
@@ -37,7 +37,11 @@ def tfidf(query, words, corpus):
 	return tf_count, idf_count, tfidf_value 

 def get_language(document):
-	language = re.search(r'\[.*\]', document, flags=re.IGNORECASE).group().replace('[','').replace(']','').lower()
+	match = re.search(r'\[.*\]', document, flags=re.IGNORECASE)
+	if match:
+		language = match.group().replace('[','').replace(']','').lower()
+	else:
+		language = 'undefined'
 	return language

 def load_text_files():
@@ -59,6 +63,7 @@ def load_text_files():
 		sentences[document] = s
 		files.append(document) # list of filenames

+	print('---------')
 	print('*txt files loaded*')
 	return files, corpus, sentences, wordlists

@@ -72,21 +77,28 @@ def create_index():

 	# index = {
 	# 	Fem document : {
-	# 		'tfidf' : {
-	# 			'aap': 39.2,
-	# 			'beer': 20.456,
-	# 			'citroen': 3.21
-	# 		},
+	# 		'sentences' : [],
 	# 		'tf' : {
 	# 			'aap': 4,
 	# 			'beer': 6,
 	# 			'citroen': 2
 	# 		},
+	# 		'idf' : {
+	# 			'aap': 2,
+	# 			'beer': 1,
+	# 			'citroen': 5
+	# 		},
+	# 		'tfidf' : {
+	# 			'aap': 39.2,
+	# 			'beer': 20.456,
+	# 			'citroen': 3.21
+	# 		},
 	#		'name': 'Feminist document (2000)',
-	#		'language': 'en'
+	# 		'language': 'en'
 	# 	}
 	# }

+
 	for document in files:
 		print('---------')
 		print('document:', document)
@@ -104,12 +116,15 @@ def create_index():
 			if 'tfidf' not in index[document]:
 				index[document]['tfidf'] = {}
 			index[document]['tfidf'][word] = tfidf_value
+			index[document]['language'] = get_language(document)

 		index[document]['name'] = make_human_readable_name(document)

 	with open('index.json','w+') as out:
 		out.write(json.dumps(index, indent=4, sort_keys=True))
 		out.close()
+	print('---------')
 	print('*index created*')
+	print('---------')

 # create_index()
--- a/cross-reader.tfidf/words.txt
+++ b/cross-reader.tfidf/words.txt
@@ -1,2 +1,2 @@
-lorum
+lorem
 ipsum