From 1fda8c792570cc26b269dc18a6bdb59b9fb211d0 Mon Sep 17 00:00:00 2001
From: manetta <mail@manettaberends.nl>
Date: Mon, 15 Apr 2019 12:27:27 +0200
Subject: [PATCH] Short round of debugging this prototype, keeping the option
 to use multiple languages (add [EN], [NL], [FR] or any other tag you want
 to the filename of a document to let the tool search only within that
 language)

---
 cross-reader.tfidf/readings.py |  4 ++--
 cross-reader.tfidf/start.py    |  4 ++--
 cross-reader.tfidf/tfidf.py    | 29 ++++++++++++++++++++++-------
 cross-reader.tfidf/words.txt   |  2 +-
 4 files changed, 27 insertions(+), 12 deletions(-)
diff --git a/cross-reader.tfidf/readings.py b/cross-reader.tfidf/readings.py
index 0df4679..212c031 100644
--- a/cross-reader.tfidf/readings.py
+++ b/cross-reader.tfidf/readings.py
@@ -119,11 +119,11 @@ def request_results(query):
 	# 	}
 	# }
 
-	# First, sort the matching documents on TF-IDF values
+	# First, check which documents use the query 
 	order = []
 	for document, _ in index.items():
 		for key in index[document]['tfidf'].keys():
-			if query == key.lower():
+			if query == key.strip().lower():
 				print('Query match:', query)
 				match = (index[document]['tfidf'][key.lower()], document) # lowercased! (!important)
 				order.append(match)
diff --git a/cross-reader.tfidf/start.py b/cross-reader.tfidf/start.py
index eb1791b..4c47a32 100644
--- a/cross-reader.tfidf/start.py
+++ b/cross-reader.tfidf/start.py
@@ -49,7 +49,7 @@ def index():
 @APP.route('/document/<name>', methods=['GET', 'POST'])
 def open_document(name):
 	""" 
-		Open document.
+		Read document.
 	"""
 	index = readings.load_index()
 	suggestions = open('words.txt', 'r').readlines()
@@ -94,4 +94,4 @@ if __name__ == '__main__':
 	if not 'index.json' in os.listdir('.'):
 		tfidf.create_index()
 	APP.debug=True
-	APP.run()
\ No newline at end of file
+	APP.run(port=5001)
\ No newline at end of file
diff --git a/cross-reader.tfidf/tfidf.py b/cross-reader.tfidf/tfidf.py
index a125203..0dff8db 100644
--- a/cross-reader.tfidf/tfidf.py
+++ b/cross-reader.tfidf/tfidf.py
@@ -37,7 +37,11 @@ def tfidf(query, words, corpus):
 	return tf_count, idf_count, tfidf_value 
 
 def get_language(document):
-	language = re.search(r'\[.*\]', document, flags=re.IGNORECASE).group().replace('[','').replace(']','').lower()
+	"""Return the lowercased [TAG] from a document filename, or 'undefined'."""
+	# Capture the first bracket pair only, so that a second pair later in
+	# the filename ('[EN] text [v2].txt') does not leak into the language.
+	match = re.search(r'\[([^\[\]]*)\]', document)
+	language = match.group(1).lower() if match else 'undefined'
 	return language
 
 def load_text_files():
@@ -59,6 +63,7 @@ def load_text_files():
 		sentences[document] = s
 		files.append(document) # list of filenames
 
+	print('---------')
 	print('*txt files loaded*')
 	return files, corpus, sentences, wordlists
 
@@ -72,21 +77,28 @@ def create_index():
 
 	# index = {
 	# 	Fem document : {
-	# 		'tfidf' : {
-	# 			'aap': 39.2,
-	# 			'beer': 20.456,
-	# 			'citroen': 3.21
-	# 		},
+	# 		'sentences' : [],
 	# 		'tf' : {
 	# 			'aap': 4,
 	# 			'beer': 6,
 	# 			'citroen': 2
 	# 		},
+	# 		'idf' : {
+	# 			'aap': 2,
+	# 			'beer': 1,
+	# 			'citroen': 5
+	# 		},
+	# 		'tfidf' : {
+	# 			'aap': 39.2,
+	# 			'beer': 20.456,
+	# 			'citroen': 3.21
+	# 		},
 	#		'name': 'Feminist document (2000)',
-	#		'language': 'en'
+	# 		'language': 'en'
 	# 	}
 	# }
 
+
 	for document in files:
 		print('---------')
 		print('document:', document)
@@ -104,12 +116,15 @@ def create_index():
 			if 'tfidf' not in index[document]:
 				index[document]['tfidf'] = {}
 			index[document]['tfidf'][word] = tfidf_value
+			index[document]['language'] = get_language(document)
 
 		index[document]['name'] = make_human_readable_name(document)
 
 	with open('index.json','w+') as out:
 		out.write(json.dumps(index, indent=4, sort_keys=True))
 		out.close()
+	print('---------')
 	print('*index created*')
+	print('---------')
 
 # create_index()
\ No newline at end of file
diff --git a/cross-reader.tfidf/words.txt b/cross-reader.tfidf/words.txt
index 280aa26..7187db3 100644
--- a/cross-reader.tfidf/words.txt
+++ b/cross-reader.tfidf/words.txt
@@ -1,2 +1,2 @@
-lorum
+lorem
 ipsum