@@ -9,16 +9,16 @@ tokenizer = RegexpTokenizer(r'\w+') # initialize tokenizer
 import pprint
 pp = pprint.PrettyPrinter(indent=4)
 
-def tfidf(query, manifesto, corpus):
+def tfidf(query, words, corpus):
     # Term Frequency
     tf_count = 0
-    for word in manifesto:
+    for word in words:
         if query == word:
             tf_count += 1
-    tf = tf_count/len(manifesto)
+    tf = tf_count/len(words)
     # print('count:', tf_count)
-    # print('total:', len(manifesto))
-    # print('TF - count/total', tf_count/len(manifesto))
+    # print('total:', len(words))
+    # print('TF - count/total', tf_count/len(words))
 
     # Inverse Document Frequency
     idf_count = 0
@@ -34,7 +34,7 @@ def tfidf(query, manifesto, corpus):
     tfidf_value = tf * idf
     # print('TF-IDF:', tfidf_value)
 
-    return tf_count, idf_count, tfidf_value
+    return tf_count, tf_count, tfidf_value
 
 def load_text_files():
     files = []
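Since the hunks above only show the term-frequency half of `tfidf`, here is a minimal self-contained sketch of the full score for reference. The log-scaled IDF below is an assumption for illustration; the elided middle of the function may compute it differently.

    import math

    def tfidf_sketch(query, words, corpus):
        # TF: share of tokens in this document that equal the query word
        tf = words.count(query) / len(words)
        # document frequency: how many documents contain the query word
        df = sum(1 for doc in corpus if query in doc)
        # IDF: down-weight words that occur in many documents
        # (log-scaled variant -- assumed, not taken from the diff)
        idf = math.log(len(corpus) / df) if df else 0
        return tf * idf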
@@ -43,11 +43,12 @@ def load_text_files():
     dir = 'txt'
 
     for f in sorted(os.listdir(dir)):
-        # manifesto = f.replace('.txt','')
-        manifesto = f
         lines = open(dir+'/'+f, "r").read() # full contents of the .txt file as one string
         words = [word for word in tokenizer.tokenize(lines)] # tokenize words, without punctuation
         corpus.append(words) # all words of one manifesto, in reading order
+        s = sent_tokenize(lines)
+        manifesto = f.replace('.txt','')
+        sentences[manifesto] = s
         files.append(manifesto) # list of filenames
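Note: `sent_tokenize` is NLTK's sentence splitter and needs the punkt tokenizer data installed once per machine; without it the call raises a LookupError.

    import nltk
    nltk.download('punkt')  # one-time download of the sentence tokenizer models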
@@ -65,17 +66,31 @@ def create_index():
    #             'beer': 20.456,
    #             'citroen': 3.21
    #         }
+   #         'tf' : {
+   #             'aap': 4,
+   #             'beer': 6,
+   #             'citroen': 2
+   #         }
+   #         'idf' : {
+   #             'aap': 4,
+   #             'beer': 6,
+   #             'citroen': 2
+   #         }
    #     }
    # }
 
    for i, words in enumerate(corpus):
        manifesto = files[i]
        index[manifesto] = {}
+       index[manifesto]['sentences'] = sentences[manifesto]
        for word in words:
            tf_count, idf_count, tfidf_value = tfidf(word, words, corpus)
            if 'words' not in index[manifesto]:
                index[manifesto]['words'] = {}
            index[manifesto]['words'][word] = tfidf_value
+           if 'tf' not in index[manifesto]:
+               index[manifesto]['tf'] = {}
+           index[manifesto]['tf'][word] = tf_count
 
    with open('index.json','w+') as out:
        out.write(json.dumps(index, indent=4, sort_keys=True))
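Each manifesto entry in index.json now carries its sentence list and raw term counts next to the TF-IDF scores. A sketch of the file's shape, with made-up names and values (note that the 'idf' map from the structure comment above is not actually written yet):

    {
        "manifesto-a": {
            "sentences": ["First sentence of the text.", "Second one."],
            "tf": {"aap": 4, "beer": 6, "citroen": 2},
            "words": {"aap": 10.2, "beer": 20.456, "citroen": 3.21}
        }
    }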
@@ -89,9 +104,10 @@ def load_index():
 
 def request_results(query):
     query = query.strip()
-    files, corpus, sentences = load_text_files()
     f = open('index.json').read()
     index = json.loads(f)
+    files = [manifesto for manifesto, _ in index.items()]
 
     results = {}
 
     # results = {
@@ -108,27 +124,32 @@ def request_results(query):
 
     # make a list of manifestos that use the query word
     result_matches = []
-    for manifesto, d in index.items():
-        for word, value in d['words'].items():
+    for manifesto, _ in index.items():
+        for word, value in index[manifesto]['words'].items():
             if query == word:
-                result_matches.append([value, manifesto])
+                tf = index[manifesto]['tf'][word]
+                total = len(index[manifesto]['words'])
+                sentences = index[manifesto]['sentences']
+                result_matches.append([value, manifesto, tf, total, sentences])
 
     result_matches.sort(reverse=True)
     for x, result in enumerate(result_matches):
         results[x] = {}
         results[x]['tfidf'] = result[0]
         results[x]['name'] = result[1]
+        results[x]['tf'] = result[2]
+        results[x]['total'] = result[3]
+        results[x]['sentences'] = result[4]
 
-    # pp.pprint(results)
+    pp.pprint(results)
 
     # make a list of sentences that contain the query word
     # and shape results object
     for x, manifesto in results.items():
-        sents = sentences[manifesto['name']]
         value = manifesto['tfidf'] * 10000
         result_sentences = []
         count = 0
-        for s in sents:
+        for s in manifesto['sentences']:
             done = 'no'
             for word in tokenizer.tokenize(s):
                 if word == query:
@@ -144,4 +165,25 @@ def request_results(query):
         results[x]['sentences'] = result_sentences
 
     print('*results returned*')
-    return results, index
+    return results, files
+
+def request_ordered():
+    f = open('index.json').read()
+    index = json.loads(f)
+    files = [manifesto for manifesto, _ in index.items()]
+    results = {}
+    for manifesto, _ in index.items():
+        words = sorted([[value, word] for word, value in index[manifesto]['words'].items()], reverse=True)
+        results[manifesto] = words
+    return results, files
+
+# def request_ordered_all():
+#     f = open('index.json').read()
+#     index = json.loads(f)
+#     files = [manifesto for manifesto, _ in index.items()]
+#     results = []
+#     i = 0
+#     for manifesto, _ in index.items():
+#         i += 1
+#         [value, word, i] for word, value in index[manifesto]['words'].items()
+#     return results, files
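A quick way to exercise both entry points once create_index() has written index.json (the query word is hypothetical, not part of the commit):

    results, files = request_results('europa')  # hypothetical query word
    for rank in sorted(results):
        r = results[rank]
        print(rank, r['name'], r['tfidf'], r['tf'], '/', r['total'])

    ordered, files = request_ordered()
    for manifesto, words in ordered.items():
        print(manifesto, words[:3])  # three highest-scoring [tfidf, word] pairs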