This commit is contained in:
mb@mb 2018-09-10 13:52:17 +02:00
parent 59aa39c5ee
commit e6a72588ec
2 changed files with 59 additions and 17 deletions

View File

@ -15,12 +15,12 @@
<input id="submit" type="submit" value="➜"/>
</form>
</div>
<input id="frustrated" type="button" value="I'm feeling frustrated">
<div>(<a href="{{ url_for('lists')}}">lists</a>, <a href="{{ url_for('ordered')}}">ordered</a>)</div>
<div id="txt-list">
<p>Searching through <em>and calculating words of</em> the following txt documents:</p>
<ul>
{% for txt in files %}
{% for txt in files|sort %}
<li>{{txt}}</li>
{% endfor %}
</ul>

View File

@ -9,16 +9,16 @@ tokenizer = RegexpTokenizer(r'\w+') # initialize tokenizer
import pprint
pp = pprint.PrettyPrinter(indent=4)
def tfidf(query, manifesto, corpus):
def tfidf(query, words, corpus):
# Term Frequency
tf_count = 0
for word in manifesto:
for word in words:
if query == word:
tf_count += 1
tf = tf_count/len(manifesto)
tf = tf_count/len(words)
# print('count:', tf_count)
# print('total:', len(manifesto))
# print('TF - count/total', tf_count/len(manifesto))
# print('total:', len(words))
# print('TF - count/total', tf_count/len(words))
# Inverse Document Frequency
idf_count = 0
@ -34,7 +34,7 @@ def tfidf(query, manifesto, corpus):
tfidf_value = tf * idf
# print('TF-IDF:', tfidf_value)
return tf_count, idf_count, tfidf_value
return tf_count, tf_count, tfidf_value
def load_text_files():
files = []
@ -43,11 +43,12 @@ def load_text_files():
dir = 'txt'
for f in sorted(os.listdir(dir)):
# manifesto = f.replace('.txt','')
manifesto = f
lines = open(dir+'/'+f, "r").read() # list of lines in .txt file
words = [word for word in tokenizer.tokenize(lines)] # tokenize words, without punctuation
corpus.append(words) # all words of one manifesto, in reading order
s = sent_tokenize(lines)
manifesto = f.replace('.txt','')
sentences[manifesto] = s
files.append(manifesto) # list of filenames
@ -65,17 +66,31 @@ def create_index():
# 'beer': 20.456,
# 'citroen': 3.21
# }
# 'tf' : {
# 'aap': 4,
# 'beer': 6,
# 'citroen': 2
# }
# 'idf' : {
# 'aap': 4,
# 'beer': 6,
# 'citroen': 2
# }
# }
# }
for i, words in enumerate(corpus):
manifesto = files[i]
index[manifesto] = {}
index[manifesto]['sentences'] = sentences[manifesto]
for word in words:
tf_count, idf_count, tfidf_value = tfidf(word, words, corpus)
if 'words' not in index[manifesto]:
index[manifesto]['words'] = {}
index[manifesto]['words'][word] = tfidf_value
if 'tf' not in index[manifesto]:
index[manifesto]['tf'] = {}
index[manifesto]['tf'][word] = tf_count
with open('index.json','w+') as out:
out.write(json.dumps(index, indent=4, sort_keys=True))
@ -89,9 +104,10 @@ def load_index():
def request_results(query):
query = query.strip()
files, corpus, sentences = load_text_files()
f = open('index.json').read()
index = json.loads(f)
files = [manifesto for manifesto, _ in index.items()]
results = {}
# results = {
@ -108,27 +124,32 @@ def request_results(query):
# make a list of manifesto's that use the query word
result_matches = []
for manifesto, d in index.items():
for word, value in d['words'].items():
for manifesto, _ in index.items():
for word, value in index[manifesto]['words'].items():
if query == word:
result_matches.append([value, manifesto])
tf = index[manifesto]['tf'][word]
total = len(index[manifesto]['words'])
sentences = index[manifesto]['sentences']
result_matches.append([value, manifesto, tf, total, sentences])
result_matches.sort(reverse=True)
for x, result in enumerate(result_matches):
results[x] = {}
results[x]['tfidf'] = result[0]
results[x]['name'] = result[1]
results[x]['tf'] = result[2]
results[x]['total'] = result[3]
results[x]['sentences'] = result[4]
# pp.pprint(results)
pp.pprint(results)
# make a list of sentences that contain the query word
# and shape results object
for x, manifesto in results.items():
sents = sentences[manifesto['name']]
value = manifesto['tfidf'] * 10000
result_sentences = []
count = 0
for s in sents:
for s in manifesto['sentences']:
done = 'no'
for word in tokenizer.tokenize(s):
if word == query:
@ -144,4 +165,25 @@ def request_results(query):
results[x]['sentences'] = result_sentences
print('*results returned*')
return results, index
return results, files
def request_ordered():
    """Return every manifesto's words ranked by their stored score.

    Reads the index from ``index.json`` in the current working directory.

    Returns:
        A ``(results, files)`` tuple. ``results`` maps each manifesto name
        to a list of ``[value, word]`` pairs sorted in descending order
        (by value first, ties broken by word, also descending), where
        ``value`` is the number stored under the manifesto's ``'words'``
        entry. ``files`` lists the manifesto names in index order.

    Raises:
        FileNotFoundError: if ``index.json`` does not exist.
        KeyError: if a manifesto entry has no ``'words'`` mapping.
    """
    # A context manager closes the file handle deterministically instead
    # of leaving it to the garbage collector.
    with open('index.json') as f:
        index = json.load(f)

    files = list(index)
    results = {
        manifesto: sorted(
            ([value, word] for word, value in entry['words'].items()),
            reverse=True,
        )
        for manifesto, entry in index.items()
    }
    return results, files
# def request_ordered_all():
# f = open('index.json').read()
# index = json.loads(f)
# files = [manifesto for manifesto, _ in index.items()]
# results = []
# i = 0
# for manifesto, _ in index.items():
# i += 1
# [value, word, i] for word, value in index[manifesto]['words'].items()
# return results, files