merge

This commit is contained in:
parent 59aa39c5ee
commit e6a72588ec
@@ -15,12 +15,12 @@
 <input id="submit" type="submit" value="➜"/>
 </form>
 </div>
 <input id="frustrated" type="button" value="I'm feeling frustrated">
 <div>(<a href="{{ url_for('lists')}}">lists</a>, <a href="{{ url_for('ordered')}}">ordered</a>)</div>
 <div id="txt-list">
 <p>Searching through <em>and calculating words of</em> the following txt documents:</p>

 <ul>
-{% for txt in files %}
+{% for txt in files|sort %}
 <li>{{txt}}</li>
 {% endfor %}
 <ul>
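Note on the template hunk above: switching to Jinja's built-in `sort` filter makes the filename list render alphabetically no matter how `files` is ordered. A minimal standalone sketch of the filter's behavior (the sample filenames are invented):

    from jinja2 import Template

    tpl = Template("{% for txt in files|sort %}<li>{{txt}}</li>{% endfor %}")
    # |sort is case-insensitive by default, so 'A.txt' sorts before 'b.txt'
    print(tpl.render(files=['b.txt', 'A.txt', 'c.txt']))
    # <li>A.txt</li><li>b.txt</li><li>c.txt</li>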
tfidf.py (72 changed lines)
@@ -9,16 +9,16 @@ tokenizer = RegexpTokenizer(r'\w+') # initialize tokenizer
 import pprint
 pp = pprint.PrettyPrinter(indent=4)

-def tfidf(query, manifesto, corpus):
+def tfidf(query, words, corpus):
     # Term Frequency
     tf_count = 0
-    for word in manifesto:
+    for word in words:
         if query == word:
             tf_count += 1
-    tf = tf_count/len(manifesto)
+    tf = tf_count/len(words)
     # print('count:', tf_count)
-    # print('total:', len(manifesto))
-    # print('TF - count/total', tf_count/len(manifesto))
+    # print('total:', len(words))
+    # print('TF - count/total', tf_count/len(words))

     # Inverse Document Frequency
     idf_count = 0
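Review note: as the hunk above shows, term frequency here is the raw count of the query token divided by the document's total token count. A standalone sketch of that step, with an invented token list:

    words = ['free', 'software', 'free', 'culture']  # invented example document
    query = 'free'
    tf = words.count(query) / len(words)  # same as the loop above: 2 / 4
    print(tf)  # 0.5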
@@ -34,7 +34,7 @@ def tfidf(query, manifesto, corpus):
     tfidf_value = tf * idf
     # print('TF-IDF:', tfidf_value)

-    return tf_count, idf_count, tfidf_value
+    return tf_count, tf_count, tfidf_value

 def load_text_files():
     files = []
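Review note: the new return value repeats tf_count in the slot that used to carry idf_count, so callers unpacking three values (as create_index does below) now receive the term count twice. A sketch of how that lands at the call site, with an invented stand-in and made-up numbers:

    def tfidf_stub(query, words, corpus):  # stand-in for the changed function
        return 2, 2, 0.0042                # tf_count, tf_count, tfidf_value

    tf_count, idf_count, tfidf_value = tfidf_stub('free', [], [])
    print(idf_count)  # 2, actually the term count, not a document count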
@@ -43,11 +43,12 @@ def load_text_files():
     dir = 'txt'

     for f in sorted(os.listdir(dir)):
-        # manifesto = f.replace('.txt','')
-        manifesto = f
         lines = open(dir+'/'+f, "r").read() # list of lines in .txt file
         words = [word for word in tokenizer.tokenize(lines)] # tokenize words, without punctuation
         corpus.append(words) # all words of one manifesto, in reading order
         s = sent_tokenize(lines)
+        manifesto = f.replace('.txt','')
         sentences[manifesto] = s
         files.append(manifesto) # list of filenames

@@ -65,17 +66,31 @@ def create_index():
    #         'beer': 20.456,
    #         'citroen': 3.21
    #     }
+   #     'tf' : {
+   #         'aap': 4,
+   #         'beer': 6,
+   #         'citroen': 2
+   #     }
+   #     'idf' : {
+   #         'aap': 4,
+   #         'beer': 6,
+   #         'citroen': 2
+   #     }
    #     }
    # }

     for i, words in enumerate(corpus):
         manifesto = files[i]
         index[manifesto] = {}
+        index[manifesto]['sentences'] = sentences[manifesto]
         for word in words:
             tf_count, idf_count, tfidf_value = tfidf(word, words, corpus)
             if 'words' not in index[manifesto]:
                 index[manifesto]['words'] = {}
             index[manifesto]['words'][word] = tfidf_value
+            if 'tf' not in index[manifesto]:
+                index[manifesto]['tf'] = {}
+            index[manifesto]['tf'][word] = tf_count

     with open('index.json','w+') as out:
         out.write(json.dumps(index, indent=4, sort_keys=True))
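Review note: with the added 'sentences' and 'tf' fields, each manifesto entry in index.json now carries the sentence list plus two per-word maps (the commented 'idf' block above is sketched but not written by the code). A hypothetical excerpt of the resulting file, with invented names and values:

    {
        "manifesto-a": {
            "sentences": ["First sentence of the text.", "Second sentence."],
            "tf": {"aap": 4, "beer": 6},
            "words": {"aap": 0.0021, "beer": 0.0}
        }
    }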
@@ -89,9 +104,10 @@ def load_index():

 def request_results(query):
     query = query.strip()
-    files, corpus, sentences = load_text_files()
+    f = open('index.json').read()
+    index = json.loads(f)
+    files = [manifesto for manifesto, _ in index.items()]

     results = {}

     # results = {
@@ -108,27 +124,32 @@ def request_results(query):

     # make a list of manifesto's that use the query word
     result_matches = []
-    for manifesto, d in index.items():
-        for word, value in d['words'].items():
+    for manifesto, _ in index.items():
+        for word, value in index[manifesto]['words'].items():
             if query == word:
-                result_matches.append([value, manifesto])
+                tf = index[manifesto]['tf'][word]
+                total = len(index[manifesto]['words'])
+                sentences = index[manifesto]['sentences']
+                result_matches.append([value, manifesto, tf, total, sentences])

     result_matches.sort(reverse=True)
     for x, result in enumerate(result_matches):
         results[x] = {}
         results[x]['tfidf'] = result[0]
         results[x]['name'] = result[1]
+        results[x]['tf'] = result[2]
+        results[x]['total'] = result[3]
+        results[x]['sentences'] = result[4]

-    # pp.pprint(results)
+    pp.pprint(results)

     # make a list of sentences that contain the query word
     # and shape results object
     for x, manifesto in results.items():
-        sents = sentences[manifesto['name']]
         value = manifesto['tfidf'] * 10000
         result_sentences = []
         count = 0
-        for s in sents:
+        for s in manifesto['sentences']:
             done = 'no'
             for word in tokenizer.tokenize(s):
                 if word == query:
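Review note: result_matches entries are now [value, manifesto, tf, total, sentences] lists, so sort(reverse=True) orders matches by TF-IDF value first, falling back to the manifesto name on ties. A simplified illustration with invented values (the extra fields are omitted):

    result_matches = [[0.002, 'manifesto-b'], [0.004, 'manifesto-a'], [0.002, 'manifesto-a']]
    result_matches.sort(reverse=True)
    print(result_matches)
    # [[0.004, 'manifesto-a'], [0.002, 'manifesto-b'], [0.002, 'manifesto-a']]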
@@ -144,4 +165,25 @@ def request_results(query):
         results[x]['sentences'] = result_sentences

     print('*results returned*')
-    return results, index
+    return results, files
+
+def request_ordered():
+    f = open('index.json').read()
+    index = json.loads(f)
+    files = [manifesto for manifesto, _ in index.items()]
+    results = {}
+    for manifesto, _ in index.items():
+        words = sorted([[value, word] for word, value in index[manifesto]['words'].items()], reverse=True)
+        results[manifesto] = words
+    return results, files
+
+# def request_ordered_all():
+#     f = open('index.json').read()
+#     index = json.loads(f)
+#     files = [manifesto for manifesto, _ in index.items()]
+#     results = []
+#     i = 0
+#     for manifesto, _ in index.items():
+#         i += 1
+#         [value, word, i] for word, value in index[manifesto]['words'].items()
+#     return results, files
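Review note: the new request_ordered returns each manifesto's vocabulary ranked by descending TF-IDF weight. A self-contained sketch of the same sorting step over an invented one-document index:

    index = {'manifesto-a': {'words': {'zine': 0.004, 'copyleft': 0.002, 'the': 0.0}}}
    results = {}
    for manifesto, _ in index.items():
        words = sorted([[value, word] for word, value in index[manifesto]['words'].items()], reverse=True)
        results[manifesto] = words
    print(results)
    # {'manifesto-a': [[0.004, 'zine'], [0.002, 'copyleft'], [0.0, 'the']]}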