@@ -9,16 +9,16 @@ tokenizer = RegexpTokenizer(r'\w+') # initialize tokenizer
 import pprint
 pp = pprint.PrettyPrinter(indent=4)
 
-def tfidf(query, manifesto, corpus):
+def tfidf(query, words, corpus):
     # Term Frequency
     tf_count = 0
-    for word in manifesto:
+    for word in words:
         if query == word:
             tf_count += 1
-    tf = tf_count/len(manifesto)
+    tf = tf_count/len(words)
     # print('count:', tf_count)
-    # print('total:', len(manifesto))
-    # print('TF - count/total', tf_count/len(manifesto))
+    # print('total:', len(words))
+    # print('TF - count/total', tf_count/len(words))
 
     # Inverse Document Frequency
     idf_count = 0
@@ -34,7 +34,7 @@ def tfidf(query, manifesto, corpus):
     tfidf_value = tf * idf
     # print('TF-IDF:', tfidf_value)
 
-    return tf_count, idf_count, tfidf_value
+    return tf_count, tf_count, tfidf_value
 
 def load_text_files():
     files = []
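Since the hunks above only show the term-frequency half of `tfidf`, here is a minimal self-contained sketch of the full score for reference. The log-scaled IDF below is an assumption for illustration; the elided middle of the function may compute it differently.

    import math

    def tfidf_sketch(query, words, corpus):
        # TF: share of tokens in this document that equal the query word
        tf = words.count(query) / len(words)
        # document frequency: how many documents contain the query word
        df = sum(1 for doc in corpus if query in doc)
        # IDF: down-weight words that occur in many documents
        # (log-scaled variant -- assumed, not taken from the diff)
        idf = math.log(len(corpus) / df) if df else 0
        return tf * idf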
@@ -43,11 +43,12 @@ def load_text_files():
     dir = 'txt'
 
     for f in sorted(os.listdir(dir)):
-        # manifesto = f.replace('.txt','')
-        manifesto = f
         lines = open(dir+'/'+f, "r").read() # full contents of the .txt file as one string
         words = [word for word in tokenizer.tokenize(lines)] # tokenize words, without punctuation
         corpus.append(words) # all words of one manifesto, in reading order
+        s = sent_tokenize(lines)
+        manifesto = f.replace('.txt','')
+        sentences[manifesto] = s
         files.append(manifesto) # list of filenames
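Note: `sent_tokenize` is NLTK's sentence splitter and needs the punkt tokenizer data installed once per machine; without it the call raises a LookupError.

    import nltk
    nltk.download('punkt')  # one-time download of the sentence tokenizer models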
@@ -65,17 +66,31 @@ def create_index():
    #             'beer': 20.456,
    #             'citroen': 3.21
    #         }
+   #         'tf' : {
+   #             'aap': 4,
+   #             'beer': 6,
+   #             'citroen': 2
+   #         }
+   #         'idf' : {
+   #             'aap': 4,
+   #             'beer': 6,
+   #             'citroen': 2
+   #         }
    #     }
    # }
 
    for i, words in enumerate(corpus):
        manifesto = files[i]
        index[manifesto] = {}
+       index[manifesto]['sentences'] = sentences[manifesto]
        for word in words:
            tf_count, idf_count, tfidf_value = tfidf(word, words, corpus)
            if 'words' not in index[manifesto]:
                index[manifesto]['words'] = {}
            index[manifesto]['words'][word] = tfidf_value
+           if 'tf' not in index[manifesto]:
+               index[manifesto]['tf'] = {}
+           index[manifesto]['tf'][word] = tf_count
 
    with open('index.json','w+') as out:
        out.write(json.dumps(index, indent=4, sort_keys=True))
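Each manifesto entry in index.json now carries its sentence list and raw term counts next to the TF-IDF scores. A sketch of the file's shape, with made-up names and values (note that the 'idf' map from the structure comment above is not actually written yet):

    {
        "manifesto-a": {
            "sentences": ["First sentence of the text.", "Second one."],
            "tf": {"aap": 4, "beer": 6, "citroen": 2},
            "words": {"aap": 10.2, "beer": 20.456, "citroen": 3.21}
        }
    }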
@@ -89,9 +104,10 @@ def load_index():
 
 def request_results(query):
     query = query.strip()
-    files, corpus, sentences = load_text_files()
     f = open('index.json').read()
     index = json.loads(f)
+    files = [manifesto for manifesto, _ in index.items()]
 
     results = {}
 
     # results = {
@@ -108,27 +124,32 @@ def request_results(query):
 
     # make a list of manifestos that use the query word
     result_matches = []
-    for manifesto, d in index.items():
-        for word, value in d['words'].items():
+    for manifesto, _ in index.items():
+        for word, value in index[manifesto]['words'].items():
             if query == word:
-                result_matches.append([value, manifesto])
+                tf = index[manifesto]['tf'][word]
+                total = len(index[manifesto]['words'])
+                sentences = index[manifesto]['sentences']
+                result_matches.append([value, manifesto, tf, total, sentences])
 
     result_matches.sort(reverse=True)
     for x, result in enumerate(result_matches):
         results[x] = {}
         results[x]['tfidf'] = result[0]
         results[x]['name'] = result[1]
+        results[x]['tf'] = result[2]
+        results[x]['total'] = result[3]
+        results[x]['sentences'] = result[4]
 
-    # pp.pprint(results)
+    pp.pprint(results)
 
     # make a list of sentences that contain the query word
     # and shape results object
     for x, manifesto in results.items():
-        sents = sentences[manifesto['name']]
         value = manifesto['tfidf'] * 10000
         result_sentences = []
         count = 0
-        for s in sents:
+        for s in manifesto['sentences']:
             done = 'no'
             for word in tokenizer.tokenize(s):
                 if word == query:
@@ -144,4 +165,25 @@ def request_results(query):
         results[x]['sentences'] = result_sentences
 
     print('*results returned*')
-    return results, index
+    return results, files
+
+def request_ordered():
+    f = open('index.json').read()
+    index = json.loads(f)
+    files = [manifesto for manifesto, _ in index.items()]
+    results = {}
+    for manifesto, _ in index.items():
+        words = sorted([[value, word] for word, value in index[manifesto]['words'].items()], reverse=True)
+        results[manifesto] = words
+    return results, files
+
+# def request_ordered_all():
+#     f = open('index.json').read()
+#     index = json.loads(f)
+#     files = [manifesto for manifesto, _ in index.items()]
+#     results = []
+#     i = 0
+#     for manifesto, _ in index.items():
+#         i += 1
+#         [value, word, i] for word, value in index[manifesto]['words'].items()
+#     return results, files
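A quick way to exercise both entry points once create_index() has written index.json (the query word is hypothetical, not part of the commit):

    results, files = request_results('europa')  # hypothetical query word
    for rank in sorted(results):
        r = results[rank]
        print(rank, r['name'], r['tfidf'], r['tf'], '/', r['total'])

    ordered, files = request_ordered()
    for manifesto, words in ordered.items():
        print(manifesto, words[:3])  # three highest-scoring [tfidf, word] pairs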