You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
190 lines
5.0 KiB
190 lines
5.0 KiB
import os, json
|
|
from math import log, exp
|
|
from flask import Markup
|
|
|
|
from nltk import sent_tokenize
|
|
from nltk.tokenize import RegexpTokenizer
|
|
tokenizer = RegexpTokenizer(r'\w+') # initialize tokenizer
|
|
|
|
import pprint
|
|
pp = pprint.PrettyPrinter(indent=4)
|
|
|
|
def tfidf(query, words, corpus):
    """Score one term of one document with TF-IDF.

    Args:
        query: The term to score.
        words: Tokens of the document being scored, in reading order.
        corpus: Collection of all documents; each document only needs to
            support the ``in`` operator (a list or a set of tokens).

    Returns:
        A tuple ``(tf_count, idf_count, tfidf_value)`` where ``tf_count`` is
        the number of occurrences of ``query`` in ``words``, ``idf_count`` is
        the number of documents in ``corpus`` containing ``query``, and
        ``tfidf_value`` is ``(tf_count / len(words)) * log(len(corpus) / idf_count)``.
        ``tfidf_value`` is ``0.0`` when ``words`` is empty or when no document
        contains ``query`` (instead of raising ZeroDivisionError).
    """
    # Term frequency: share of the document's tokens that equal the query.
    tf_count = sum(1 for word in words if word == query)
    total_words = len(words)
    tf = tf_count / total_words if total_words else 0.0

    # Inverse document frequency: log of (documents / documents with the term).
    idf_count = sum(1 for document in corpus if query in document)
    idf = log(len(corpus) / idf_count) if idf_count else 0.0

    return tf_count, idf_count, tf * idf
|
|
|
|
def load_text_files():
    """Read every file in the ``txt`` directory and tokenize it.

    Files are processed in sorted filename order.

    Returns:
        A tuple ``(files, corpus, sentences)``:
        ``files`` is the list of filenames, ``corpus`` is a parallel list
        holding each file's word tokens (punctuation dropped by the
        module-level ``tokenizer``), and ``sentences`` maps each filename to
        the list of sentences produced by ``sent_tokenize``.
    """
    files = []
    corpus = []
    sentences = {}
    directory = 'txt'

    for filename in sorted(os.listdir(directory)):
        # Context manager so the handle is closed even if reading fails.
        with open(os.path.join(directory, filename), "r") as handle:
            text = handle.read()

        corpus.append(list(tokenizer.tokenize(text)))  # all words of one manifesto, in reading order
        sentences[filename] = sent_tokenize(text)
        files.append(filename)

    print('*txt files loaded*')
    return files, corpus, sentences
|
|
|
|
def create_index():
    """Build the TF-IDF index for all text files and write it to ``index.json``.

    The written structure is::

        {
            "<filename>": {
                "sentences": ["First sentence.", ...],
                "words": {"aap": 0.0123, ...},   # TF-IDF value per word
                "tf": {"aap": 4, ...}            # raw occurrence count per word
            }
        }

    Every entry always carries ``words`` and ``tf`` (empty dicts for a file
    without tokens) so readers of the index can rely on those keys.
    """
    files, corpus, sentences = load_text_files()

    # tfidf() only needs len() and `in` on the corpus documents, so hand it
    # sets: membership becomes O(1) instead of scanning every token list.
    corpus_vocabularies = [set(words) for words in corpus]

    index = {}
    for manifesto, words in zip(files, corpus):
        entry = {
            'sentences': sentences[manifesto],
            'words': {},
            'tf': {},
        }
        # Score each distinct word once; repeated occurrences would only
        # recompute and overwrite the same values.
        for word in set(words):
            tf_count, _, tfidf_value = tfidf(word, words, corpus_vocabularies)
            entry['words'][word] = tfidf_value
            entry['tf'][word] = tf_count
        index[manifesto] = entry

    # The with-block closes the file; no explicit close() is needed.
    with open('index.json', 'w') as out:
        json.dump(index, out, indent=4, sort_keys=True)

    print('*index created*')
|
|
|
|
def load_index():
    """Load and return the parsed contents of ``index.json``.

    Returns:
        The decoded JSON document (a dict keyed by filename when the file was
        written by ``create_index``).
    """
    # Context manager so the file handle is released right after parsing.
    with open('index.json') as index_file:
        return json.load(index_file)
|
|
|
|
def request_results(query):
    """Look up a single query word in ``index.json`` and shape the results.

    Args:
        query: The word to search for; surrounding whitespace is stripped.
            It must match an indexed word exactly.

    Returns:
        A tuple ``(results, files)``. ``files`` lists every filename in the
        index. ``results`` maps a rank (0 = highest TF-IDF) to::

            {
                'tfidf': 0.00041,
                'name': 'Fem_manifesto',
                'tf': 4,                 # occurrences of the word
                'total': 1200,           # number of distinct indexed words
                'sentences': [Markup, ...]  # at most 3 highlighted sentences
            }

    NOTE(review): sentences are wrapped in ``Markup`` without escaping, so the
    corpus text is rendered as raw HTML — assumes the txt files are trusted.
    """
    query = query.strip()
    with open('index.json') as index_file:
        index = json.load(index_file)
    files = list(index)

    # Collect every manifesto whose vocabulary contains the query word.
    result_matches = []
    for manifesto, entry in index.items():
        words = entry.get('words', {})
        if query in words:
            result_matches.append(
                [words[query], manifesto, entry['tf'][query], len(words), entry['sentences']]
            )

    # Highest TF-IDF first; filenames are unique so ties never compare sentences.
    result_matches.sort(reverse=True)

    results = {}
    for rank, (value, name, tf, total, sentences) in enumerate(result_matches):
        results[rank] = {
            'tfidf': value,
            'name': name,
            'tf': tf,
            'total': total,
            'sentences': sentences,
        }

    pp.pprint(results)

    # Replace each result's full sentence list with up to `max_sentences`
    # sentences that contain the query word, with the word highlighted.
    max_sentences = 3
    for result in results.values():
        # The font size of the highlight grows with the TF-IDF value.
        value = result['tfidf'] * 50000
        highlight = '<strong style="font-size:{}%;">{}</strong>'.format(100 + value, query)

        result_sentences = []
        for s in result['sentences']:
            # Match on whole tokens so substrings of other words do not count.
            if query not in tokenizer.tokenize(s):
                continue
            html = Markup(s.replace(query, highlight))
            if len(result_sentences) + 1 == max_sentences:
                # Mark the last shown sentence to signal the list is capped.
                html = html + Markup('<div id="more">(...)<sup>*</sup></div>')
            result_sentences.append(html)
            if len(result_sentences) == max_sentences:
                break
        result['sentences'] = result_sentences

    print('*results returned*')
    return results, files
|
|
|
|
def request_ordered():
    """Return every manifesto's words ordered by descending TF-IDF value.

    Returns:
        A tuple ``(results, files)``. ``files`` lists every filename in
        ``index.json``; ``results`` maps each filename to a list of
        ``[value, word]`` pairs sorted from highest to lowest value (ties
        broken by reverse alphabetical word order). A manifesto without a
        ``words`` entry maps to an empty list.
    """
    # Context manager so the file handle is released right after parsing.
    with open('index.json') as index_file:
        index = json.load(index_file)

    files = list(index)
    results = {}
    for manifesto, entry in index.items():
        pairs = [[value, word] for word, value in entry.get('words', {}).items()]
        pairs.sort(reverse=True)
        results[manifesto] = pairs
    return results, files
|
|
|
|
# def request_ordered_all():
|
|
# f = open('index.json').read()
|
|
# index = json.loads(f)
|
|
# files = [manifesto for manifesto, _ in index.items()]
|
|
# results = []
|
|
# i = 0
|
|
# for manifesto, _ in index.items():
|
|
# i += 1
|
|
# [value, word, i] for word, value in index[manifesto]['words'].items()
|
|
# return results, files
|
|
|