A Flask exercise and search machine prototype
import os
import json
import pprint
from math import log
from flask import Markup  # marks a string as safe HTML for Jinja templates
from nltk import sent_tokenize
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')  # word tokenizer that drops punctuation
pp = pprint.PrettyPrinter(indent=4)
def tfidf(query, manifesto, corpus):
    # Term Frequency: occurrences of the query word in this manifesto,
    # divided by the manifesto's total word count
    tf_count = 0
    for word in manifesto:
        if query == word:
            tf_count += 1
    tf = tf_count / len(manifesto)
    # print('count:', tf_count)
    # print('total:', len(manifesto))
    # print('TF - count/total', tf_count/len(manifesto))

    # Inverse Document Frequency: log of the corpus size divided by the
    # number of documents that contain the query word
    idf_count = 0
    for words in corpus:
        if query in words:
            idf_count += 1
    # print('count:', idf_count)
    idf = log(len(corpus) / idf_count)
    # print('documents:', len(corpus))
    # print('documents/count', len(corpus)/idf_count)
    # print('IDF - log(documents/count)', log(len(corpus)/idf_count))

    tfidf_value = tf * idf
    # print('TF-IDF:', tfidf_value)
    return tf_count, idf_count, tfidf_value
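
# Worked example (hypothetical numbers, only to illustrate the formula above):
# if the query word appears 3 times in a 100-word manifesto and occurs in 2 of
# the 5 documents in the corpus, then
#   tf    = 3 / 100       = 0.03
#   idf   = log(5 / 2)    ≈ 0.916
#   tfidf = 0.03 * 0.916  ≈ 0.027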
def load_text_files():
    files = []
    corpus = []
    sentences = {}
    dir = 'txt'
    for f in sorted(os.listdir(dir)):
        with open(dir + '/' + f, "r") as txt:
            text = txt.read()  # full text of one .txt file
        words = tokenizer.tokenize(text)  # tokenize into words, without punctuation
        corpus.append(words)  # all words of one manifesto, in reading order
        s = sent_tokenize(text)  # the same text split into sentences
        manifesto = f.replace('.txt', '')  # filename without extension is the manifesto name
        sentences[manifesto] = s
        files.append(manifesto)  # list of manifesto names
    print('*txt files loaded*')
    return files, corpus, sentences
def create_index():
    files, corpus, sentences = load_text_files()
    index = {}
    # index = {
    #     'Fem manifesto' : {
    #         'words' : {
    #             'aap': 39.2,
    #             'beer': 20.456,
    #             'citroen': 3.21
    #         }
    #     }
    # }
    for i, words in enumerate(corpus):
        manifesto = files[i]
        index[manifesto] = {}
        for word in words:
            tf_count, idf_count, tfidf_value = tfidf(word, words, corpus)
            if 'words' not in index[manifesto]:
                index[manifesto]['words'] = {}
            index[manifesto]['words'][word] = tfidf_value
    with open('index.json', 'w+') as out:
        out.write(json.dumps(index, indent=4, sort_keys=True))
    print('*index created*')
def load_index():
    with open('index.json') as f:
        index = json.loads(f.read())
    return index
def request_results(query):
    query = query.strip()
    files, corpus, sentences = load_text_files()
    index = load_index()
    results = {}
    # results = {
    #     0 : {
    #         'name' : 'Fem_manifesto',
    #         'tfidf' : 0.00041,
    #         'sentences' : [
    #             'This is a first sentence.',
    #             'This is a second sentence.',
    #             'This is a third sentence.'
    #         ]
    #     }
    # }
    # make a list of manifestos that use the query word
    result_matches = []
    for manifesto, d in index.items():
        for word, value in d['words'].items():
            if query == word:
                result_matches.append([value, manifesto])
    result_matches.sort(reverse=True)  # highest tf-idf value first
    for x, result in enumerate(result_matches):
        results[x] = {}
        results[x]['tfidf'] = result[0]
        results[x]['name'] = result[1]
    # pp.pprint(results)
    # make a list of sentences that contain the query word
    # and shape the results object
    for x, manifesto in results.items():
        sents = sentences[manifesto['name']]
        value = manifesto['tfidf'] * 10000  # scale the tf-idf value into a font size
        result_sentences = []
        count = 0
        for s in sents:
            done = False
            for word in tokenizer.tokenize(s):
                if word == query:
                    if count < 3:  # include at most 3 sentences per manifesto in the results
                        count += 1
                        if not done:
                            sentence = s.replace(query, '<strong style="font-size:{}px;">{}</strong>'.format(value, query))
                            html = Markup(sentence)
                            result_sentences.append(html)
                            done = True
        results[x]['sentences'] = result_sentences
    print('*results returned*')
    return results, index
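
# ---------------------------------------------------------------------------
# Usage sketch (illustrative addition, not part of the original prototype):
# a minimal Flask wiring for the helpers above. The route, the 'q' query
# parameter and the 'results.html' template are assumptions made for this
# example, not taken from the original project.
from flask import Flask, request, render_template

app = Flask(__name__)

@app.route('/')
def search():
    query = request.args.get('q', '')
    results = {}
    if query:
        results, _ = request_results(query)
    return render_template('results.html', query=query, results=results)

if __name__ == '__main__':
    if not os.path.exists('index.json'):
        create_index()  # build index.json from the txt/ folder on first run
    app.run(debug=True)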