import os
import json
from math import log
from flask import Markup  # on newer Flask versions, Markup lives in markupsafe: from markupsafe import Markup
from nltk import sent_tokenize
from nltk.tokenize import RegexpTokenizer
import pprint

tokenizer = RegexpTokenizer(r'\w+')  # word tokenizer that drops punctuation
pp = pprint.PrettyPrinter(indent=4)

def tfidf(query, manifesto, corpus):
    # Term Frequency: occurrences of the query in this manifesto,
    # normalised by the manifesto's length
    tf_count = 0
    for word in manifesto:
        if query == word:
            tf_count += 1
    tf = tf_count / len(manifesto)
    # print('count:', tf_count)
    # print('total:', len(manifesto))
    # print('TF - count/total', tf_count/len(manifesto))

    # Inverse Document Frequency: log of the total number of documents
    # divided by the number of documents that contain the query
    idf_count = 0
    for words in corpus:
        if query in words:
            idf_count += 1
    # print('count:', idf_count)
    idf = log(len(corpus) / idf_count)
    # print('documents:', len(corpus))
    # print('documents/count', len(corpus)/idf_count)
    # print('IDF - log(documents/count)', log(len(corpus)/idf_count))

    tfidf_value = tf * idf
    # print('TF-IDF:', tfidf_value)
    return tf_count, idf_count, tfidf_value

def load_text_files():
    files = []
    corpus = []
    sentences = {}
    txt_dir = 'txt'
    for f in sorted(os.listdir(txt_dir)):
        with open(os.path.join(txt_dir, f), 'r') as txt_file:
            lines = txt_file.read()  # full contents of the .txt file, as one string
        words = tokenizer.tokenize(lines)  # tokenized words, without punctuation
        corpus.append(words)  # all words of one manifesto, in reading order
        s = sent_tokenize(lines)
        manifesto = f.replace('.txt', '')
        sentences[manifesto] = s
        files.append(manifesto)  # list of filenames, without the .txt extension
    print('*txt files loaded*')
    return files, corpus, sentences

def create_index():
    files, corpus, sentences = load_text_files()
    index = {}
    # index = {
    #     'Fem_manifesto': {
    #         'words': {
    #             'aap': 39.2,
    #             'beer': 20.456,
    #             'citroen': 3.21
    #         }
    #     }
    # }
    for i, words in enumerate(corpus):
        manifesto = files[i]
        index[manifesto] = {'words': {}}
        # note: tfidf() is recomputed for every token, including repeated words
        for word in words:
            tf_count, idf_count, tfidf_value = tfidf(word, words, corpus)
            index[manifesto]['words'][word] = tfidf_value
    with open('index.json', 'w+') as out:
        out.write(json.dumps(index, indent=4, sort_keys=True))
    print('*index created*')

def load_index():
    with open('index.json') as f:
        index = json.loads(f.read())
    return index

def request_results(query):
    query = query.strip()
    files, corpus, sentences = load_text_files()
    index = load_index()
    results = {}
    # results = {
    #     0: {
    #         'name': 'Fem_manifesto',
    #         'tfidf': 0.00041,
    #         'sentences': [
    #             'This is a first sentence.',
    #             'This is a second sentence.',
    #             'This is a third sentence.'
    #         ]
    #     }
    # }

    # make a list of manifestos that use the query word
    result_matches = []
    for manifesto, d in index.items():
        for word, value in d['words'].items():
            if query == word:
                result_matches.append([value, manifesto])

    result_matches.sort(reverse=True)
    for x, result in enumerate(result_matches):
        results[x] = {}
        results[x]['tfidf'] = result[0]
        results[x]['name'] = result[1]
    # pp.pprint(results)

    # make a list of sentences that contain the query word
    # and shape the results object
    for x, manifesto in results.items():
        sents = sentences[manifesto['name']]
        value = manifesto['tfidf'] * 10000
        result_sentences = []
        count = 0
        for s in sents:
            done = False
            for word in tokenizer.tokenize(s):
                if word == query:
                    if count < 3:  # include at most 3 sentences per manifesto in the results
                        count += 1
                        if not done:
                            # NOTE: the original HTML wrapper around the match was lost in
                            # extraction; this <span>, scaled by the TF-IDF value, is an
                            # assumption about its shape
                            sentence = s.replace(query, '<span style="font-size: {}%">{}</span>'.format(value, query))
                            html = Markup(sentence)
                            result_sentences.append(html)
                            done = True
        results[x]['sentences'] = result_sentences
    print('*results returned*')
    return results, index
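
# ---------------------------------------------------------------------------
# Minimal usage sketch. This module appears to be driven by a Flask route in
# the full project; the __main__ driver and the query word 'freedom' below are
# hypothetical, added only to show the call order: create_index() must run
# once (writing index.json) before request_results() can read it back.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    create_index()  # build index.json from the .txt files in txt/
    results, index = request_results('freedom')
    for rank, hit in results.items():
        print(rank, hit['name'], hit['tfidf'])
        for sentence in hit['sentences']:
            print('   ', sentence)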