"""Build and query a TF-IDF index over the plain-text manifestos in ``txt/``.

``create_index`` writes ``index.json``; the ``request_*`` functions read it
back and shape the data for display.
"""

import json
import os
import pprint
import re
from collections import Counter
from math import log, exp

# NOTE(review): recent Flask releases no longer re-export ``Markup``; if this
# import fails after an upgrade, it is available as ``markupsafe.Markup``.
from flask import Markup
from nltk import sent_tokenize
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')  # word tokens only, punctuation dropped

pp = pprint.PrettyPrinter(indent=4)

_TEXT_DIR = 'txt'
_INDEX_PATH = 'index.json'


def _tfidf_value(tf_count, total_words, doc_count, docs_with_term):
    """Return (tf_count / total_words) * log(doc_count / docs_with_term).

    Yields 0.0 instead of dividing by zero when the document is empty or no
    document in the corpus contains the term.
    """
    if not total_words or not docs_with_term:
        return 0.0
    tf = tf_count / total_words
    idf = log(doc_count / docs_with_term)
    return tf * idf


def tfidf(query, words, corpus):
    """Compute the TF-IDF score of ``query`` for one document.

    Args:
        query: the term to score.
        words: the tokens of the document being scored.
        corpus: a list of token lists, one per document.

    Returns:
        A tuple ``(tf_count, idf_count, tfidf_value)``: the number of times
        ``query`` occurs in ``words``, the number of documents in ``corpus``
        that contain ``query``, and the resulting TF-IDF value. The value is
        0.0 when ``words`` is empty or no document contains ``query``.
    """
    # Term frequency: raw occurrences of the query in this document.
    tf_count = sum(1 for word in words if word == query)

    # Document frequency: how many documents mention the query at all.
    idf_count = sum(1 for document in corpus if query in document)

    tfidf_value = _tfidf_value(tf_count, len(words), len(corpus), idf_count)
    return tf_count, idf_count, tfidf_value


def load_text_files():
    """Read and tokenize every file in the ``txt`` directory.

    Returns:
        A tuple ``(files, corpus, sentences)`` where ``files`` is the sorted
        list of filenames, ``corpus`` holds one list of word tokens per file
        (in reading order, aligned with ``files``), and ``sentences`` maps
        each filename to the list of sentences found in it.
    """
    files = []
    corpus = []
    sentences = {}

    for filename in sorted(os.listdir(_TEXT_DIR)):
        with open(os.path.join(_TEXT_DIR, filename), "r") as source:
            text = source.read()  # full text of the file as one string
        corpus.append(tokenizer.tokenize(text))
        sentences[filename] = sent_tokenize(text)
        files.append(filename)

    print('*txt files loaded*')
    return files, corpus, sentences


def create_index():
    """Compute TF-IDF for every word of every manifesto and write index.json.

    The written structure is::

        {
            '<filename>': {
                'sentences': ['First sentence.', 'Second sentence.'],
                'words': {'aap': 0.0392, 'beer': 0.0204},   # TF-IDF value
                'tf': {'aap': 4, 'beer': 6}                 # raw counts
            }
        }
    """
    files, corpus, sentences = load_text_files()

    # Count, once, in how many documents each word appears. Scoring each word
    # individually against the whole corpus would rescan every document for
    # every token.
    doc_frequency = Counter()
    for words in corpus:
        doc_frequency.update(set(words))
    doc_count = len(corpus)

    index = {}
    for manifesto, words in zip(files, corpus):
        total_words = len(words)
        # 'words' and 'tf' are always present, even for an empty file, so the
        # request_* functions can rely on them.
        entry = {'sentences': sentences[manifesto], 'words': {}, 'tf': {}}
        for word, tf_count in Counter(words).items():
            entry['words'][word] = _tfidf_value(
                tf_count, total_words, doc_count, doc_frequency[word])
            entry['tf'][word] = tf_count
        index[manifesto] = entry

    with open(_INDEX_PATH, 'w') as out:
        json.dump(index, out, indent=4, sort_keys=True)
    print('*index created*')


def load_index():
    """Return the parsed contents of index.json."""
    with open(_INDEX_PATH) as source:
        return json.load(source)


def request_results(query):
    """Find the manifestos that use ``query`` and the sentences containing it.

    Args:
        query: the search word; surrounding whitespace is ignored.

    Returns:
        A tuple ``(results, files)``. ``files`` lists every manifesto in the
        index. ``results`` maps a rank (0 = highest TF-IDF) to::

            {
                'tfidf': 0.00041,
                'name': '<filename>',
                'tf': 4,                # occurrences of the query
                'total': 1200,          # number of distinct words indexed
                'sentences': [Markup(...), ...]
            }
    """
    query = query.strip()
    index = load_index()
    files = list(index)

    # Collect the manifestos that use the query word.
    matches = []
    for manifesto, entry in index.items():
        if query in entry['words']:
            matches.append({
                'tfidf': entry['words'][query],
                'name': manifesto,
                'tf': entry['tf'][query],
                'total': len(entry['words']),
                'sentences': entry['sentences'],
            })

    # Highest score first; equal scores fall back to reverse name order.
    matches.sort(key=lambda match: (match['tfidf'], match['name']),
                 reverse=True)
    results = dict(enumerate(matches))

    pp.pprint(results)

    # The query is escaped so that input containing regex metacharacters
    # cannot break the pattern.
    pattern = re.compile(r'[ .,;/\\*]' + re.escape(query) + r'[ ,.;/\\*]')

    # Keep only the sentences that contain the query word, each at most once.
    # A cap of three sentences per manifesto was sketched here earlier but is
    # not enabled.
    for result in results.values():
        value = result['tfidf'] * 50000
        # NOTE(review): the template has a single placeholder, so ``query`` is
        # ignored and the matched word is replaced by the number alone.
        # Presumably surrounding HTML markup was lost -- confirm against the
        # templates before changing it.
        replacement = ' {} '.format(100 + value, query)
        result['sentences'] = [
            Markup(pattern.sub(replacement, sentence))
            for sentence in result['sentences']
            if query in tokenizer.tokenize(sentence)
        ]

    print('*results returned*')
    return results, files


def request_ordered():
    """Return each manifesto's words ordered by descending TF-IDF.

    Returns:
        A tuple ``(results, files)`` where ``results`` maps each filename to
        a list of ``[value, word]`` pairs, highest value first, and ``files``
        lists every manifesto in the index.
    """
    index = load_index()
    files = list(index)
    results = {
        manifesto: sorted(
            ([value, word] for word, value in entry['words'].items()),
            reverse=True)
        for manifesto, entry in index.items()
    }
    return results, files


def request_ordered_all():
    """Return the words of all manifestos in one list, ordered by TF-IDF.

    Returns:
        A tuple ``(results, files)`` where ``results`` is an ascending list
        of ``[value, word, i]`` entries and ``files`` lists every manifesto
        in the index. ``i`` is the position of the word's manifesto in
        ``files``.
    """
    index = load_index()
    files = list(index)
    results = sorted(
        [value, word, i]
        for i, entry in enumerate(index.values())
        for word, value in entry['words'].items()
    )
    return results, files