import os, json, re
from random import randint
from flask import Markup  # note: in Flask >= 2.3, import Markup from markupsafe instead
import nltk
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')  # initialize tokenizer
import pprint
pp = pprint.PrettyPrinter(indent=4)
import tfidf

# TF-IDF visualisation multiplier
multiplier = 25000

def load_index():
    # Build the index first if it does not exist yet
    if not os.path.isfile('index.json'):
        tfidf.create_index()
    with open('index.json') as f:
        index = json.load(f)
    return index

def get_random(x, y):
    return randint(x, y)

def generate_random_rgb():
    r = get_random(0, 255)
    g = get_random(0, 255)
    b = get_random(0, 255)
    return r, g, b

def request_mappings_all():
    index = load_index()
    filenames = [manifesto for manifesto, _ in index.items()]
    mappings = {}
    for manifesto, _ in index.items():
        words = []
        for sentence in index[manifesto]['sentences']:
            for word in tokenizer.tokenize(sentence):
                # "tfidf_value" avoids shadowing the imported tfidf module
                tfidf_value = index[manifesto]['tfidf'][word] * multiplier
                if [tfidf_value, word] not in words:
                    words.append([tfidf_value, word])
        words.sort(reverse=True)
        mappings[manifesto] = words
    return mappings

def request_mappings(name):
    index = load_index()
    filenames = [manifesto for manifesto, _ in index.items()]
    mappings = {}
    for manifesto, _ in index.items():
        if manifesto == name:
            sentences = []
            for sentence in index[manifesto]['sentences']:
                words = []
                for word in tokenizer.tokenize(sentence):
                    tfidf_value = index[manifesto]['tfidf'][word] * multiplier
                    words.append([word, tfidf_value])
                sentences.append(words)
            mappings[manifesto] = sentences
    return mappings, filenames

def insert_query_highlight(query, sentence, r, g, b):
    # Escape the query so regex metacharacters in user input cannot break the pattern
    query = re.escape(query)
    pattern = r'[\s\W\_]' + query + r'[\s\W\_]|^' + query + '|' + query + '$'
    match = re.search(pattern, sentence, flags=re.IGNORECASE)
    if match:
        match = match.group()
        # The original format string lost its markup; the <span> below is an assumed
        # reconstruction that colours the matched query with the manifesto's RGB values.
        highlight = ' <span style="color: rgb({r}, {g}, {b})">{match}</span> '.format(match=match, r=r, g=g, b=b)
        sentence = re.sub(pattern, highlight, sentence, flags=re.IGNORECASE)
    return sentence

def generate_analytics(query, results, index):
    analytics = {}
    if results:
        manifesto_of_first_result = results[0]['filename']
        tfidf_results = index[manifesto_of_first_result]['tfidf']
        analytics['suggestions'] = sorted(tfidf_results.items(), key=lambda kv: kv[1], reverse=True)

    # Stemmer (very similar words)
    analytics['stemmer'] = []
    porter = nltk.PorterStemmer()
    basequery = porter.stem(query)
    for manifesto, _ in index.items():
        words = index[manifesto]['tfidf'].keys()
        bases = [[porter.stem(word), word] for word in words]
        for base, word in bases:
            if base == basequery:
                analytics['stemmer'].append(word)
    analytics['stemmer'] = set(analytics['stemmer'])
    if query in analytics['stemmer']:
        analytics['stemmer'].remove(query)

    print('*analytics information returned*')
    return analytics

def request_results(query):
    query = query.strip().lower()
    print('Query:', query)
    print('\n*results request started*')
    index = load_index()
    filenames = [document for document, _ in index.items()]
    results = {}
    # results = {
    #     0 : {
    #         'name' : 'Feminist manifesto (2000)',
    #         'filename' : '2000_Feminist_manifesto',
    #         'tfidf' : 0.00041,
    #         'matches' : [
    #             'This is a first matching sentence.',
    #             'This is a second matching sentence.',
    #             'This is a third matching sentence.'
    #         ]
    #     }
    # }

    # First, sort the matching manifestos on their TF-IDF values
    order = []
    for manifesto, _ in index.items():
        for key in index[manifesto]['tfidf'].keys():
            if query == key.lower():
                # print('Query match:', query)
                match = [index[manifesto]['tfidf'][key], manifesto]
                order.append(match)
                break
    order.sort(reverse=True)

    # Loop through the sorted matches
    # and add all the data that is needed
    # (sentences, TF-IDF value, manifesto name)
    x = 0
    for tfidf_value, manifesto in order:
        results[x] = {}
        results[x]['name'] = index[manifesto]['name']  # human-readable name
        results[x]['filename'] = manifesto
        results[x]['tfidf'] = tfidf_value
        results[x]['matches'] = []
        results[x]['html'] = []

        # Generate a random RGB colour for this manifesto
        r, g, b = generate_random_rgb()

        # All sentences from this manifesto
        sentences = index[manifesto]['sentences']

        # Collect matching sentences only
        for sentence in sentences:
            for word in tokenizer.tokenize(sentence):
                if word.lower() == query:
                    # Append the sentence to the final set of matching results
                    results[x]['matches'].append(sentence)

                    # Transform the sentence into an HTML element
                    html = insert_query_highlight(query, sentence, r, g, b)
                    html = Markup(html)
                    results[x]['html'].append(html)
                    break  # append each sentence only once
        x += 1

    # Add analytics
    analytics = generate_analytics(query, results, index)

    return results, filenames, analytics

if __name__ == '__main__':
    request_results('personal')
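# A minimal sketch (not part of the original module) of how a Flask view could
# consume request_results(); the route, app object, and template name are
# hypothetical and only illustrate how the returned values might be passed on.
#
# from flask import Flask, render_template, request
#
# app = Flask(__name__)
#
# @app.route('/search')
# def search():
#     query = request.args.get('q', '')
#     results, filenames, analytics = request_results(query)
#     return render_template('results.html',
#                            query=query,
#                            results=results,
#                            filenames=filenames,
#                            analytics=analytics)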