import json
import os
import pprint
import re
from random import randint

from flask import Markup  # in Flask 2.3+ this moved: from markupsafe import Markup
import nltk
from nltk.tokenize import RegexpTokenizer

import tfidf

tokenizer = RegexpTokenizer(r'\w+')  # tokenize on word characters only
pp = pprint.PrettyPrinter(indent=4)

# TF-IDF visualisation multiplier
multiplier = 25000


def load_index():
    # Build the index first if it does not exist yet
    if not os.path.isfile('index.json'):
        tfidf.create_index()
    with open('index.json') as f:
        index = json.load(f)
    return index
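# For reference: the index.json structure that the functions below assume,
# reconstructed from how the index is accessed in this module (the field
# values are illustrative only):
#
#     {
#         "2000_Feminist_manifesto": {
#             "name": "Feminist manifesto (2000)",
#             "sentences": ["One sentence of the manifesto.", "..."],
#             "tfidf": {"word": 0.00041, "...": 0.0}
#         }
#     }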
def get_random(x, y):
    return randint(x, y)


def generate_random_rgb():
    r = get_random(0, 255)
    g = get_random(0, 255)
    b = get_random(0, 255)
    return r, g, b


def request_mappings_all():
    index = load_index()
    mappings = {}
    for manifesto in index:
        words = []
        for sentence in index[manifesto]['sentences']:
            for word in tokenizer.tokenize(sentence):
                # "value" rather than "tfidf", to avoid shadowing the tfidf module
                value = index[manifesto]['tfidf'][word] * multiplier
                if [value, word] not in words:
                    words.append([value, word])
        words.sort(reverse=True)
        mappings[manifesto] = words
    # pp.pprint(mappings)
    return mappings


def request_mappings(name):
    index = load_index()
    filenames = list(index.keys())
    mappings = {}
    for manifesto in index:
        if manifesto == name:
            sentences = []
            for sentence in index[manifesto]['sentences']:
                words = []
                for word in tokenizer.tokenize(sentence):
                    value = index[manifesto]['tfidf'][word] * multiplier
                    words.append([word, value])
                sentences.append(words)
            mappings[manifesto] = sentences
    # pp.pprint(mappings)
    return mappings, filenames


def insert_query_highlight(query, sentence, r, g, b):
    # Match the query as a whole word: wrapped in non-word characters,
    # or at the very start or end of the sentence
    escaped = re.escape(query)
    pattern = r'[\s\W_]' + escaped + r'[\s\W_]|^' + escaped + '|' + escaped + '$'
    match = re.search(pattern, sentence, flags=re.IGNORECASE)
    if match:
        match = match.group()
        # The markup below is a reconstruction: the original tag was lost, but
        # the r/g/b format arguments imply an element with an inline rgb() colour
        highlight = ' <strong style="color: rgb({r}, {g}, {b});">{match}</strong> '.format(
            match=match, r=r, g=g, b=b)
        sentence = re.sub(pattern, highlight, sentence, flags=re.IGNORECASE)
    return sentence


def insert_suggestion_links(query, sentence):
    # Insert further-reading links for each suggestion listed in words.txt
    with open('words.txt', 'r') as words_file:
        suggestions = words_file.readlines()
    for suggestion in suggestions:
        suggestion = suggestion.strip()
        if suggestion and suggestion != query:
            escaped = re.escape(suggestion)
            pattern = r'[\s\W_]' + escaped + r'[\s\W_]|^' + escaped + '|' + escaped + '$'
            match = re.search(pattern, sentence, flags=re.IGNORECASE)
            if match:
                match = match.group()
                # The anchor markup is a reconstruction, and the "?q=" query
                # parameter is an assumption about the app's search route
                match = match.replace(
                    suggestion, '<a href="?q={0}">{0}</a>'.format(suggestion))
                sentence = re.sub(pattern, match, sentence, flags=re.IGNORECASE)
    return sentence


def generate_analytics(query, results, index):
    analytics = {}

    # TF-IDF word mappings of the best-matching manifesto
    mappings = request_mappings_all()
    for manifesto in mappings:
        if manifesto == results[0]['filename']:
            analytics['mappings'] = mappings[manifesto]

    # Stemmer (very similar words): collect words across all manifestos
    # that share the query's stem
    analytics['stemmer'] = []
    porter = nltk.PorterStemmer()
    basequery = porter.stem(query)
    for manifesto in index:
        words = index[manifesto]['tfidf'].keys()
        bases = [[porter.stem(word), word] for word in words]
        for base, word in bases:
            if base == basequery:
                analytics['stemmer'].append(word)
    analytics['stemmer'] = set(analytics['stemmer'])
    if query in analytics['stemmer']:
        analytics['stemmer'].remove(query)
    print('*analytics information returned*')
    # pp.pprint(analytics)
    return analytics


def request_results(query):
    print('\n*results request started*')
    query = query.strip().lower()
    print('Query:', query)
    index = load_index()
    filenames = list(index.keys())
    results = {}
    # Example of the structure that "results" will have:
    #
    # results = {
    #     0 : {
    #         'name' : 'Feminist manifesto (2000)',
    #         'filename' : '2000_Feminist_manifesto',
    #         'tfidf' : 0.00041,
    #         'matches' : [
    #             'This is a first matching sentence.',
    #             'This is a second matching sentence.',
    #             'This is a third matching sentence.'
    #         ]
    #     }
    # }

    # First, sort the matching manifestos on their TF-IDF values
    order = []
    for manifesto in index:
        for key in index[manifesto]['tfidf'].keys():
            if query == key.lower():
                match = [index[manifesto]['tfidf'][key], manifesto]
                order.append(match)
                break
    order.sort(reverse=True)
    # print('Order:', order)

    # Loop through the sorted matches and add all the data that is needed
    # (sentences, TF-IDF value, manifesto name)
    x = 0
    for score, manifesto in order:  # "score" avoids shadowing the tfidf module
        results[x] = {}
        results[x]['name'] = index[manifesto]['name']  # nicely readable name
        results[x]['filename'] = manifesto
        results[x]['tfidf'] = score
        results[x]['matches'] = []
        results[x]['html'] = []

        # Generate a random RGB colour for this manifesto
        r, g, b = generate_random_rgb()

        # All sentences from this manifesto
        sentences = index[manifesto]['sentences']

        # Collect matching sentences only
        for sentence in sentences:
            for word in tokenizer.tokenize(sentence):
                if word.lower() == query:
                    # Append the sentence to the final set of matching results
                    results[x]['matches'].append(sentence)

                    # Transform the sentence into an HTML element
                    html = insert_query_highlight(query, sentence, r, g, b)
                    html = insert_suggestion_links(query, html)
                    html = Markup(html)
                    results[x]['html'].append(html)
                    break  # append each sentence only once
        x += 1

    # pp.pprint(results)
    print('\n*results returned*')

    # Add analytics, but only if there are results
    if results:
        analytics = generate_analytics(query, results, index)
    else:
        analytics = False

    # pp.pprint(analytics)
    return results, filenames, analytics
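# A minimal usage sketch, assuming an index.json (or a tfidf module that can
# build one) and a words.txt are present in the working directory; the query
# string below is purely illustrative:
if __name__ == '__main__':
    results, filenames, analytics = request_results('manifesto')
    for position, result in results.items():
        print(position, result['name'], result['tfidf'])
        for sentence in result['matches']:
            print('   ', sentence.replace('\n', ' '))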