import os, json, re
from flask import Markup
import nltk
from nltk import sent_tokenize
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')  # initialize tokenizer
import pprint
pp = pprint.PrettyPrinter(indent=4)
import tfidf

# TF-IDF visualisation multiplier
multiplier = 25000

def load_index():
    # (Re)build the index first if it does not exist yet
    if not os.path.isfile('index.json'):
        tfidf.create_index()
    with open('index.json') as f:
        index = json.loads(f.read())
    return index

def get_random(x, y):
    from random import randint
    return randint(x, y)

def generate_random_rgb():
    r = get_random(0, 255)
    g = get_random(0, 255)
    b = get_random(0, 255)
    return r, g, b

def insert_query_highlight(query, tfidf, sentence, r, g, b):
    pattern = r'[\s\W\_]' + query + r'[\s\W\_]|^' + query + '|' + query + '$'
    match = re.search(pattern, sentence, flags=re.IGNORECASE)
    if match:
        match = match.group().replace(' ', '')
        # The original highlight markup was stripped from this copy of the file;
        # the <span> below is a plausible reconstruction (an assumption) that
        # scales the font size with the tf-idf value and colours the match with
        # the document's random RGB colour.
        highlight = '<span class="query" style="font-size: {tfidf}%; color: rgb({r}, {g}, {b});">{match}</span>'.format(
            tfidf=tfidf, match=match, r=r, g=g, b=b)
        sentence = re.sub(pattern, ' ' + highlight + ' ', sentence, flags=re.IGNORECASE)
    return sentence

def insert_suggestion_links(query, sentence):
    # insert further reading links
    for suggestion in open('words.txt', 'r').readlines():
        suggestion = suggestion.replace('\n', '').strip()
        if suggestion:
            if suggestion != query:
                pattern = r'[\s\W\_]' + suggestion + r'[\s\W\_]|^' + suggestion + '|' + suggestion + '$'
                match = re.search(pattern, sentence, flags=re.IGNORECASE)
                if match:
                    match = match.group()
                    # The link markup was stripped from this copy of the file; an
                    # <a> tag pointing at a search for the suggestion is a
                    # plausible reconstruction (the exact href is an assumption).
                    match = match.replace(suggestion, '<a href="?q={0}">{0}</a>'.format(suggestion))
                    sentence = re.sub(pattern, ' {} '.format(match), sentence, flags=re.IGNORECASE)
    return sentence

def get_adjectives():
    index = load_index()
    adjectives = []
    # Walk through all sentences of all documents and keep the words
    # that NLTK tags as adjectives ('JJ', 'JJR', 'JJS')
    for document, _ in index.items():
        for sentence in index[document]['sentences']:
            words = tokenizer.tokenize(sentence)
            pos = nltk.pos_tag(words)
            adjectives += [word for word, tag in pos if tag.startswith('JJ')]
    return adjectives

def generate_analytics(query, results, index):
    analytics = {}
    querypos = nltk.pos_tag([query])
    analytics['type'] = querypos[0][1]

    # Contrast-mapping
    # analytics['mapping'] = []
    # if results[0]['matches']:
    #     for word in tokenizer.tokenize(results[0]['matches'][0]):
    #         document = results[0]['filename']
    #         analytics['mapping'].append([word, index[document]['tfidf'][word.lower()] * multiplier]) # lowercased! (!important)

    # Stemmer (very similar words)
    analytics['stemmer'] = []
    porter = nltk.PorterStemmer()
    basequery = porter.stem(query)
    for document, _ in index.items():
        words = index[document]['tfidf'].keys()
        bases = [[porter.stem(word), word] for word in words]
        # print('Stemmer bases', bases)
        for base, word in bases:
            if base == basequery:
                analytics['stemmer'].append(word)
    analytics['stemmer'] = set(analytics['stemmer'])
    if query in analytics['stemmer']:
        analytics['stemmer'].remove(query)
    # print('Stemmer:', matches)
    print('*analytics information returned*')
    # pp.pprint(analytics)
    return analytics

def request_results(query):
    print('*results request started*')
    query = query.strip().lower()
    print('Query:', query)
    index = load_index()
    filenames = [document for document, _ in index.items()]
    results = {}
    # Example of the structure that is returned:
    # results = {
    #     0 : {
    #         'name' : 'Feminist document (2000)',
    #         'filename' : '2000_Feminist_document',
    #         'tfidf' : 0.00041,
    #         'matches' : [
    #             'This is a first matching sentence.',
    #             'This is a second matching sentence.',
    #             'This is a third matching sentence.'
    #         ]
    #     }
    # }

    # First, check which documents use the query
    order = []
    for document, _ in index.items():
        for key in index[document]['tfidf'].keys():
            if query == key.strip().lower():
                print('Query match:', query)
                match = (index[document]['tfidf'][key.lower()], document)  # lowercased! (!important)
                order.append(match)
                break
    order.sort(reverse=True)
    print('Order:', order)

    # Loop through the sorted matches
    # and add all the data that is needed
    # (sentences, tfidf value, document name)
    x = 0
    for tfidf, document in order:
        # print('document:', document)
        results[x] = {}
        results[x]['name'] = index[document]['name']  # nicely readable name
        results[x]['filename'] = document
        results[x]['tfidf'] = tfidf
        results[x]['matches'] = []
        results[x]['html'] = []

        # Generate a random RGB colour for this document
        r, g, b = generate_random_rgb()

        # All sentences from this document
        sentences = index[document]['sentences']

        # Collect matching sentences only
        for sentence in sentences:
            for word in tokenizer.tokenize(sentence):
                if word.lower() == query:
                    # Append sentence to final set of matching results
                    results[x]['matches'].append(sentence)

                    # Transform the sentence into an HTML element
                    html = insert_query_highlight(query.strip(), 100 + (tfidf * multiplier), sentence, r, g, b)
                    html = insert_suggestion_links(query, html)
                    html = Markup(html)
                    results[x]['html'].append(html)
                    break  # Append each sentence only once
        x += 1

    pp.pprint(results)
    print('*results returned*')

    # Add analytics
    if results.keys():
        analytics = generate_analytics(query, results, index)
    else:
        analytics = False
    # pp.pprint(analytics)
    return filenames, results, analytics

def request_mappings(mapping_type):
    index = load_index()
    filenames = [document for document, _ in index.items()]
    mappings = []
    for document, _ in index.items():
        for sentence in index[document]['sentences']:
            for word in tokenizer.tokenize(sentence):
                if mapping_type == 'tfidf' or mapping_type == 'tfidf-mapping':
                    tfidf = index[document]['tfidf'][word.lower()] * multiplier  # lowercased! (!important)
                    if [tfidf, word.lower()] not in mappings:
                        mappings.append([tfidf, word.lower()])
                # if mapping_type == 'tf':
                #     tf = index[document]['tf'][word.lower()]  # lowercased! (!important)
                #     if [tf, word.lower()] not in mappings:
                #         mappings.append([tf, word.lower()])
                if mapping_type == 'idf':
                    idf = index[document]['idf'][word.lower()]  # lowercased! (!important)
                    if [idf, word.lower()] not in mappings:
                        mappings.append([idf, word.lower()])
    mappings.sort(reverse=True)
    return mappings, filenames

def request_mappings_for_document(name):
    index = load_index()
    filenames = [document for document, _ in index.items()]
    mappings = {}
    for document, _ in index.items():
        if document == name:
            sentences = []
            for sentence in index[document]['sentences']:
                words = []
                for word in tokenizer.tokenize(sentence):
                    tfidf = index[document]['tfidf'][word.lower()] * multiplier  # lowercased! (!important)
                    words.append([word, tfidf])
                sentences.append(words)
            mappings[document] = sentences
    # pp.pprint(mappings)
    return mappings, filenames
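
# ----------------------------------------------------------------------
# Minimal usage sketch (not part of the original module). It assumes that
# index.json can be built by tfidf.create_index() from the project's source
# documents and that words.txt holds one suggestion word per line; the query
# 'care' is purely illustrative.
if __name__ == '__main__':
    filenames, results, analytics = request_results('care')
    print('Documents in the index:', filenames)
    for position, result in results.items():
        print(position, result['name'], result['tfidf'], len(result['matches']), 'matching sentences')
    mappings, _ = request_mappings('tfidf')
    if mappings:
        print('Highest scoring word overall:', mappings[0])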