# NOTE: repository-page scrape residue removed from this header
# (Gitea topic-selection hint and the file stats "220 lines / 7.0 KiB").
import os, json, re

from flask import Markup  # NOTE(review): Markup moved to markupsafe in Flask 2.3 — confirm Flask version

import nltk
from nltk import sent_tokenize
from nltk.tokenize import RegexpTokenizer

# Word tokenizer: \w+ keeps alphanumeric runs only (punctuation is dropped).
tokenizer = RegexpTokenizer(r'\w+') # initialize tokenizer

import pprint

# Pretty-printer for debug dumps of the results/analytics dicts.
pp = pprint.PrettyPrinter(indent=4)

# Local module that builds index.json on demand (used by load_index()).
import tfidf


# TF-IDF visualisation multiplier
# Scales the very small tf-idf scores into usable font-size percentages.
multiplier = 25000
def load_index():
    """Load the tf-idf index from index.json, building it first if absent.

    Returns:
        dict: the parsed index, keyed by document filename.
    """
    # Build the index on first run so the file is guaranteed to exist.
    if not os.path.isfile('index.json'):
        tfidf.create_index()
    # 'with' closes the handle (the original leaked an open file object).
    with open('index.json') as f:
        return json.load(f)
def get_random(x, y):
    """Return a random integer N with x <= N <= y (both ends inclusive)."""
    import random
    return random.randint(x, y)
def generate_random_rgb():
    """Return a random (r, g, b) colour tuple, each channel in 0..255."""
    from random import randint
    return tuple(randint(0, 255) for _ in range(3))
def insert_query_highlight(query, tfidf, sentence, r, g, b):
    """Wrap occurrences of *query* in *sentence* with a styled <strong> tag.

    The highlight's font size scales with the tf-idf value and the colour
    channels (r, g, b) give each document a distinct highlight colour.

    Args:
        query: the search term to highlight (plain text, not a regex).
        tfidf: font-size percentage for the highlight (e.g. 100 + score).
        sentence: the sentence text to transform.
        r, g, b: colour channels, 0-255.

    Returns:
        The sentence with matches replaced by highlight HTML, or unchanged
        when the query does not occur.
    """
    # re.escape keeps regex metacharacters in the query (e.g. "c++") from
    # corrupting the pattern; plain-word queries behave exactly as before.
    q = re.escape(query)
    # Match the query delimited by whitespace/punctuation/underscore, or at
    # the very start/end of the sentence.
    pattern = r'[\s\W\_]' + q + r'[\s\W\_]|^' + q + '|' + q + '$'
    match = re.search(pattern, sentence, flags=re.IGNORECASE)
    if match:
        # Drop the delimiter spaces captured around the word.
        matched_text = match.group().replace(' ', '')
        replacement = (
            ' <strong class="query" style="font-size:{tfidf}%;'
            'color:rgba({r},{g},{b},1); background-image: radial-gradient('
            'ellipse, rgba({r},{g},{b},0.4), rgba({r},{g},{b},0.2), '
            'transparent, transparent);">{match}</strong> '
        ).format(tfidf=tfidf, match=matched_text, r=r, b=b, g=g)
        # A callable replacement keeps the HTML literal as-is; a plain string
        # would have backslashes/group references expanded by re.sub.
        sentence = re.sub(pattern, lambda _m: replacement, sentence,
                          flags=re.IGNORECASE)
    return sentence
def insert_suggestion_links(query, sentence):
    """Turn occurrences of known words (words.txt) into further-reading links.

    Every word listed in words.txt (one per line) that appears in *sentence*
    — except the current query itself — is wrapped in
    <strong><a href="?q=word">word</a></strong>.

    Args:
        query: the current search term (left unlinked).
        sentence: sentence text, possibly already containing highlight HTML.

    Returns:
        The sentence with suggestion links inserted.
    """
    # 'with' closes the file handle (the original leaked one per call).
    with open('words.txt', 'r') as wordfile:
        suggestions = [line.strip() for line in wordfile]

    for suggestion in suggestions:
        # Skip blank lines and the query itself.
        if not suggestion or suggestion == query:
            continue
        # Escape the suggestion so regex metacharacters cannot break the pattern.
        s = re.escape(suggestion)
        # Word delimited by whitespace/punctuation, or at sentence start/end.
        pattern = r'[\s\W\_]' + s + r'[\s\W\_]|^' + s + '|' + s + '$'
        match = re.search(pattern, sentence, flags=re.IGNORECASE)
        if match:
            # Keep the matched delimiters; link only the suggestion word itself.
            linked = match.group().replace(
                suggestion, '<a href="?q={0}">{0}</a>'.format(suggestion))
            sentence = re.sub(pattern, ' <strong>{}</strong> '.format(linked),
                              sentence, flags=re.IGNORECASE)
    return sentence
def get_adjectives():
    """Collect adjectives from every sentence of every indexed document.

    The previous version raised NameError: its comprehension used `document`
    before binding it, `words` was never defined, and nothing was ever
    appended — so the function could not run. This rewrite follows the
    sentence layout used by the sibling functions (index[doc]['sentences']
    is a flat list of sentence strings — TODO confirm against tfidf.py).

    Returns:
        list: unique adjective tokens (Penn Treebank tags JJ/JJR/JJS),
        in order of first appearance.
    """
    index = load_index()
    adjectives = []
    seen = set()  # O(1) duplicate check instead of scanning the list
    for document, data in index.items():
        for sentence in data['sentences']:
            words = tokenizer.tokenize(sentence)
            for word, tag in nltk.pos_tag(words):
                # Penn Treebank adjective tags all start with 'JJ'.
                if tag.startswith('JJ') and word not in seen:
                    seen.add(word)
                    adjectives.append(word)
    return adjectives
def generate_analytics(query, results, index):
    """Build the analytics payload for a query.

    Args:
        query: the (lowercased) search term.
        results: the assembled results dict (currently unused here).
        index: the full tf-idf index keyed by document filename.

    Returns:
        dict with:
            'type'    — Penn Treebank POS tag of the query token;
            'stemmer' — set of indexed words sharing the query's Porter
                        stem, excluding the query itself.
    """
    porter = nltk.PorterStemmer()
    base = porter.stem(query)

    # Every word across all documents whose Porter stem matches the query's.
    related = {
        word
        for data in index.values()
        for word in data['tfidf'].keys()
        if porter.stem(word) == base
    }
    # The query itself is not an interesting "similar word".
    related.discard(query)

    analytics = {
        'type': nltk.pos_tag([query])[0][1],
        'stemmer': related,
    }

    print('*analytics information returned*')
    return analytics
def request_results(query):
    """Run a search for *query* and assemble ranked per-document results.

    Looks the query up in every document's tf-idf table, ranks the matching
    documents by tf-idf score (descending), and for each one collects the
    sentences containing the query plus an HTML-highlighted copy.

    Args:
        query: raw search term from the request; stripped/lowercased here.

    Returns:
        (filenames, results, analytics):
            filenames: list of all indexed document filenames.
            results:   dict keyed 0..n-1 in rank order; each entry holds
                       'name', 'filename', 'tfidf', 'matches' (sentences)
                       and 'html' (Markup-wrapped highlighted sentences).
            analytics: dict from generate_analytics(), or False when the
                       query matched nothing.
    """
    print('*results request started*')
    query = query.strip().lower()
    print('Query:', query)
    index = load_index()
    filenames = [document for document, _ in index.items()]

    results = {}

    # Shape of the structure built below:
    # results = {
    #     0 : {
    #         'name' : 'Feminist document (2000)',
    #         'filename' : '2000_Feminist_document',
    #         'tfidf' : 0.00041,
    #         'matches' : [
    #             'This is a first matching sentence.',
    #             'This is a second matching sentence.',
    #             'This is a third matching sentence.'
    #         ]
    #     }
    # }

    # First, check which documents use the query
    order = []
    for document, _ in index.items():
        for key in index[document]['tfidf'].keys():
            if query == key.strip().lower():
                print('Query match:', query)
                # Index keys are assumed lowercase — TODO confirm in tfidf.py.
                match = (index[document]['tfidf'][key.lower()], document) # lowercased! (!important)
                order.append(match)
                break
    # Tuples compare score-first, so this ranks documents by tf-idf desc.
    order.sort(reverse=True)
    print('Order:', order)

    # Loop through the sorted matches
    # and add all the data that is needed
    # (sentences, tfidf value, document name)
    x = 0
    # NOTE(review): the loop variable shadows the imported `tfidf` module
    # inside this function; harmless here, but rename it if the module is
    # ever needed below this point.
    for tfidf, document in order:
        # print('document:', document)
        results[x] = {}
        results[x]['name'] = index[document]['name'] # nicely readable name
        results[x]['filename'] = document
        results[x]['tfidf'] = tfidf
        results[x]['matches'] = []
        results[x]['html'] = []

        # Generate a random RGB color for this document
        r, g, b = generate_random_rgb()

        # All sentences from this document
        sentences = index[document]['sentences']

        # Collect matching sentences only
        for sentence in sentences:
            for word in tokenizer.tokenize(sentence):
                if word.lower() == query:

                    # Append sentence to final set of matching results
                    results[x]['matches'].append(sentence)

                    # Transform sentence into an HTML elements
                    # (highlight font size grows with the document's score)
                    html = insert_query_highlight(query.strip(), 100 + (tfidf * multiplier), sentence, r, g, b)
                    html = insert_suggestion_links(query, html)
                    # Markup marks the string as safe HTML for the template.
                    html = Markup(html)
                    results[x]['html'].append(html)

                    break # Append sentence only once
        x += 1

    pp.pprint(results)
    print('*results returned*')

    # Add analytics
    if results.keys():
        analytics = generate_analytics(query, results, index)
    else:
        analytics = False
    # pp.pprint(analytics)

    return filenames, results, analytics
def request_mappings(mapping_type):
    """Build a deduplicated, sorted word-score mapping across all documents.

    Args:
        mapping_type: 'tfidf' / 'tfidf-mapping' for tf-idf scores (scaled by
            the visualisation multiplier), or 'idf' for raw idf scores.
            Any other value yields an empty mapping.

    Returns:
        (mappings, filenames): mappings is a list of [score, word] pairs
        sorted descending; filenames lists every indexed document.
    """
    index = load_index()
    filenames = [document for document in index]

    mappings = []
    # (score, word) pairs already emitted — O(1) dedup instead of the
    # original O(n^2) `not in mappings` list scans. (Also removed an unused
    # `sentences = []` local and the shadowing of the `tfidf` module.)
    seen = set()

    use_tfidf = mapping_type in ('tfidf', 'tfidf-mapping')
    for document, data in index.items():
        for sentence in data['sentences']:
            for word in tokenizer.tokenize(sentence):
                token = word.lower()  # index keys are lowercased (!important)
                if use_tfidf:
                    score = data['tfidf'][token] * multiplier
                elif mapping_type == 'idf':
                    score = data['idf'][token]
                else:
                    continue
                if (score, token) not in seen:
                    seen.add((score, token))
                    mappings.append([score, token])

    mappings.sort(reverse=True)
    return mappings, filenames
def request_mappings_for_document(name):
    """Map every word of one document's sentences to its scaled tf-idf score.

    Args:
        name: filename of the document to map.

    Returns:
        (mappings, filenames): mappings is {name: sentences} where each
        sentence is a list of [word, scaled_tfidf] pairs; filenames lists
        every indexed document. mappings is empty when *name* is unknown.
    """
    index = load_index()
    filenames = list(index.keys())
    mappings = {}

    for document, data in index.items():
        if document != name:
            continue
        scores = data['tfidf']
        # One [word, score] pair per token, one inner list per sentence.
        sentence_maps = []
        for sentence in data['sentences']:
            pairs = [[word, scores[word.lower()] * multiplier]  # lowercased! (!important)
                     for word in tokenizer.tokenize(sentence)]
            sentence_maps.append(pairs)
        mappings[document] = sentence_maps

    return mappings, filenames