cross-reader/readings.py

import os, json, re
from flask import Markup

import nltk
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+') # initialize tokenizer

import pprint
pp = pprint.PrettyPrinter(indent=4)

import tfidf


# TF-IDF visualisation multiplier: raw TF-IDF scores are multiplied by this
# constant in request_mappings_all() and request_mappings() before being returned
multiplier = 25000

def load_index():
	"""Load the index from ``index.json`` in the current working directory.

	When the file does not exist yet, ``tfidf.create_index()`` is called first
	(presumably that call writes ``index.json`` -- confirm in the tfidf module).

	Returns:
		The decoded JSON content of ``index.json``.
	"""
	if not os.path.isfile('index.json'):
		tfidf.create_index()
	# Context manager: the file handle is closed as soon as parsing is done,
	# instead of being left open until garbage collection.
	with open('index.json') as f:
		return json.load(f)

def get_random(x, y):
	"""Return a random integer N with x <= N <= y (both bounds included)."""
	import random
	return random.randint(x, y)

def generate_random_rgb():
	"""Return a random colour as an ``(r, g, b)`` tuple.

	Each channel is drawn independently with ``get_random(0, 255)``,
	in the order red, green, blue.
	"""
	return tuple(get_random(0, 255) for _channel in range(3))

def request_mappings_all():
	"""Build, for every manifesto in the index, its distinct words ranked by score.

	Returns:
		A dict mapping each manifesto key to a list of ``[score, word]``
		pairs, where ``score`` is the word's TF-IDF value in that manifesto
		multiplied by ``multiplier``. Each distinct token appears once and
		the list is sorted in descending order (score first, then word).

	Raises:
		KeyError: if a token of a sentence has no entry in the manifesto's
			``'tfidf'`` table.
	"""
	index = load_index()
	mappings = {}
	for manifesto, entry in index.items():
		# A word's score is fixed within one manifesto, so remembering the
		# words already handled is enough to keep the pairs unique; the set
		# avoids rescanning the growing result list for every token.
		seen = set()
		words = []
		for sentence in entry['sentences']:
			for word in tokenizer.tokenize(sentence):
				if word in seen:
					continue
				seen.add(word)
				score = entry['tfidf'][word] * multiplier
				words.append([score, word])
		words.sort(reverse=True)
		mappings[manifesto] = words
	return mappings

def request_mappings(name):
	"""Return the per-sentence word scores of one manifesto.

	Args:
		name: key of the manifesto in the index.

	Returns:
		A ``(mappings, filenames)`` tuple. ``mappings`` is ``{name: sentences}``
		where ``sentences`` holds, for each sentence of the manifesto, a list
		of ``[word, score]`` pairs in token order (``score`` is the word's
		TF-IDF value multiplied by ``multiplier``); it is an empty dict when
		``name`` is not in the index. ``filenames`` lists all manifesto keys.

	Raises:
		KeyError: if a token of a sentence has no entry in the manifesto's
			``'tfidf'`` table.
	"""
	index = load_index()
	filenames = list(index)
	mappings = {}
	# Direct lookup instead of scanning every manifesto for a matching key
	if name in index:
		entry = index[name]
		sentences = []
		for sentence in entry['sentences']:
			words = [
				[word, entry['tfidf'][word] * multiplier]
				for word in tokenizer.tokenize(sentence)
			]
			sentences.append(words)
		mappings[name] = sentences
	return mappings, filenames

def insert_query_highlight(query, sentence, r, g, b):
	"""Wrap every whole-word occurrence of ``query`` in a highlighted <strong> tag.

	Matching is case-insensitive. An occurrence counts as a whole word when it
	is not directly preceded or followed by a letter or digit (underscores,
	punctuation and whitespace all act as boundaries).

	Args:
		query: the search term, matched literally.
		sentence: the text to search in.
		r, g, b: colour channels used in the inline style of the highlight.

	Returns:
		``sentence`` with each occurrence replaced by a
		``<strong class="query" ...>`` element containing the text exactly as
		it appears in the sentence (original casing preserved). The sentence
		is returned unchanged when nothing matches or ``query`` is empty.
	"""
	if not query:
		# An empty pattern would match at every boundary position
		return sentence

	# re.escape: characters such as "+" or "(" in the query must not be read
	# as regex syntax. Lookarounds check the neighbouring characters without
	# consuming them, so punctuation stays in place and adjacent occurrences
	# are all found.
	pattern = r'(?<![^\W_])' + re.escape(query) + r'(?![^\W_])'
	template = '<strong class="query" style="color:rgba({r},{g},{b},1); background-image: radial-gradient(ellipse, rgba({r},{g},{b},0.4), rgba({r},{g},{b},0.2), transparent, transparent);">{match}</strong>'

	def highlight(found):
		# Build the replacement per match so each occurrence keeps its own
		# text; a callable also keeps backslashes from being interpreted as
		# escapes by re.sub.
		return template.format(match=found.group(), r=r, g=g, b=b)

	# NOTE(review): the sentence text itself is not HTML-escaped here and
	# request_results() wraps the result in Markup -- confirm the indexed
	# sentences are trusted.
	return re.sub(pattern, highlight, sentence, flags=re.IGNORECASE)

def generate_analytics(query, results, index):
	"""Collect extra information to show next to the search results.

	Args:
		query: the search term.
		results: the results of the search, keyed by rank; only the entry
			under key ``0`` is read, and only when ``results`` is non-empty.
		index: the loaded index, keyed by manifesto.

	Returns:
		A dict with:
		- ``'suggestions'`` (only when ``results`` is non-empty): the
		  ``(word, tfidf)`` pairs of the first result's manifesto, sorted by
		  descending TF-IDF value;
		- ``'stemmer'``: the set of indexed words, across all manifestos,
		  whose Porter stem equals the stem of ``query``, without ``query``
		  itself.
	"""
	analytics = {}

	if results:
		top_manifesto = results[0]['filename']
		scores = index[top_manifesto]['tfidf']
		analytics['suggestions'] = sorted(
			scores.items(),
			key=lambda pair: pair[1],
			reverse=True,
		)

	# Stemmer (very similar words)
	stemmer = nltk.PorterStemmer()
	query_stem = stemmer.stem(query)
	related = set()
	for entry in index.values():
		for candidate in entry['tfidf']:
			if stemmer.stem(candidate) == query_stem:
				related.add(candidate)
	# The query itself is not a suggestion
	related.discard(query)
	analytics['stemmer'] = related

	print('*analytics information returned*')
	return analytics

def request_results(query):
	"""Search the index for ``query`` and collect the matching sentences.

	The query is stripped and lower-cased. Manifestos whose TF-IDF table has
	a key equal to the query (case-insensitively) are ranked by that TF-IDF
	value, highest first.

	Returns:
		A ``(results, filenames, analytics)`` tuple. ``results`` maps the
		rank (0, 1, 2, ...) to a dict shaped like::

			{
				'name': 'Feminist manifesto (2000)',
				'filename': '2000_Feminist_manifesto',
				'tfidf': 0.00041,
				'matches': ['This is a first matching sentence.', ...],
				'html': [Markup(...), ...],
			}

		``filenames`` lists all manifesto keys of the index and ``analytics``
		is the result of ``generate_analytics()``.
	"""
	query = query.strip().lower()
	print('Query:', query)
	print('\n*results request started*')
	index = load_index()
	filenames = list(index)

	# Rank the manifestos that know the query by their TF-IDF value
	ranked = []
	for manifesto, entry in index.items():
		scores = entry['tfidf']
		for term in scores:
			if term.lower() == query:
				ranked.append([scores[term], manifesto])
				break  # one entry per manifesto
	ranked.sort(reverse=True)

	# Gather name, score and matching sentences for each ranked manifesto
	results = {}
	for rank, (score, manifesto) in enumerate(ranked):
		readable_name = index[manifesto]['name']

		# One random colour per manifesto, shared by all its highlights
		r, g, b = generate_random_rgb()

		matches = []
		html = []
		for sentence in index[manifesto]['sentences']:
			tokens = tokenizer.tokenize(sentence)
			# A sentence is listed once, however often the query occurs in it
			if any(token.lower() == query for token in tokens):
				matches.append(sentence)
				highlighted = insert_query_highlight(query, sentence, r, g, b)
				html.append(Markup(highlighted))

		results[rank] = {
			'name': readable_name,
			'filename': manifesto,
			'tfidf': score,
			'matches': matches,
			'html': html,
		}

	analytics = generate_analytics(query, results, index)

	return results, filenames, analytics

# When the module is executed directly, run one sample query
if __name__ == '__main__':
    request_results('personal')
pushing the cross-reader files to the git 2019-02-27 10:01:48 +01:00			`import os, json, re`
			`from flask import Markup`

			`import nltk`
			`from nltk.tokenize import RegexpTokenizer`
			`tokenizer = RegexpTokenizer(r'\w+') # initialize tokenizer`

			`import pprint`
			`pp = pprint.PrettyPrinter(indent=4)`

			`import tfidf`


			`# TF-IDF visualisation multiplier`
			`multiplier = 25000`

			`def load_index():`
			`if os.path.isfile('index.json') == False:`
			`tfidf.create_index()`
			`f = open('index.json').read()`
			`index = json.loads(f)`
			`return index`

			`def get_random(x, y):`
			`from random import randint`
			`return randint(x, y)`

			`def generate_random_rgb():`
			`r = get_random(0, 255)`
			`g = get_random(0, 255)`
			`b = get_random(0, 255)`
			`return r, g, b`

			`def request_mappings_all():`
			`index = load_index()`
			`filenames = [manifesto for manifesto, _ in index.items()]`
			`mappings = {}`
			`for manifesto, _ in index.items():`
			`words = []`
			`for sentence in index[manifesto]['sentences']:`
			`for word in tokenizer.tokenize(sentence):`
			`tfidf = index[manifesto]['tfidf'][word] * multiplier`
			`if [tfidf, word] not in words:`
			`words.append([tfidf, word])`
			`words.sort(reverse=True)`
			`mappings[manifesto] = words`
			`return mappings`

			`def request_mappings(name):`
			`index = load_index()`
			`filenames = [manifesto for manifesto, _ in index.items()]`
			`mappings = {}`
			`for manifesto, _ in index.items():`
			`if manifesto == name:`
			`sentences = []`
			`for sentence in index[manifesto]['sentences']:`
			`words = []`
			`for word in tokenizer.tokenize(sentence):`
			`tfidf = index[manifesto]['tfidf'][word] * multiplier`
			`words.append([word, tfidf])`
			`sentences.append(words)`
			`mappings[manifesto] = sentences`
			`return mappings, filenames`

			`def insert_query_highlight(query, sentence, r, g, b):`
			`pattern = r'[\s\W\_]'+query+r'[\s\W\_]\|^'+query+'\|'+query+'$'`
			`match = re.search(pattern, sentence, flags=re.IGNORECASE)`
			`if match:`
			`match = match.group()`
			`sentence = re.sub(pattern, ' <strong class="query" style="color:rgba({r},{g},{b},1); background-image: radial-gradient(ellipse, rgba({r},{g},{b},0.4), rgba({r},{g},{b},0.2), transparent, transparent);">{match}</strong> '.format(match=match, r=r, b=b, g=g), sentence, flags=re.IGNORECASE)`
			`return sentence`

			`def generate_analytics(query, results, index):`
			`analytics = {}`

updating the cross-reader for the CG exhibition in Eindhoven (only English, some layout changes, seperate 'about' pages, added TF/IDF/TF-IDF value inspection lists) 2019-07-10 20:39:06 +02:00			`if results:`
			`manifesto_of_first_result = results[0]['filename']`
			`tfidf_results = index[manifesto_of_first_result]['tfidf']`
			`analytics['suggestions'] = sorted(tfidf_results.items(), key=lambda kv: kv[1], reverse=True)`
pushing the cross-reader files to the git 2019-02-27 10:01:48 +01:00
			`# Stemmer (very similar words)`
			`analytics['stemmer'] = []`
			`porter = nltk.PorterStemmer()`
			`basequery = porter.stem(query)`
			`for manifesto, _ in index.items():`
			`words = index[manifesto]['tfidf'].keys()`
			`bases = [[porter.stem(word), word] for word in words]`
			`for base, word in bases:`
			`if base == basequery:`
			`analytics['stemmer'].append(word)`
			`analytics['stemmer'] = set(analytics['stemmer'])`
			`if query in analytics['stemmer']:`
			`analytics['stemmer'].remove(query)`

			`print('analytics information returned')`
			`return analytics`

			`def request_results(query):`
			`query = query.strip().lower()`
			`print('Query:', query)`
updating the cross-reader for the CG exhibition in Eindhoven (only English, some layout changes, seperate 'about' pages, added TF/IDF/TF-IDF value inspection lists) 2019-07-10 20:39:06 +02:00			`print('\nresults request started')`
pushing the cross-reader files to the git 2019-02-27 10:01:48 +01:00			`index = load_index()`
			`filenames = [document for document, _ in index.items()]`

			`results = {}`

			`# results = {`
			`# 0 : {`
			`# 'name' : 'Feminist manifesto (2000)',`
			`# 'filename' : '2000_Feminist_manifesto',`
			`# 'tfidf' : 0.00041,`
			`# 'matches' : [`
			`# 'This is a first matching sentence.',`
			`# 'This is a second matching sentence.',`
			`# 'This is a third matching sentence.'`
			`# ]`
			`# }`
			`# }`

			`# First, sort the matching manifestos on TF-IDF values`
			`order = []`
			`for manifesto, _ in index.items():`
			`for key in index[manifesto]['tfidf'].keys():`
			`if query == key.lower():`
			`# print('Query match:', query)`
			`match = [index[manifesto]['tfidf'][key], manifesto]`
			`order.append(match)`
			`break`
			`order.sort(reverse=True)`

			`# Loop through the sorted matches`
			`# and add all the data that is needed`
			`# (sentences, tfidf value, manifesto name)`
			`x = 0`
			`for tfidf, manifesto in order:`
			`results[x] = {}`
			`results[x]['name'] = index[manifesto]['name'] # nicely readable name`
			`results[x]['filename'] = manifesto`
			`results[x]['tfidf'] = tfidf`
			`results[x]['matches'] = []`
			`results[x]['html'] = []`

			`# Generate a random RGB color for this manifesto`
			`r, g, b = generate_random_rgb()`

			`# All sentences from this manifesto`
			`sentences = index[manifesto]['sentences']`

			`# Collect matching sentences only`
			`for sentence in sentences:`
			`for word in tokenizer.tokenize(sentence):`

			`if word.lower() == query:`

			`# Append sentence to final set of matching results`
			`results[x]['matches'].append(sentence)`

			`# Transform sentence into an HTML elements`
			`html = insert_query_highlight(query, sentence, r, g, b)`
			`html = Markup(html)`
			`results[x]['html'].append(html)`

			`break # Append sentence only once`
			`x += 1`

			`# Add analytics`
updating the cross-reader for the CG exhibition in Eindhoven (only English, some layout changes, seperate 'about' pages, added TF/IDF/TF-IDF value inspection lists) 2019-07-10 20:39:06 +02:00			`analytics = generate_analytics(query, results, index)`
pushing the cross-reader files to the git 2019-02-27 10:01:48 +01:00
			`return results, filenames, analytics`
updating the cross-reader for the CG exhibition in Eindhoven (only English, some layout changes, separate 'about' pages, added TF/IDF/TF-IDF value inspection lists) 2019-07-10 20:39:06 +02:00
			`if __name__ == '__main__':`
			`request_results('personal')`