import os, json, re
from flask import Markup
import nltk
from nltk import sent_tokenize
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')  # initialize tokenizer
import pprint
pp = pprint.PrettyPrinter(indent=4)
import tfidf

# TF-IDF visualisation multiplier
multiplier = 25000

def load_index():
    # (Re)build the index first if it does not exist yet
    if not os.path.isfile('index.json'):
        tfidf.create_index()
    with open('index.json') as f:
        index = json.loads(f.read())
    return index

def get_random(x, y):
    from random import randint
    return randint(x, y)

def generate_random_rgb():
    r = get_random(0, 255)
    g = get_random(0, 255)
    b = get_random(0, 255)
    return r, g, b

def insert_query_highlight(query, tfidf, sentence, r, g, b):
    pattern = r'[\s\W\_]' + query + r'[\s\W\_]|^' + query + '|' + query + '$'
    match = re.search(pattern, sentence, flags=re.IGNORECASE)
    if match:
        match = match.group().replace(' ', '')
        # The original highlight markup was stripped from this copy of the file;
        # the <span> below is a plausible reconstruction (an assumption) that
        # scales the font size with the tf-idf value and colours the match with
        # the document's random RGB colour.
        highlight = '<span class="query" style="font-size: {tfidf}%; color: rgb({r}, {g}, {b});">{match}</span>'.format(
            tfidf=tfidf, match=match, r=r, g=g, b=b)
        sentence = re.sub(pattern, ' ' + highlight + ' ', sentence, flags=re.IGNORECASE)
    return sentence

def insert_suggestion_links(query, sentence):
    # insert further reading links
    for suggestion in open('words.txt', 'r').readlines():
        suggestion = suggestion.replace('\n', '').strip()
        if suggestion:
            if suggestion != query:
                pattern = r'[\s\W\_]' + suggestion + r'[\s\W\_]|^' + suggestion + '|' + suggestion + '$'
                match = re.search(pattern, sentence, flags=re.IGNORECASE)
                if match:
                    match = match.group()
                    # The link markup was stripped from this copy of the file; an
                    # <a> tag pointing at a search for the suggestion is a
                    # plausible reconstruction (the exact href is an assumption).
                    match = match.replace(suggestion, '<a href="?q={0}">{0}</a>'.format(suggestion))
                    sentence = re.sub(pattern, ' {} '.format(match), sentence, flags=re.IGNORECASE)
    return sentence

def get_adjectives():
    index = load_index()
    adjectives = []
    # Walk through all sentences of all documents and keep the words
    # that NLTK tags as adjectives ('JJ', 'JJR', 'JJS')
    for document, _ in index.items():
        for sentence in index[document]['sentences']:
            words = tokenizer.tokenize(sentence)
            pos = nltk.pos_tag(words)
            adjectives += [word for word, tag in pos if tag.startswith('JJ')]
    return adjectives

def generate_analytics(query, results, index):
    analytics = {}
    querypos = nltk.pos_tag([query])
    analytics['type'] = querypos[0][1]

    # Contrast-mapping
    # analytics['mapping'] = []
    # if results[0]['matches']:
    #     for word in tokenizer.tokenize(results[0]['matches'][0]):
    #         document = results[0]['filename']
    #         analytics['mapping'].append([word, index[document]['tfidf'][word.lower()] * multiplier]) # lowercased! (!important)

    # Stemmer (very similar words)
    analytics['stemmer'] = []
    porter = nltk.PorterStemmer()
    basequery = porter.stem(query)
    for document, _ in index.items():
        words = index[document]['tfidf'].keys()
        bases = [[porter.stem(word), word] for word in words]
        # print('Stemmer bases', bases)
        for base, word in bases:
            if base == basequery:
                analytics['stemmer'].append(word)
    analytics['stemmer'] = set(analytics['stemmer'])
    if query in analytics['stemmer']:
        analytics['stemmer'].remove(query)
    # print('Stemmer:', matches)
    print('*analytics information returned*')
    # pp.pprint(analytics)
    return analytics

def request_results(query):
    print('*results request started*')
    query = query.strip().lower()
    print('Query:', query)
    index = load_index()
    filenames = [document for document, _ in index.items()]
    results = {}
    # Example of the structure that is returned:
    # results = {
    #     0 : {
    #         'name' : 'Feminist document (2000)',
    #         'filename' : '2000_Feminist_document',
    #         'tfidf' : 0.00041,
    #         'matches' : [
    #             'This is a first matching sentence.',
    #             'This is a second matching sentence.',
    #             'This is a third matching sentence.'
    #         ]
    #     }
    # }

    # First, check which documents use the query
    order = []
    for document, _ in index.items():
        for key in index[document]['tfidf'].keys():
            if query == key.strip().lower():
                print('Query match:', query)
                match = (index[document]['tfidf'][key.lower()], document)  # lowercased! (!important)
                order.append(match)
                break
    order.sort(reverse=True)
    print('Order:', order)

    # Loop through the sorted matches
    # and add all the data that is needed
    # (sentences, tfidf value, document name)
    x = 0
    for tfidf, document in order:
        # print('document:', document)
        results[x] = {}
        results[x]['name'] = index[document]['name']  # nicely readable name
        results[x]['filename'] = document
        results[x]['tfidf'] = tfidf
        results[x]['matches'] = []
        results[x]['html'] = []

        # Generate a random RGB colour for this document
        r, g, b = generate_random_rgb()

        # All sentences from this document
        sentences = index[document]['sentences']

        # Collect matching sentences only
        for sentence in sentences:
            for word in tokenizer.tokenize(sentence):
                if word.lower() == query:
                    # Append sentence to final set of matching results
                    results[x]['matches'].append(sentence)

                    # Transform the sentence into an HTML element
                    html = insert_query_highlight(query.strip(), 100 + (tfidf * multiplier), sentence, r, g, b)
                    html = insert_suggestion_links(query, html)
                    html = Markup(html)
                    results[x]['html'].append(html)
                    break  # Append each sentence only once
        x += 1

    pp.pprint(results)
    print('*results returned*')

    # Add analytics
    if results.keys():
        analytics = generate_analytics(query, results, index)
    else:
        analytics = False
    # pp.pprint(analytics)
    return filenames, results, analytics

def request_mappings(mapping_type):
    index = load_index()
    filenames = [document for document, _ in index.items()]
    mappings = []
    for document, _ in index.items():
        for sentence in index[document]['sentences']:
            for word in tokenizer.tokenize(sentence):
                if mapping_type == 'tfidf' or mapping_type == 'tfidf-mapping':
                    tfidf = index[document]['tfidf'][word.lower()] * multiplier  # lowercased! (!important)
                    if [tfidf, word.lower()] not in mappings:
                        mappings.append([tfidf, word.lower()])
                # if mapping_type == 'tf':
                #     tf = index[document]['tf'][word.lower()]  # lowercased! (!important)
                #     if [tf, word.lower()] not in mappings:
                #         mappings.append([tf, word.lower()])
                if mapping_type == 'idf':
                    idf = index[document]['idf'][word.lower()]  # lowercased! (!important)
                    if [idf, word.lower()] not in mappings:
                        mappings.append([idf, word.lower()])
    mappings.sort(reverse=True)
    return mappings, filenames

def request_mappings_for_document(name):
    index = load_index()
    filenames = [document for document, _ in index.items()]
    mappings = {}
    for document, _ in index.items():
        if document == name:
            sentences = []
            for sentence in index[document]['sentences']:
                words = []
                for word in tokenizer.tokenize(sentence):
                    tfidf = index[document]['tfidf'][word.lower()] * multiplier  # lowercased! (!important)
                    words.append([word, tfidf])
                sentences.append(words)
            mappings[document] = sentences
    # pp.pprint(mappings)
    return mappings, filenames
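
# ----------------------------------------------------------------------
# Minimal usage sketch (not part of the original module). It assumes that
# index.json can be built by tfidf.create_index() from the project's source
# documents and that words.txt holds one suggestion word per line; the query
# 'care' is purely illustrative.
if __name__ == '__main__':
    filenames, results, analytics = request_results('care')
    print('Documents in the index:', filenames)
    for position, result in results.items():
        print(position, result['name'], result['tfidf'], len(result['matches']), 'matching sentences')
    mappings, _ = request_mappings('tfidf')
    if mappings:
        print('Highest scoring word overall:', mappings[0])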