import os, json
import pprint
from random import randint

from flask import Markup
from nltk import pos_tag, RegexpTokenizer

import tfidf

tokenizer = RegexpTokenizer(r'\w+')  # word tokenizer: splits on any non-alphanumeric character
pp = pprint.PrettyPrinter(indent=4)


def load_index():
    # build the index first if it does not exist yet
    if not os.path.isfile('index.json'):
        tfidf.create_index()
    with open('index.json') as f:
        index = json.load(f)
    return index


def get_random(x, y):
    return randint(x, y)


def generate_random_rgb():
    # one random color per document, used to highlight its words
    r = get_random(0, 255)
    g = get_random(0, 255)
    b = get_random(0, 255)
    return r, g, b


def get_pos():
    # ---
    # Note: NLTK provides documentation for each tag,
    # which can be queried using the tag, e.g.
    # nltk.help.upenn_tagset('RB'), or a regular expression,
    # e.g. nltk.help.upenn_tagset('NN.*'). Some corpora
    # have README files with tagset documentation,
    # see nltk.corpus.???.readme(), substituting in the name
    # of the corpus. -- http://www.nltk.org/book/ch05.html
    # ---
    # data {
    #     'word' : {
    #         'count' : 8,
    #         'sentences' : {
    #             'filename' : [
    #                 'This is a sentence.',
    #                 'This is another sentence.'
    #             ]
    #         }
    #     }
    # }
    index = load_index()
    filenames = list(index.keys())
    sentences_all = [index[filename]['sentences'] for filename in filenames]
    data = {}
    data['ADJ'] = {}
    data['PRE'] = {}
    # print(filenames)
    for i, sentences in enumerate(sentences_all):
        r, g, b = generate_random_rgb()
        for sentence in sentences:
            pos = pos_tag(tokenizer.tokenize(sentence))
            # print(pos)
            for word, tag in pos:
                if 'JJ' in tag:
                    # ---
                    # JJ: adjective or numeral, ordinal
                    # For example: third ill-mannered pre-war regrettable oiled
                    # calamitous first separable ectoplasmic battery-powered
                    # participatory fourth still-to-be-named multilingual
                    # multi-disciplinary ...
                    # ---
                    if word.lower() not in data['ADJ']:
                        data['ADJ'][word.lower()] = {}
                    if 'sentences' not in data['ADJ'][word.lower()]:
                        data['ADJ'][word.lower()]['sentences'] = {}
                    if filenames[i] not in data['ADJ'][word.lower()]['sentences'].keys():
                        data['ADJ'][word.lower()]['sentences'][filenames[i]] = []
                    # highlight the word in this document's color
                    # (the exact <span> template here is an assumption)
                    s = Markup(sentence.replace(word, '<span style="color: rgb({r}, {g}, {b})">{word}</span>'.format(r=r, g=g, b=b, word=word)))
                    if s not in data['ADJ'][word.lower()]['sentences'][filenames[i]]:
                        data['ADJ'][word.lower()]['sentences'][filenames[i]].append(s)
                if 'TO' in tag or 'IN' in tag:
                    # ---
                    # TO: "to" as preposition or infinitive marker
                    # For example: to
                    # ---
                    # IN: preposition or conjunction, subordinating
                    # For example: astride among uppon whether out inside pro despite on by throughout
                    # below within for towards near behind atop around if like until below
                    # next into if beside ...
                    # ---
                    if word.lower() not in data['PRE']:
                        data['PRE'][word.lower()] = {}
                    if 'sentences' not in data['PRE'][word.lower()]:
                        data['PRE'][word.lower()]['sentences'] = {}
                    if filenames[i] not in data['PRE'][word.lower()]['sentences'].keys():
                        data['PRE'][word.lower()]['sentences'][filenames[i]] = []
                    # same assumed <span> template as above
                    s = Markup(sentence.replace(word, '<span style="color: rgb({r}, {g}, {b})">{word}</span>'.format(r=r, g=g, b=b, word=word)))
                    if s not in data['PRE'][word.lower()]['sentences'][filenames[i]]:
                        data['PRE'][word.lower()]['sentences'][filenames[i]].append(s)

    # count the number of collected sentences for each word
    for word_type, words in data.items():
        for word in words:
            count = 0
            for filename, sentences in data[word_type][word]['sentences'].items():
                count += len(sentences)
            data[word_type][word]['count'] = count

    # order the words of each type by descending count
    data_sorted = {}
    for word_type, words in data.items():
        tmp = []
        for word in words:
            tmp.append([data[word_type][word]['count'], word])
        tmp.sort(reverse=True)
        # print('tmp', tmp)
        for i, (count, word) in enumerate(tmp):
            if word_type not in data_sorted:
                data_sorted[word_type] = {}
            data_sorted[word_type][i] = {}
            data_sorted[word_type][i][word] = data[word_type][word]
    # print(data_sorted)
    return data_sorted, index


# data, index = get_pos()
# pp.pprint(data)
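

# A minimal sketch of running this module directly, assuming index.json (or
# the tfidf module that builds it) is available in the working directory.
# It prints the five most frequent adjectives found across the indexed texts;
# data_sorted keys each word type by integer rank, so rank 0 is the most common.
if __name__ == '__main__':
    data, index = get_pos()
    adjectives = data.get('ADJ', {})
    for rank in range(min(5, len(adjectives))):
        for word, entry in adjectives[rank].items():
            print(rank, word, entry['count'])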