import os, json, re
from math import log, exp
import nltk
from nltk import sent_tokenize
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')  # initialize tokenizer
import pprint
pp = pprint.PrettyPrinter(indent=4)


def tfidf(query, words, corpus):
    # Term Frequency: how often does the query word appear in this document?
    tf_count = 0
    for word in words:
        if query == word:
            tf_count += 1
    tf = tf_count / len(words)
    # print('TF count:', tf_count)
    # print('Total number of words:', len(words))
    # print('TF - count/total', tf_count/len(words))

    # Inverse Document Frequency: in how many documents does the query word appear?
    idf_count = 0
    for document_words in corpus:
        if query in document_words:
            idf_count += 1
    # print('count:', idf_count)
    idf = log(len(corpus) / idf_count)
    # print('Total number of documents:', len(corpus))
    # print('documents/count', len(corpus)/idf_count)
    # print('IDF - log(documents/count)', log(len(corpus)/idf_count))

    tfidf_value = tf * idf
    # print('TF-IDF:', tfidf_value)
    return tf, idf_count, tfidf_value


def get_language(document):
    # the language is written between square brackets in the filename, e.g. "title[en]"
    match = re.search(r'\[.*\]', document, flags=re.IGNORECASE)
    if match:
        language = match.group().replace('[', '').replace(']', '').lower()
    else:
        language = 'undefined'
    return language


def load_text_files():
    files = []
    corpus = []
    sentences = {}
    wordlists = {}
    directory = 'txt'
    for document in sorted(os.listdir(directory)):
        document = document.replace('.txt', '')
        # print('document:', document)
        with open('{}/{}.txt'.format(directory, document), 'r') as f:
            lines = f.read()  # full text of the .txt file as one string
        lines = lines.replace(' •', '. ')  # turn custom linebreaks into full stops, so the tokenizer recognizes them as sentence endings
        words = [word.lower() for word in tokenizer.tokenize(lines)]  # all words of one document, in reading order + lowercased (important!)
        wordlists[document] = words
        corpus.append(words)
        s = sent_tokenize(lines)
        sentences[document] = s
        files.append(document)  # list of filenames
    print('---------')
    print('*txt files loaded*')
    return files, corpus, sentences, wordlists


def make_human_readable_name(document):
    name = document.replace('_', ' ').replace('-', ' ')
    return name


def create_index():
    files, corpus, sentences, wordlists = load_text_files()
    index = {}

    # index = {
    #     'Fem_document' : {
    #         'sentences' : [],
    #         'tf' : {
    #             'aap': 4,
    #             'beer': 6,
    #             'citroen': 2
    #         },
    #         'idf' : {
    #             'aap': 2,
    #             'beer': 1,
    #             'citroen': 5
    #         },
    #         'tfidf' : {
    #             'aap': 39.2,
    #             'beer': 20.456,
    #             'citroen': 3.21
    #         },
    #         'name': 'Feminist document (2000)',
    #         'language': 'en'
    #     }
    # }

    for document in files:
        print('---------')
        print('document:', document)
        index[document] = {}
        index[document]['sentences'] = sentences[document]
        words = wordlists[document]
        for word in words:
            tf, idf_count, tfidf_value = tfidf(word, words, corpus)
            if 'tf' not in index[document]:
                index[document]['tf'] = {}
            index[document]['tf'][word] = tf
            if 'idf' not in index[document]:
                index[document]['idf'] = {}
            index[document]['idf'][word] = idf_count
            if 'tfidf' not in index[document]:
                index[document]['tfidf'] = {}
            index[document]['tfidf'][word] = tfidf_value
        index[document]['language'] = get_language(document)
        index[document]['name'] = make_human_readable_name(document)

    with open('index.json', 'w') as out:
        out.write(json.dumps(index, indent=4, sort_keys=True))
    print('---------')
    print('*index created*')
    print('---------')

# create_index()
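

# --------------------------------------------------------------------------
# Usage sketches (not part of the original script; the names below are
# hypothetical and only illustrate how the pieces above fit together).
#
# 1. A worked example of tfidf() on a toy corpus, to make the formula
#    concrete: tf = count / len(words), idf = log(len(corpus) / idf_count).
#
#    toy_corpus = [['aap', 'beer', 'aap'], ['beer', 'citroen']]
#    tf, df, score = tfidf('aap', toy_corpus[0], toy_corpus)
#    # tf    = 2/3            ('aap' is 2 of the 3 words in the first document)
#    # df    = 1              ('aap' appears in 1 of the 2 documents)
#    # score = 2/3 * log(2/1) (approximately 0.462)
#
# 2. A minimal search helper, assuming index.json has already been written
#    by create_index(): rank documents by the summed TF-IDF scores of the
#    query words.

def search(query, index_path='index.json'):
    with open(index_path, 'r') as f:
        index = json.load(f)
    query_words = [word.lower() for word in tokenizer.tokenize(query)]
    scores = {}
    for document, data in index.items():
        scores[document] = sum(data['tfidf'].get(word, 0) for word in query_words)
    # highest-scoring documents first
    return sorted(scores.items(), key=lambda item: item[1], reverse=True)

# for document, score in search('feminist server')[:5]:
#     print(round(score, 3), document)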