import os
import json
import re
from math import log

from nltk import sent_tokenize  # requires the NLTK 'punkt' tokenizer data
from nltk.tokenize import RegexpTokenizer

# Word tokenizer: one token per run of word characters.
tokenizer = RegexpTokenizer(r'\w+')


def tfidf(query, words, corpus):
    """Return (tf count, df count, tf-idf) for `query`.

    tf-idf = (occurrences / document length) * log(N documents / documents containing the term)
    """
    # Term frequency: how often the query occurs in this document.
    tf_count = words.count(query)
    tf = tf_count / len(words)
    # Document frequency: in how many documents of the corpus the query occurs.
    # (Loop variable renamed to `doc` to avoid shadowing the `words` parameter.)
    idf_count = 0
    for doc in corpus:
        if query in doc:
            idf_count += 1
    # idf_count is at least 1 here, since the query always comes from a document
    # that is itself part of the corpus.
    idf = log(len(corpus) / idf_count)
    tfidf_value = tf * idf
    return tf_count, idf_count, tfidf_value


def get_language(manifesto):
    # Filenames carry a language tag in square brackets, e.g. '..._[en]'.
    language = re.search(r'\[(.*)\]', manifesto).group(1).lower()
    return language


def load_text_files():
    files = []      # list of manifesto filenames (without extension)
    corpus = {}     # language -> list of wordlists, one per manifesto
    sentences = {}  # manifesto -> list of sentences
    wordlists = {}  # manifesto -> list of words in reading order
    languages = {}  # manifesto -> language tag
    txt_dir = 'txt'
    for manifesto in sorted(os.listdir(txt_dir)):
        manifesto = manifesto.replace('.txt', '')
        language = get_language(manifesto)
        if language == 'en+de+nl+fr':  # exception for the OBN manifesto
            language = 'en'
        languages[manifesto] = language
        with open('{}/{}.txt'.format(txt_dir, manifesto)) as f:
            text = f.read()  # full text of the .txt file as one string
        # Turn custom bullet markers into full stops so the sentence
        # tokenizer recognizes them as sentence boundaries.
        text = text.replace(' •', '. ')
        words = tokenizer.tokenize(text)  # all words of one manifesto, in reading order
        wordlists[manifesto] = words
        if language not in corpus:
            corpus[language] = []
        corpus[language].append(words)
        sentences[manifesto] = sent_tokenize(text)
        files.append(manifesto)
    print('\n*txt files loaded*')
    return files, corpus, sentences, wordlists, languages


def make_human_readable_name(manifesto):
    # e.g. '2000_Feminist-manifesto' -> 'Feminist manifesto (2000)'
    year = re.match(r'^\d{4}', manifesto).group()
    name = manifesto.replace(year, '').replace('_', ' ').replace('-', ' ').strip()
    return '{} ({})'.format(name, year)


def create_index():
    files, corpus, sentences, wordlists, languages = load_text_files()
    index = {}
    # index = {
    #     'Fem_manifesto': {
    #         'tfidf': {
    #             'aap': 39.2,
    #             'beer': 20.456,
    #             'citroen': 3.21
    #         },
    #         'tf': {
    #             'aap': 4,
    #             'beer': 6,
    #             'citroen': 2
    #         },
    #         'name': 'Feminist Manifesto (2000)',
    #         'language': 'en'
    #     }
    # }
    for manifesto in files:
        print('---------')
        print('Manifesto:', manifesto)
        language = languages[manifesto]
        words = wordlists[manifesto]
        entry = {
            'sentences': sentences[manifesto],
            'tfidf': {},
            'idf': {},
            'tf': {},
            'name': make_human_readable_name(manifesto),
            'language': language,
        }
        # Each unique word needs only one computation; repeated tokens
        # would just overwrite the same values.
        for word in set(words):
            tf_count, idf_count, tfidf_value = tfidf(word, words, corpus[language])
            entry['tfidf'][word] = tfidf_value
            entry['idf'][word] = idf_count
            entry['tf'][word] = tf_count
        index[manifesto] = entry
    with open('index.json', 'w') as out:
        out.write(json.dumps(index, indent=4, sort_keys=True))
    print('*index created*')


if __name__ == '__main__':
    create_index()
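
# ------------------------------------------------------------------
# Usage sketch: a quick sanity check of the generated index. This
# helper is an addition, not part of the original pipeline, and the
# manifesto key in the example call below is hypothetical; use any
# key that actually exists in index.json. It prints the n words with
# the highest tf-idf score for one manifesto.
def print_top_terms(manifesto, n=10):
    with open('index.json') as f:
        index = json.load(f)
    scores = index[manifesto]['tfidf']
    for word, score in sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:n]:
        print('{:<20} {:.4f}'.format(word, score))

# print_top_terms('2000_Feminist-manifesto_[en]')  # hypothetical filename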