cross-reader/tfidf.py


								import os, json, re

								from math import log, exp


								import nltk

								from nltk import sent_tokenize

								from nltk.tokenize import RegexpTokenizer

								# Shared word tokenizer: the pattern r'\w+' makes every token a run of word characters, so punctuation and whitespace never end up in a token.
								tokenizer = RegexpTokenizer(r'\w+') # module-level instance used when loading the text files


								import pprint

								# Pretty-printer configured with a 4-space indent.
								# NOTE(review): nothing in the visible part of this file calls `pp` -- presumably a debugging aid; confirm before removing.
								pp = pprint.PrettyPrinter(indent=4)


								def tfidf(query, words, corpus):
									"""Compute the TF-IDF weight of one term for one document.

									TF is the share of tokens in ``words`` that equal ``query``; IDF is
									the natural logarithm of (number of documents in ``corpus`` divided
									by the number of documents that contain ``query``).

									Args:
										query: The term to score.
										words: Tokens of the document being scored (must support ``len()``).
										corpus: Sized collection of documents, each a container of tokens.

									Returns:
										A ``(tf_count, idf_count, tfidf_value)`` tuple: the number of
										occurrences of ``query`` in ``words``, the number of documents in
										``corpus`` containing ``query``, and the product TF * IDF.
										``tfidf_value`` is 0.0 when ``words`` is empty or when no document
										contains ``query``, because the ratios are undefined there.
									"""
									# Term frequency: occurrences of the query relative to document length.
									tf_count = sum(1 for word in words if query == word)
									total_words = len(words)
									tf = tf_count / total_words if total_words else 0.0

									# Inverse document frequency. A dedicated loop variable is used so the
									# `words` argument is not overwritten while scanning the corpus.
									idf_count = sum(1 for document in corpus if query in document)
									idf = log(len(corpus) / idf_count) if idf_count else 0.0

									tfidf_value = tf * idf

									# The second element is the document count, not a repeat of tf_count.
									return tf_count, idf_count, tfidf_value


								def get_language(manifesto):
									"""Return the lower-cased text found between square brackets in ``manifesto``.

									The search is greedy: it spans from the first ``[`` to the last ``]``
									and any brackets inside that span are dropped as well. Raises
									``AttributeError`` when ``manifesto`` contains no bracketed part.
									"""
									found = re.search(r'\[.*\]', manifesto, flags=re.IGNORECASE)
									tag = found.group()
									# Remove every bracket character, not only the outer pair.
									for bracket in ('[', ']'):
										tag = tag.replace(bracket, '')
									return tag.lower()


								def load_text_files(directory='txt'):
									"""Read and tokenize every ``.txt`` file found in ``directory``.

									Directory entries that do not end in ``.txt`` are skipped. A manifesto
									is identified by its filename without the ``.txt`` suffix, and its
									language is taken from that name via ``get_language``.

									Args:
										directory: Folder holding the text files (defaults to ``'txt'``).

									Returns:
										A ``(files, corpus, sentences, wordlists, languages)`` tuple:
										files: manifesto names, in sorted filename order.
										corpus: language -> list of word lists, one per manifesto.
										sentences: manifesto name -> sentences from ``sent_tokenize``.
										wordlists: manifesto name -> words in reading order.
										languages: manifesto name -> language code.
									"""
									files = []
									corpus = {}
									sentences = {}
									wordlists = {}
									languages = {}

									for filename in sorted(os.listdir(directory)):
										# Strip only the trailing extension; a '.txt' elsewhere in the name is kept.
										manifesto, extension = os.path.splitext(filename)
										if extension != '.txt':
											continue

										language = get_language(manifesto)
										if language == 'en+de+nl+fr': # exception for OBN manifesto
											language = 'en'
										languages[manifesto] = language

										# Context manager so the file handle is closed after reading.
										with open(os.path.join(directory, filename), "r") as handle:
											text = handle.read() # whole file as a single string

										text = text.replace(' •', '. ') # turn custom linebreaks into full-stops to let the tokenizer recognize them as end-of-lines
										words = list(tokenizer.tokenize(text)) # all words of one manifesto, in reading order
										wordlists[manifesto] = words
										corpus.setdefault(language, []).append(words)
										sentences[manifesto] = sent_tokenize(text)
										files.append(manifesto) # list of filenames without extension

									print('\n*txt files loaded*')
									return files, corpus, sentences, wordlists, languages


								def make_human_readable_name(manifesto):
									"""Turn a manifesto identifier into a ``'<title> (<year>)'`` display string.

									``manifesto`` must start with a four-digit year. Only that leading
									year is removed from the title; the same digits appearing later in
									the name are kept. Underscores and hyphens in the remainder become
									spaces, so a separator right after the year yields a leading space.

									Raises:
										AttributeError: if ``manifesto`` does not start with four digits.
									"""
									year = re.match(r'^\d\d\d\d', manifesto).group()
									# Slice off the prefix instead of str.replace, which would also delete
									# later occurrences of the year inside the title.
									title = manifesto[len(year):].replace('_', ' ').replace('-', ' ')
									return '{} ({})'.format(title, year)


								def create_index():
									"""Build the per-manifesto index and write it to ``index.json``.

									Loads all manifestos with ``load_text_files`` and scores every
									distinct word of each manifesto with ``tfidf`` against the documents
									that share its language. Raw term counts are not stored.

									Written structure (keys are sorted in the JSON output):

										index = {
											'<manifesto>': {
												'tfidf': {'aap': 39.2, 'beer': 20.456, 'citroen': 3.21},
												'sentences': ['<sentence>', ...],
												'name': '<human readable name>',
												'language': 'en'
											}
										}
									"""
									files, corpus, sentences, wordlists, languages = load_text_files()
									index = {}

									for manifesto in files:
										print('---------')
										print('Manifesto:', manifesto)
										language = languages[manifesto]
										words = wordlists[manifesto]

										# Score each distinct word once; repeated tokens would produce the
										# same value again. The dict exists even for an empty manifesto.
										scores = {}
										for word in words:
											if word in scores:
												continue
											scores[word] = tfidf(word, words, corpus[language])[2]

										index[manifesto] = {
											'sentences': sentences[manifesto],
											'tfidf': scores,
											'name': make_human_readable_name(manifesto),
											'language': language,
										}

									# The context manager closes the file; no explicit close() is needed.
									with open('index.json', 'w') as out:
										json.dump(index, out, indent=4, sort_keys=True)

									print('*index created*')


								# create_index()