cross-readers/cross-reader.tfidf/tfidf.py


								import os, json, re

								from math import log, exp


								import nltk

								from nltk import sent_tokenize

								from nltk.tokenize import RegexpTokenizer

								tokenizer = RegexpTokenizer(r'\w+') # module-level word tokenizer: each token is one match of \w+ (a run of word characters)


								import pprint

								pp = pprint.PrettyPrinter(indent=4) # pretty-printer with 4-space indent; not referenced in the code visible here (presumably a debugging aid)


								def tfidf(query, words, corpus):
									"""Compute TF-IDF statistics for `query` in one document of a corpus.

									Args:
										query: the term to score.
										words: the tokens of the document being scored.
										corpus: a sized collection of documents, each one a
											collection of tokens (membership is tested with `in`).

									Returns:
										A tuple `(tf, idf_count, tfidf_value)` where
										- `tf` is the number of occurrences of `query` in `words`
										  divided by `len(words)`,
										- `idf_count` is the number of documents in `corpus` that
										  contain `query` (the document count, NOT the
										  logarithmic IDF),
										- `tfidf_value` is `tf * log(len(corpus) / idf_count)`.

										When `words` is empty, `tf` is 0.0; when no document contains
										`query`, the IDF factor is taken as 0.0. Both cases used to
										raise ZeroDivisionError.
									"""
									# Term frequency: share of the document's tokens that equal the query.
									total_words = len(words)
									occurrences = sum(1 for word in words if query == word)
									tf = occurrences / total_words if total_words else 0.0

									# Document frequency: in how many documents does the query appear?
									# A separate loop variable is used so the `words` argument is not rebound.
									idf_count = sum(1 for document_words in corpus if query in document_words)

									# Inverse document frequency; guarded so an unseen term scores 0
									# instead of dividing by zero.
									idf = log(len(corpus) / idf_count) if idf_count else 0.0

									tfidf_value = tf * idf
									return tf, idf_count, tfidf_value


								def get_language(document):
									"""Return the language tag embedded in `document` between square brackets.

									The text from the first '[' to the last ']' is taken, all bracket
									characters are stripped from it and the result is lowercased.
									If `document` holds no bracketed part, 'undefined' is returned.
									"""
									found = re.search(r'\[.*\]', document, flags=re.IGNORECASE)
									if found is None:
										return 'undefined'

									tag = found.group()
									for bracket in ('[', ']'):
										tag = tag.replace(bracket, '')
									return tag.lower()


								def load_text_files():
									"""Read and tokenize every `.txt` file in the `txt` directory.

									Files are processed in sorted filename order. Directory entries
									that do not end in `.txt` are skipped.

									Returns:
										A tuple `(files, corpus, sentences, wordlists)`:
										- `files`: list of document names (filename without `.txt`),
										- `corpus`: list of word lists, one per document, in the same
										  order as `files`,
										- `sentences`: dict mapping document name to the list of
										  sentences produced by `sent_tokenize`,
										- `wordlists`: dict mapping document name to its lowercased
										  word list.
									"""
									files = []
									corpus = []
									sentences = {}
									wordlists = {}
									directory = 'txt'
									extension = '.txt'

									for filename in sorted(os.listdir(directory)):
										# Only index text files; anything else in the folder used to
										# trigger an attempt to open a non-existent '<name>.txt'.
										if not filename.endswith(extension):
											continue
										# Strip only the trailing extension, not every '.txt' substring.
										document = filename[:-len(extension)]

										# Context manager so the file handle is closed right away.
										with open(os.path.join(directory, filename), "r") as handle:
											text = handle.read() # full file content as one string

										text = text.replace(' •', '. ') # turn custom linebreaks into full-stops so the sentence tokenizer treats them as sentence ends
										words = [word.lower() for word in tokenizer.tokenize(text)] # all words of one document, in reading order, lowercased (important for matching)

										wordlists[document] = words
										corpus.append(words)
										sentences[document] = sent_tokenize(text)
										files.append(document)

									print('---------')
									print('*txt files loaded*')
									return files, corpus, sentences, wordlists


								def make_human_readable_name(document):
									"""Build a display name from `document` by turning every '_' and '-' into a space."""
									separators_to_space = str.maketrans('_-', '  ')
									return document.translate(separators_to_space)


								def create_index():
									"""Build the TF-IDF index for all loaded documents and write it to `index.json`.

									The documents come from `load_text_files()`. The resulting JSON has
									one entry per document name:

										{
											"<document name>": {
												"sentences": [...],          # sentences of the document
												"tf":    {"<word>": 0.012},   # occurrences / total words in the document
												"idf":   {"<word>": 2},       # number of documents containing the word
												"tfidf": {"<word>": 0.0083},  # tf * log(documents / idf count)
												"name": "<document name with '_' and '-' as spaces>",
												"language": "<tag from [..] in the name, or 'undefined'>"
											}
										}

									Every entry always carries all six keys, also for a document
									without any words.
									"""
									files, corpus, sentences, wordlists = load_text_files()
									index = {}

									for document in files:
										print('---------')
										print('document:', document)

										words = wordlists[document]
										# Per-document metadata is set once here rather than inside the
										# word loop, so it exists even when the document has no words.
										entry = {
											'sentences': sentences[document],
											'tf': {},
											'idf': {},
											'tfidf': {},
											'language': get_language(document),
											'name': make_human_readable_name(document),
										}

										# Score each distinct word once; repeated occurrences would
										# only recompute and overwrite the same values.
										for word in dict.fromkeys(words):
											tf, idf_count, tfidf_value = tfidf(word, words, corpus)
											entry['tf'][word] = tf
											entry['idf'][word] = idf_count
											entry['tfidf'][word] = tfidf_value

										index[document] = entry

									# The with-statement closes the file; no explicit close() needed.
									with open('index.json','w+') as out:
										out.write(json.dumps(index, indent=4, sort_keys=True))

									print('---------')
									print('*index created*')
									print('---------')


								# create_index()