import os, json, re
from math import log, exp
import nltk
from nltk import sent_tokenize
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')  # initialize tokenizer
import pprint
pp = pprint.PrettyPrinter(indent=4)


def tfidf(query, words, corpus):
    # Term Frequency: how often does the query word appear in this document?
    tf_count = 0
    for word in words:
        if query == word:
            tf_count += 1
    tf = tf_count / len(words)
    # print('TF count:', tf_count)
    # print('Total number of words:', len(words))
    # print('TF - count/total', tf_count/len(words))

    # Inverse Document Frequency: in how many documents does the query word appear?
    idf_count = 0
    for document_words in corpus:
        if query in document_words:
            idf_count += 1
    # print('count:', idf_count)
    idf = log(len(corpus) / idf_count)
    # print('Total number of documents:', len(corpus))
    # print('documents/count', len(corpus)/idf_count)
    # print('IDF - log(documents/count)', log(len(corpus)/idf_count))

    tfidf_value = tf * idf
    # print('TF-IDF:', tfidf_value)
    return tf, idf_count, tfidf_value


def get_language(document):
    # the language is written between square brackets in the filename, e.g. "title[en]"
    match = re.search(r'\[.*\]', document, flags=re.IGNORECASE)
    if match:
        language = match.group().replace('[', '').replace(']', '').lower()
    else:
        language = 'undefined'
    return language


def load_text_files():
    files = []
    corpus = []
    sentences = {}
    wordlists = {}
    directory = 'txt'
    for document in sorted(os.listdir(directory)):
        document = document.replace('.txt', '')
        # print('document:', document)
        with open('{}/{}.txt'.format(directory, document), 'r') as f:
            lines = f.read()  # full text of the .txt file as one string
        lines = lines.replace(' •', '. ')  # turn custom linebreaks into full stops, so the tokenizer recognizes them as sentence endings
        words = [word.lower() for word in tokenizer.tokenize(lines)]  # all words of one document, in reading order + lowercased (important!)
        wordlists[document] = words
        corpus.append(words)
        s = sent_tokenize(lines)
        sentences[document] = s
        files.append(document)  # list of filenames
    print('---------')
    print('*txt files loaded*')
    return files, corpus, sentences, wordlists


def make_human_readable_name(document):
    name = document.replace('_', ' ').replace('-', ' ')
    return name


def create_index():
    files, corpus, sentences, wordlists = load_text_files()
    index = {}

    # index = {
    #     'Fem_document' : {
    #         'sentences' : [],
    #         'tf' : {
    #             'aap': 4,
    #             'beer': 6,
    #             'citroen': 2
    #         },
    #         'idf' : {
    #             'aap': 2,
    #             'beer': 1,
    #             'citroen': 5
    #         },
    #         'tfidf' : {
    #             'aap': 39.2,
    #             'beer': 20.456,
    #             'citroen': 3.21
    #         },
    #         'name': 'Feminist document (2000)',
    #         'language': 'en'
    #     }
    # }

    for document in files:
        print('---------')
        print('document:', document)
        index[document] = {}
        index[document]['sentences'] = sentences[document]
        words = wordlists[document]
        for word in words:
            tf, idf_count, tfidf_value = tfidf(word, words, corpus)
            if 'tf' not in index[document]:
                index[document]['tf'] = {}
            index[document]['tf'][word] = tf
            if 'idf' not in index[document]:
                index[document]['idf'] = {}
            index[document]['idf'][word] = idf_count
            if 'tfidf' not in index[document]:
                index[document]['tfidf'] = {}
            index[document]['tfidf'][word] = tfidf_value
        index[document]['language'] = get_language(document)
        index[document]['name'] = make_human_readable_name(document)

    with open('index.json', 'w') as out:
        out.write(json.dumps(index, indent=4, sort_keys=True))
    print('---------')
    print('*index created*')
    print('---------')

# create_index()
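

# --------------------------------------------------------------------------
# Usage sketches (not part of the original script; the names below are
# hypothetical and only illustrate how the pieces above fit together).
#
# 1. A worked example of tfidf() on a toy corpus, to make the formula
#    concrete: tf = count / len(words), idf = log(len(corpus) / idf_count).
#
#    toy_corpus = [['aap', 'beer', 'aap'], ['beer', 'citroen']]
#    tf, df, score = tfidf('aap', toy_corpus[0], toy_corpus)
#    # tf    = 2/3            ('aap' is 2 of the 3 words in the first document)
#    # df    = 1              ('aap' appears in 1 of the 2 documents)
#    # score = 2/3 * log(2/1) (approximately 0.462)
#
# 2. A minimal search helper, assuming index.json has already been written
#    by create_index(): rank documents by the summed TF-IDF scores of the
#    query words.

def search(query, index_path='index.json'):
    with open(index_path, 'r') as f:
        index = json.load(f)
    query_words = [word.lower() for word in tokenizer.tokenize(query)]
    scores = {}
    for document, data in index.items():
        scores[document] = sum(data['tfidf'].get(word, 0) for word in query_words)
    # highest-scoring documents first
    return sorted(scores.items(), key=lambda item: item[1], reverse=True)

# for document, score in search('feminist server')[:5]:
#     print(round(score, 3), document)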