import os, json, re
from math import log

from nltk import sent_tokenize
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')  # word tokenizer: splits on non-word characters

import pprint
pp = pprint.PrettyPrinter(indent=4)  # pretty-printer, kept for debugging


def tfidf(query, words, corpus):
    # Term Frequency: how often the query word occurs in this document,
    # normalized by the document's length
    tf_count = 0
    for word in words:
        if query == word:
            tf_count += 1
    tf = tf_count / len(words)
    # print('TF count:', tf_count)
    # print('Total number of words:', len(words))
    # print('TF - count/total', tf_count / len(words))

    # Inverse Document Frequency: in how many documents of the corpus does the
    # query word occur? (The query comes from a document in the corpus, so
    # idf_count is never zero.)
    idf_count = 0
    for document in corpus:
        if query in document:
            idf_count += 1
    # print('count:', idf_count)
    idf = log(len(corpus) / idf_count)
    # print('Total number of documents:', len(corpus))
    # print('documents/count', len(corpus) / idf_count)
    # print('IDF - log(documents/count)', log(len(corpus) / idf_count))

    tfidf_value = tf * idf
    # print('TF-IDF:', tfidf_value)

    return tf_count, idf_count, tfidf_value


def get_language(manifesto):
    # the language code is written between square brackets in the filename, e.g. [en]
    language = re.search(r'\[.*\]', manifesto, flags=re.IGNORECASE).group().replace('[', '').replace(']', '').lower()
    return language


def load_text_files():
    files = []
    corpus = {}
    sentences = {}
    wordlists = {}
    languages = {}

    txt_dir = 'txt'
    for manifesto in sorted(os.listdir(txt_dir)):
        manifesto = manifesto.replace('.txt', '')
        # print('Manifesto:', manifesto)

        language = get_language(manifesto)
        if language == 'en+de+nl+fr':  # exception for OBN manifesto
            language = 'en'
        languages[manifesto] = language
        # print('Language:', language)

        with open('{}/{}.txt'.format(txt_dir, manifesto), 'r') as f:
            text = f.read()  # full text of the .txt file
        text = text.replace(' •', '. ')  # turn custom linebreaks into full-stops so the tokenizer recognizes them as sentence endings
        words = tokenizer.tokenize(text)  # all words of one manifesto, in reading order
        wordlists[manifesto] = words
        if language not in corpus:
            corpus[language] = []
        corpus[language].append(words)

        sentences[manifesto] = sent_tokenize(text)

        files.append(manifesto)  # list of filenames

    print('\n*txt files loaded*')
    return files, corpus, sentences, wordlists, languages


def make_human_readable_name(manifesto):
    year = re.match(r'^\d\d\d\d', manifesto).group()
    name = manifesto.replace(year, '').replace('_', ' ').replace('-', ' ')
    humanreadablename = '{} ({})'.format(name, year)
    return humanreadablename


def create_index():
    files, corpus, sentences, wordlists, languages = load_text_files()
    index = {}

    # index = {
    #     'Fem manifesto': {
    #         'tfidf': {
    #             'aap': 39.2,
    #             'beer': 20.456,
    #             'citroen': 3.21
    #         },
    #         'tf': {
    #             'aap': 4,
    #             'beer': 6,
    #             'citroen': 2
    #         },
    #         'name': 'Feminist Manifesto (2000)',
    #         'language': 'en'
    #     }
    # }

    for manifesto in files:
        print('---------')
        print('Manifesto:', manifesto)
        index[manifesto] = {}
        index[manifesto]['sentences'] = sentences[manifesto]
        language = languages[manifesto]
        words = wordlists[manifesto]
        for word in words:
            tf_count, idf_count, tfidf_value = tfidf(word, words, corpus[language])
            if 'tfidf' not in index[manifesto]:
                index[manifesto]['tfidf'] = {}
            index[manifesto]['tfidf'][word] = tfidf_value
            # if 'tf' not in index[manifesto]:
            #     index[manifesto]['tf'] = {}
            # index[manifesto]['tf'][word] = tf_count

        index[manifesto]['name'] = make_human_readable_name(manifesto)
        index[manifesto]['language'] = language

    with open('index.json', 'w+') as out:
        out.write(json.dumps(index, indent=4, sort_keys=True))

    print('*index created*')

# create_index()
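

# Usage sketch (an assumption, not part of the original pipeline): once create_index()
# has been run and index.json exists, a single manifesto's entry can be inspected like
# this. The key below is hypothetical; real keys are the filenames in txt/ without the
# '.txt' extension, following a '<year>_<title>_[<language>]' pattern.
def show_top_terms(manifesto_key, n=10):
    with open('index.json') as f:
        index = json.load(f)
    entry = index[manifesto_key]
    print(entry['name'], '-', entry['language'])
    # sort the word -> tf-idf mapping by score, highest first
    top = sorted(entry['tfidf'].items(), key=lambda item: item[1], reverse=True)[:n]
    for term, score in top:
        print('{:<20} {:.4f}'.format(term, score))

# show_top_terms('2000_Example-Manifesto_[en]')  # hypothetical key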