You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
130 lines
3.4 KiB
130 lines
3.4 KiB
import os, json, re
|
|
from math import log, exp
|
|
|
|
import nltk
|
|
from nltk import sent_tokenize
|
|
from nltk.tokenize import RegexpTokenizer
|
|
# Shared module-level word tokenizer, built from the pattern \w+
# (runs of word characters).
tokenizer = RegexpTokenizer(r'\w+')
|
|
|
|
import pprint
|
|
# Pretty-printer configured with a 4-space indent, kept at module level.
# NOTE(review): no use of `pp` is visible in this part of the file.
pp = pprint.PrettyPrinter(indent=4)
|
|
|
|
def tfidf(query, words, corpus):
    """Compute TF-IDF statistics for one term within one document.

    Args:
        query: The term to score.
        words: The tokens of the document being scored, in reading order.
        corpus: A sequence of documents, each a collection of tokens.

    Returns:
        A tuple ``(tf_count, idf_count, tfidf_value)``:

        * ``tf_count`` -- number of occurrences of ``query`` in ``words``.
        * ``idf_count`` -- number of documents in ``corpus`` that contain
          ``query``.
        * ``tfidf_value`` -- ``(tf_count / len(words)) *
          log(len(corpus) / idf_count)``. It is ``0.0`` when ``words`` is
          empty or when no document in ``corpus`` contains ``query``,
          instead of raising ``ZeroDivisionError``.
    """
    # Term frequency: share of the document's tokens that equal the query.
    tf_count = sum(1 for word in words if query == word)
    tf = tf_count / len(words) if len(words) else 0.0

    # Document frequency: how many documents mention the query at all.
    # A separate loop variable is used so the ``words`` argument is not
    # shadowed.
    idf_count = sum(1 for document_words in corpus if query in document_words)

    # Without any containing document the ratio below is undefined; treat
    # the term as carrying no weight.
    if idf_count == 0:
        return tf_count, idf_count, 0.0

    idf = log(len(corpus) / idf_count)
    tfidf_value = tf * idf

    return tf_count, idf_count, tfidf_value
|
|
|
|
def get_language(document):
    """Extract the language tag written in square brackets in a name.

    Args:
        document: A document name, e.g. ``'Some_title_[EN]'``.

    Returns:
        The lowercased text of the first ``[...]`` pair that contains no
        other brackets, or ``'undefined'`` when the name has no such pair.
    """
    # The character class keeps the match inside a single bracket pair; a
    # greedy ``.*`` would run from the first '[' to the last ']' and merge
    # unrelated bracketed parts of the name into one "language".
    match = re.search(r'\[([^\[\]]*)\]', document)
    if match:
        return match.group(1).lower()
    return 'undefined'
|
|
|
|
def load_text_files(directory='txt'):
    """Read every ``.txt`` file in a directory and tokenize its content.

    Args:
        directory: Folder to read from. Defaults to ``'txt'``, the location
            that used to be hard-coded.

    Returns:
        A tuple ``(files, corpus, sentences, wordlists)``:

        * ``files`` -- document names (file names without the ``.txt``
          suffix), in sorted file-name order.
        * ``corpus`` -- one lowercased word list per document, in the same
          order as ``files``.
        * ``sentences`` -- dict mapping document name to the list returned
          by ``sent_tokenize`` for its text.
        * ``wordlists`` -- dict mapping document name to its lowercased
          word list.
    """
    files = []
    corpus = []
    sentences = {}
    wordlists = {}
    suffix = '.txt'

    for filename in sorted(os.listdir(directory)):
        # Skip anything that is not a .txt file; stripping the suffix from
        # such a name and re-adding it would point at a non-existent file.
        if not filename.endswith(suffix):
            continue

        # Remove only the trailing suffix, never a '.txt' in mid-name.
        document = filename[:-len(suffix)]

        # NOTE(review): the platform default encoding is used, as before;
        # confirm whether the text files are always UTF-8.
        with open(os.path.join(directory, filename), 'r') as handle:
            text = handle.read()

        # Turn custom linebreaks into full stops so the sentence tokenizer
        # recognises them as sentence ends.
        text = text.replace(' •', '. ')

        # All words of one document, in reading order and lowercased.
        words = [word.lower() for word in tokenizer.tokenize(text)]

        wordlists[document] = words
        corpus.append(words)
        sentences[document] = sent_tokenize(text)
        files.append(document)

    print('---------')
    print('*txt files loaded*')
    return files, corpus, sentences, wordlists
|
|
|
|
def make_human_readable_name(document):
    """Return ``document`` with every underscore and dash turned into a space."""
    separators_to_spaces = str.maketrans({'_': ' ', '-': ' '})
    return document.translate(separators_to_spaces)
|
|
|
|
def create_index():
    """Build the per-document TF-IDF index and write it to ``index.json``.

    Documents come from ``load_text_files()``. The resulting structure is:

        index = {
            'Fem document': {
                'sentences': [],
                'tf':    {'aap': 4,    'beer': 6,      'citroen': 2},
                'idf':   {'aap': 2,    'beer': 1,      'citroen': 5},
                'tfidf': {'aap': 39.2, 'beer': 20.456, 'citroen': 3.21},
                'name': 'Feminist document (2000)',
                'language': 'en'
            }
        }

    ``tf``, ``idf`` and ``tfidf`` hold the three values returned by
    ``tfidf()`` for each word. They are always present, and are empty dicts
    for a document without words.

    Returns:
        None. The index is written to ``index.json`` in the current working
        directory as indented JSON with sorted keys.
    """
    files, corpus, sentences, wordlists = load_text_files()
    index = {}

    for document in files:
        print('---------')
        print('document:', document)

        words = wordlists[document]
        entry = {
            'sentences': sentences[document],
            'tf': {},
            'idf': {},
            'tfidf': {},
            'language': get_language(document),
            'name': make_human_readable_name(document),
        }

        # Score each distinct word once: repeated occurrences would only
        # recompute and overwrite the same three values. dict.fromkeys
        # de-duplicates while keeping first-occurrence order.
        for word in dict.fromkeys(words):
            tf_count, idf_count, tfidf_value = tfidf(word, words, corpus)
            entry['tf'][word] = tf_count
            entry['idf'][word] = idf_count
            entry['tfidf'][word] = tfidf_value

        index[document] = entry

    # The with-block closes the file; no explicit close() is needed.
    with open('index.json', 'w') as out:
        out.write(json.dumps(index, indent=4, sort_keys=True))

    print('---------')
    print('*index created*')
    print('---------')
|
|
|
|
# create_index()
|