@ -37,7 +37,11 @@ def tfidf(query, words, corpus):
return tf_count , idf_count , tfidf_value
def get_language(document):
    """Return the language tag written in square brackets inside *document*.

    The first ``[...]`` section found is used. The pattern is greedy, so
    when several bracketed parts occur on one line the match runs from the
    first ``[`` to the last ``]``; every bracket character is then removed
    and the remainder is lower-cased.

    Args:
        document: String to inspect (e.g. a document filename).

    Returns:
        The lower-cased bracket contents, or ``'undefined'`` when the
        string contains no ``[...]`` section.
    """
    # re.search returns None when nothing matches; calling .group() on the
    # result unconditionally would raise AttributeError, so search once and
    # fall back explicitly.
    match = re.search(r'\[.*\]', document, flags=re.IGNORECASE)
    if not match:
        return 'undefined'
    return match.group().replace('[', '').replace(']', '').lower()
def load_text_files ( ) :
@ -59,6 +63,7 @@ def load_text_files():
sentences [ document ] = s
files . append ( document ) # list of filenames
print ( ' --------- ' )
print ( ' *txt files loaded* ' )
return files , corpus , sentences , wordlists
@ -72,21 +77,28 @@ def create_index():
# index = {
#     'Fem document' : {
#         'sentences' : [],
#         'tf' : {
#             'aap': 4,
#             'beer': 6,
#             'citroen': 2
#         },
#         'idf' : {
#             'aap': 2,
#             'beer': 1,
#             'citroen': 5
#         },
#         'tfidf' : {
#             'aap': 39.2,
#             'beer': 20.456,
#             'citroen': 3.21
#         },
#         'name': 'Feminist document (2000)',
#         'language': 'en'
#     }
# }
for document in files :
print ( ' --------- ' )
print ( ' document: ' , document )
@ -104,12 +116,15 @@ def create_index():
if ' tfidf ' not in index [ document ] :
index [ document ] [ ' tfidf ' ] = { }
index [ document ] [ ' tfidf ' ] [ word ] = tfidf_value
index [ document ] [ ' language ' ] = get_language ( document )
index [ document ] [ ' name ' ] = make_human_readable_name ( document )
with open ( ' index.json ' , ' w+ ' ) as out :
out . write ( json . dumps ( index , indent = 4 , sort_keys = True ) )
out . close ( )
print ( ' --------- ' )
print ( ' *index created* ' )
print ( ' --------- ' )
# create_index()