Browse Source

Short round of debugging on this prototype, keeping in the option to use multiple languages (add [EN], [NL], [FR], or any other tag you want to the filename of a document to let the tool search only within that language).

master
manetta 6 years ago
parent
commit
1fda8c7925
  1. 4
      cross-reader.tfidf/readings.py
  2. 4
      cross-reader.tfidf/start.py
  3. 29
      cross-reader.tfidf/tfidf.py
  4. 2
      cross-reader.tfidf/words.txt

4
cross-reader.tfidf/readings.py

@ -119,11 +119,11 @@ def request_results(query):
# } # }
# } # }
# First, sort the matching documents on TF-IDF values # First, check which documents use the query
order = [] order = []
for document, _ in index.items(): for document, _ in index.items():
for key in index[document]['tfidf'].keys(): for key in index[document]['tfidf'].keys():
if query == key.lower(): if query == key.strip().lower():
print('Query match:', query) print('Query match:', query)
match = (index[document]['tfidf'][key.lower()], document) # lowercased! (!important) match = (index[document]['tfidf'][key.lower()], document) # lowercased! (!important)
order.append(match) order.append(match)

4
cross-reader.tfidf/start.py

@ -49,7 +49,7 @@ def index():
@APP.route('/document/<name>', methods=['GET', 'POST']) @APP.route('/document/<name>', methods=['GET', 'POST'])
def open_document(name): def open_document(name):
""" """
Open document. Read document.
""" """
index = readings.load_index() index = readings.load_index()
suggestions = open('words.txt', 'r').readlines() suggestions = open('words.txt', 'r').readlines()
@ -94,4 +94,4 @@ if __name__ == '__main__':
if not 'index.json' in os.listdir('.'): if not 'index.json' in os.listdir('.'):
tfidf.create_index() tfidf.create_index()
APP.debug=True APP.debug=True
APP.run() APP.run(port=5001)

29
cross-reader.tfidf/tfidf.py

@ -37,7 +37,11 @@ def tfidf(query, words, corpus):
return tf_count, idf_count, tfidf_value return tf_count, idf_count, tfidf_value
def get_language(document): def get_language(document):
language = re.search(r'\[.*\]', document, flags=re.IGNORECASE).group().replace('[','').replace(']','').lower() match = re.search(r'\[.*\]', document, flags=re.IGNORECASE)
if match:
language = match.group().replace('[','').replace(']','').lower()
else:
language = 'undefined'
return language return language
def load_text_files(): def load_text_files():
@ -59,6 +63,7 @@ def load_text_files():
sentences[document] = s sentences[document] = s
files.append(document) # list of filenames files.append(document) # list of filenames
print('---------')
print('*txt files loaded*') print('*txt files loaded*')
return files, corpus, sentences, wordlists return files, corpus, sentences, wordlists
@ -72,21 +77,28 @@ def create_index():
# index = { # index = {
# Fem document : { # Fem document : {
# 'tfidf' : { # 'sentences' : [],
# 'aap': 39.2,
# 'beer': 20.456,
# 'citroen': 3.21
# },
# 'tf' : { # 'tf' : {
# 'aap': 4, # 'aap': 4,
# 'beer': 6, # 'beer': 6,
# 'citroen': 2 # 'citroen': 2
# }, # },
# 'idf' : {
# 'aap': 2,
# 'beer': 1,
# 'citroen': 5
# },
# 'tfidf' : {
# 'aap': 39.2,
# 'beer': 20.456,
# 'citroen': 3.21
# },
# 'name': 'Feminist document (2000)', # 'name': 'Feminist document (2000)',
# 'language': 'en' # 'language': 'en'
# } # }
# } # }
for document in files: for document in files:
print('---------') print('---------')
print('document:', document) print('document:', document)
@ -104,12 +116,15 @@ def create_index():
if 'tfidf' not in index[document]: if 'tfidf' not in index[document]:
index[document]['tfidf'] = {} index[document]['tfidf'] = {}
index[document]['tfidf'][word] = tfidf_value index[document]['tfidf'][word] = tfidf_value
index[document]['language'] = get_language(document)
index[document]['name'] = make_human_readable_name(document) index[document]['name'] = make_human_readable_name(document)
with open('index.json','w+') as out: with open('index.json','w+') as out:
out.write(json.dumps(index, indent=4, sort_keys=True)) out.write(json.dumps(index, indent=4, sort_keys=True))
out.close() out.close()
print('---------')
print('*index created*') print('*index created*')
print('---------')
# create_index() # create_index()

2
cross-reader.tfidf/words.txt

@ -1,2 +1,2 @@
lorum lorem
ipsum ipsum

Loading…
Cancel
Save