From 1fda8c792570cc26b269dc18a6bdb59b9fb211d0 Mon Sep 17 00:00:00 2001 From: manetta Date: Mon, 15 Apr 2019 12:27:27 +0200 Subject: [PATCH] short round of debugging this prototype, keeping the option in to use multiple languages ... (add [EN] or [NL] or [FR] or any other tag you want in the filename of a document to let the tool search only within that language) --- cross-reader.tfidf/readings.py | 4 ++-- cross-reader.tfidf/start.py | 4 ++-- cross-reader.tfidf/tfidf.py | 29 ++++++++++++++++++++++------- cross-reader.tfidf/words.txt | 2 +- 4 files changed, 27 insertions(+), 12 deletions(-) diff --git a/cross-reader.tfidf/readings.py b/cross-reader.tfidf/readings.py index 0df4679..212c031 100644 --- a/cross-reader.tfidf/readings.py +++ b/cross-reader.tfidf/readings.py @@ -119,11 +119,11 @@ def request_results(query): # } # } - # First, sort the matching documents on TF-IDF values + # First, check which documents use the query order = [] for document, _ in index.items(): for key in index[document]['tfidf'].keys(): - if query == key.lower(): + if query == key.strip().lower(): print('Query match:', query) match = (index[document]['tfidf'][key.lower()], document) # lowercased! (!important) order.append(match) diff --git a/cross-reader.tfidf/start.py b/cross-reader.tfidf/start.py index eb1791b..4c47a32 100644 --- a/cross-reader.tfidf/start.py +++ b/cross-reader.tfidf/start.py @@ -49,7 +49,7 @@ def index(): @APP.route('/document/', methods=['GET', 'POST']) def open_document(name): """ - Open document. + Read document. """ index = readings.load_index() suggestions = open('words.txt', 'r').readlines() @@ -94,4 +94,4 @@ if __name__ == '__main__': if not 'index.json' in os.listdir('.'): tfidf.create_index() APP.debug=True - APP.run() \ No newline at end of file + APP.run(port=5001) \ No newline at end of file diff --git a/cross-reader.tfidf/tfidf.py b/cross-reader.tfidf/tfidf.py index a125203..0dff8db 100644 --- a/cross-reader.tfidf/tfidf.py +++ b/cross-reader.tfidf/tfidf.py @@ -37,7 +37,11 @@ def tfidf(query, words, corpus): return tf_count, idf_count, tfidf_value def get_language(document): - language = re.search(r'\[.*\]', document, flags=re.IGNORECASE).group().replace('[','').replace(']','').lower() + match = re.search(r'\[.*\]', document, flags=re.IGNORECASE) + if match: + language = match.group().replace('[','').replace(']','').lower() + else: + language = 'undefined' return language def load_text_files(): @@ -59,6 +63,7 @@ def load_text_files(): sentences[document] = s files.append(document) # list of filenames + print('---------') print('*txt files loaded*') return files, corpus, sentences, wordlists @@ -72,21 +77,28 @@ def create_index(): # index = { # Fem document : { - # 'tfidf' : { - # 'aap': 39.2, - # 'beer': 20.456, - # 'citroen': 3.21 - # }, + # 'sentences' : [], # 'tf' : { # 'aap': 4, # 'beer': 6, # 'citroen': 2 # }, + # 'idf' : { + # 'aap': 2, + # 'beer': 1, + # 'citroen': 5 + # }, + # 'tfidf' : { + # 'aap': 39.2, + # 'beer': 20.456, + # 'citroen': 3.21 + # }, # 'name': 'Feminist document (2000)', - # 'language': 'en' + # 'language': 'en' # } # } + for document in files: print('---------') print('document:', document) @@ -104,12 +116,15 @@ def create_index(): if 'tfidf' not in index[document]: index[document]['tfidf'] = {} index[document]['tfidf'][word] = tfidf_value + index[document]['language'] = get_language(document) index[document]['name'] = make_human_readable_name(document) with open('index.json','w+') as out: out.write(json.dumps(index, indent=4, sort_keys=True)) out.close() + print('---------') print('*index created*') + print('---------') # create_index() \ No newline at end of file diff --git a/cross-reader.tfidf/words.txt b/cross-reader.tfidf/words.txt index 280aa26..7187db3 100644 --- a/cross-reader.tfidf/words.txt +++ b/cross-reader.tfidf/words.txt @@ -1,2 +1,2 @@ -lorum +lorem ipsum