Browse Source

Short round of debugging on this prototype, keeping the option to use multiple languages: add [EN], [NL], [FR] or any other tag you want to the filename of a document to let the tool search only within that language.

master
manetta 5 years ago
parent
commit
1fda8c7925
  1. 4
      cross-reader.tfidf/readings.py
  2. 4
      cross-reader.tfidf/start.py
  3. 29
      cross-reader.tfidf/tfidf.py
  4. 2
      cross-reader.tfidf/words.txt

4
cross-reader.tfidf/readings.py

@ -119,11 +119,11 @@ def request_results(query):
# }
# }
# First, sort the matching documents on TF-IDF values
# First, check which documents use the query
order = []
for document, _ in index.items():
for key in index[document]['tfidf'].keys():
if query == key.lower():
if query == key.strip().lower():
print('Query match:', query)
match = (index[document]['tfidf'][key.lower()], document) # lowercased! (!important)
order.append(match)

4
cross-reader.tfidf/start.py

@ -49,7 +49,7 @@ def index():
@APP.route('/document/<name>', methods=['GET', 'POST'])
def open_document(name):
"""
Open document.
Read document.
"""
index = readings.load_index()
suggestions = open('words.txt', 'r').readlines()
@ -94,4 +94,4 @@ if __name__ == '__main__':
if not 'index.json' in os.listdir('.'):
tfidf.create_index()
APP.debug=True
APP.run()
APP.run(port=5001)

29
cross-reader.tfidf/tfidf.py

@ -37,7 +37,11 @@ def tfidf(query, words, corpus):
return tf_count, idf_count, tfidf_value
def get_language(document):
    """
    Return the language tag embedded in a document's filename.

    The tag is the text between the first pair of square brackets,
    for example 'en' for 'my-text [EN].txt'.

    document: filename of the document (string).
    Returns the lowercased tag, or 'undefined' when the filename
    contains no bracketed tag.
    """
    # Match one bracket pair only, so a filename with several bracketed
    # parts yields its first tag instead of everything between the first
    # '[' and the last ']'.
    match = re.search(r'\[([^\[\]]*)\]', document)
    if match:
        language = match.group(1).lower()
    else:
        # No tag in the filename: fall back to a placeholder language
        # rather than failing on a missing match.
        language = 'undefined'
    return language
def load_text_files():
@ -59,6 +63,7 @@ def load_text_files():
sentences[document] = s
files.append(document) # list of filenames
print('---------')
print('*txt files loaded*')
return files, corpus, sentences, wordlists
@ -72,21 +77,28 @@ def create_index():
# index = {
# Fem document : {
# 'tfidf' : {
# 'aap': 39.2,
# 'beer': 20.456,
# 'citroen': 3.21
# },
# 'sentences' : [],
# 'tf' : {
# 'aap': 4,
# 'beer': 6,
# 'citroen': 2
# },
# 'idf' : {
# 'aap': 2,
# 'beer': 1,
# 'citroen': 5
# },
# 'tfidf' : {
# 'aap': 39.2,
# 'beer': 20.456,
# 'citroen': 3.21
# },
# 'name': 'Feminist document (2000)',
# 'language': 'en'
# 'language': 'en'
# }
# }
for document in files:
print('---------')
print('document:', document)
@ -104,12 +116,15 @@ def create_index():
if 'tfidf' not in index[document]:
index[document]['tfidf'] = {}
index[document]['tfidf'][word] = tfidf_value
index[document]['language'] = get_language(document)
index[document]['name'] = make_human_readable_name(document)
with open('index.json','w+') as out:
out.write(json.dumps(index, indent=4, sort_keys=True))
out.close()
print('---------')
print('*index created*')
print('---------')
# create_index()

2
cross-reader.tfidf/words.txt

@ -1,2 +1,2 @@
lorum
lorem
ipsum

Loading…
Cancel
Save