# NOTE: repository-page scrape residue removed from this header
# (Gitea topic-selection hint and the file stats "220 lines / 7.0 KiB").
import os, json, re

from flask import Markup  # NOTE(review): Markup moved to markupsafe in Flask 2.3 — confirm Flask version

import nltk
from nltk import sent_tokenize
from nltk.tokenize import RegexpTokenizer

# Word tokenizer: \w+ keeps alphanumeric runs only (punctuation is dropped).
tokenizer = RegexpTokenizer(r'\w+') # initialize tokenizer

import pprint

# Pretty-printer for debug dumps of the results/analytics dicts.
pp = pprint.PrettyPrinter(indent=4)

# Local module that builds index.json on demand (used by load_index()).
import tfidf


# TF-IDF visualisation multiplier
# Scales the very small tf-idf scores into usable font-size percentages.
multiplier = 25000
def load_index():
    """Load the tf-idf index from index.json, building it first if absent.

    Returns:
        dict: the parsed index, keyed by document filename.
    """
    # Build the index on first run so the file is guaranteed to exist.
    if not os.path.isfile('index.json'):
        tfidf.create_index()
    # 'with' closes the handle (the original leaked an open file object).
    with open('index.json') as f:
        return json.load(f)
def get_random(x, y):
    """Return a random integer N with x <= N <= y (both ends inclusive)."""
    import random
    return random.randint(x, y)
def generate_random_rgb():
    """Return a random (r, g, b) colour tuple, each channel in 0..255."""
    from random import randint
    return tuple(randint(0, 255) for _ in range(3))
def insert_query_highlight(query, tfidf, sentence, r, g, b):
    """Wrap occurrences of *query* in *sentence* with a styled <strong> tag.

    The highlight's font size scales with the tf-idf value and the colour
    channels (r, g, b) give each document a distinct highlight colour.

    Args:
        query: the search term to highlight (plain text, not a regex).
        tfidf: font-size percentage for the highlight (e.g. 100 + score).
        sentence: the sentence text to transform.
        r, g, b: colour channels, 0-255.

    Returns:
        The sentence with matches replaced by highlight HTML, or unchanged
        when the query does not occur.
    """
    # re.escape keeps regex metacharacters in the query (e.g. "c++") from
    # corrupting the pattern; plain-word queries behave exactly as before.
    q = re.escape(query)
    # Match the query delimited by whitespace/punctuation/underscore, or at
    # the very start/end of the sentence.
    pattern = r'[\s\W\_]' + q + r'[\s\W\_]|^' + q + '|' + q + '$'
    match = re.search(pattern, sentence, flags=re.IGNORECASE)
    if match:
        # Drop the delimiter spaces captured around the word.
        matched_text = match.group().replace(' ', '')
        replacement = (
            ' <strong class="query" style="font-size:{tfidf}%;'
            'color:rgba({r},{g},{b},1); background-image: radial-gradient('
            'ellipse, rgba({r},{g},{b},0.4), rgba({r},{g},{b},0.2), '
            'transparent, transparent);">{match}</strong> '
        ).format(tfidf=tfidf, match=matched_text, r=r, b=b, g=g)
        # A callable replacement keeps the HTML literal as-is; a plain string
        # would have backslashes/group references expanded by re.sub.
        sentence = re.sub(pattern, lambda _m: replacement, sentence,
                          flags=re.IGNORECASE)
    return sentence
def insert_suggestion_links(query, sentence):
    """Turn occurrences of known words (words.txt) into further-reading links.

    Every word listed in words.txt (one per line) that appears in *sentence*
    — except the current query itself — is wrapped in
    <strong><a href="?q=word">word</a></strong>.

    Args:
        query: the current search term (left unlinked).
        sentence: sentence text, possibly already containing highlight HTML.

    Returns:
        The sentence with suggestion links inserted.
    """
    # 'with' closes the file handle (the original leaked one per call).
    with open('words.txt', 'r') as wordfile:
        suggestions = [line.strip() for line in wordfile]

    for suggestion in suggestions:
        # Skip blank lines and the query itself.
        if not suggestion or suggestion == query:
            continue
        # Escape the suggestion so regex metacharacters cannot break the pattern.
        s = re.escape(suggestion)
        # Word delimited by whitespace/punctuation, or at sentence start/end.
        pattern = r'[\s\W\_]' + s + r'[\s\W\_]|^' + s + '|' + s + '$'
        match = re.search(pattern, sentence, flags=re.IGNORECASE)
        if match:
            # Keep the matched delimiters; link only the suggestion word itself.
            linked = match.group().replace(
                suggestion, '<a href="?q={0}">{0}</a>'.format(suggestion))
            sentence = re.sub(pattern, ' <strong>{}</strong> '.format(linked),
                              sentence, flags=re.IGNORECASE)
    return sentence
def get_adjectives():
    """Collect adjectives from every sentence of every indexed document.

    The previous version raised NameError: its comprehension used `document`
    before binding it, `words` was never defined, and nothing was ever
    appended — so the function could not run. This rewrite follows the
    sentence layout used by the sibling functions (index[doc]['sentences']
    is a flat list of sentence strings — TODO confirm against tfidf.py).

    Returns:
        list: unique adjective tokens (Penn Treebank tags JJ/JJR/JJS),
        in order of first appearance.
    """
    index = load_index()
    adjectives = []
    seen = set()  # O(1) duplicate check instead of scanning the list
    for document, data in index.items():
        for sentence in data['sentences']:
            words = tokenizer.tokenize(sentence)
            for word, tag in nltk.pos_tag(words):
                # Penn Treebank adjective tags all start with 'JJ'.
                if tag.startswith('JJ') and word not in seen:
                    seen.add(word)
                    adjectives.append(word)
    return adjectives
def generate_analytics(query, results, index):
    """Build the analytics payload for a query.

    Args:
        query: the (lowercased) search term.
        results: the assembled results dict (currently unused here).
        index: the full tf-idf index keyed by document filename.

    Returns:
        dict with:
            'type'    — Penn Treebank POS tag of the query token;
            'stemmer' — set of indexed words sharing the query's Porter
                        stem, excluding the query itself.
    """
    porter = nltk.PorterStemmer()
    base = porter.stem(query)

    # Every word across all documents whose Porter stem matches the query's.
    related = {
        word
        for data in index.values()
        for word in data['tfidf'].keys()
        if porter.stem(word) == base
    }
    # The query itself is not an interesting "similar word".
    related.discard(query)

    analytics = {
        'type': nltk.pos_tag([query])[0][1],
        'stemmer': related,
    }

    print('*analytics information returned*')
    return analytics
def request_results(query):
    """Run a search for *query* and assemble ranked per-document results.

    Looks the query up in every document's tf-idf table, ranks the matching
    documents by tf-idf score (descending), and for each one collects the
    sentences containing the query plus an HTML-highlighted copy.

    Args:
        query: raw search term from the request; stripped/lowercased here.

    Returns:
        (filenames, results, analytics):
            filenames: list of all indexed document filenames.
            results:   dict keyed 0..n-1 in rank order; each entry holds
                       'name', 'filename', 'tfidf', 'matches' (sentences)
                       and 'html' (Markup-wrapped highlighted sentences).
            analytics: dict from generate_analytics(), or False when the
                       query matched nothing.
    """
    print('*results request started*')
    query = query.strip().lower()
    print('Query:', query)
    index = load_index()
    filenames = [document for document, _ in index.items()]

    results = {}

    # Shape of the structure built below:
    # results = {
    #     0 : {
    #         'name' : 'Feminist document (2000)',
    #         'filename' : '2000_Feminist_document',
    #         'tfidf' : 0.00041,
    #         'matches' : [
    #             'This is a first matching sentence.',
    #             'This is a second matching sentence.',
    #             'This is a third matching sentence.'
    #         ]
    #     }
    # }

    # First, check which documents use the query
    order = []
    for document, _ in index.items():
        for key in index[document]['tfidf'].keys():
            if query == key.strip().lower():
                print('Query match:', query)
                # Index keys are assumed lowercase — TODO confirm in tfidf.py.
                match = (index[document]['tfidf'][key.lower()], document) # lowercased! (!important)
                order.append(match)
                break
    # Tuples compare score-first, so this ranks documents by tf-idf desc.
    order.sort(reverse=True)
    print('Order:', order)

    # Loop through the sorted matches
    # and add all the data that is needed
    # (sentences, tfidf value, document name)
    x = 0
    # NOTE(review): the loop variable shadows the imported `tfidf` module
    # inside this function; harmless here, but rename it if the module is
    # ever needed below this point.
    for tfidf, document in order:
        # print('document:', document)
        results[x] = {}
        results[x]['name'] = index[document]['name'] # nicely readable name
        results[x]['filename'] = document
        results[x]['tfidf'] = tfidf
        results[x]['matches'] = []
        results[x]['html'] = []

        # Generate a random RGB color for this document
        r, g, b = generate_random_rgb()

        # All sentences from this document
        sentences = index[document]['sentences']

        # Collect matching sentences only
        for sentence in sentences:
            for word in tokenizer.tokenize(sentence):
                if word.lower() == query:

                    # Append sentence to final set of matching results
                    results[x]['matches'].append(sentence)

                    # Transform sentence into an HTML elements
                    # (highlight font size grows with the document's score)
                    html = insert_query_highlight(query.strip(), 100 + (tfidf * multiplier), sentence, r, g, b)
                    html = insert_suggestion_links(query, html)
                    # Markup marks the string as safe HTML for the template.
                    html = Markup(html)
                    results[x]['html'].append(html)

                    break # Append sentence only once
        x += 1

    pp.pprint(results)
    print('*results returned*')

    # Add analytics
    if results.keys():
        analytics = generate_analytics(query, results, index)
    else:
        analytics = False
    # pp.pprint(analytics)

    return filenames, results, analytics
def request_mappings(mapping_type):
    """Build a deduplicated, sorted word-score mapping across all documents.

    Args:
        mapping_type: 'tfidf' / 'tfidf-mapping' for tf-idf scores (scaled by
            the visualisation multiplier), or 'idf' for raw idf scores.
            Any other value yields an empty mapping.

    Returns:
        (mappings, filenames): mappings is a list of [score, word] pairs
        sorted descending; filenames lists every indexed document.
    """
    index = load_index()
    filenames = [document for document in index]

    mappings = []
    # (score, word) pairs already emitted — O(1) dedup instead of the
    # original O(n^2) `not in mappings` list scans. (Also removed an unused
    # `sentences = []` local and the shadowing of the `tfidf` module.)
    seen = set()

    use_tfidf = mapping_type in ('tfidf', 'tfidf-mapping')
    for document, data in index.items():
        for sentence in data['sentences']:
            for word in tokenizer.tokenize(sentence):
                token = word.lower()  # index keys are lowercased (!important)
                if use_tfidf:
                    score = data['tfidf'][token] * multiplier
                elif mapping_type == 'idf':
                    score = data['idf'][token]
                else:
                    continue
                if (score, token) not in seen:
                    seen.add((score, token))
                    mappings.append([score, token])

    mappings.sort(reverse=True)
    return mappings, filenames
def request_mappings_for_document(name):
    """Map every word of one document's sentences to its scaled tf-idf score.

    Args:
        name: filename of the document to map.

    Returns:
        (mappings, filenames): mappings is {name: sentences} where each
        sentence is a list of [word, scaled_tfidf] pairs; filenames lists
        every indexed document. mappings is empty when *name* is unknown.
    """
    index = load_index()
    filenames = list(index.keys())
    mappings = {}

    for document, data in index.items():
        if document != name:
            continue
        scores = data['tfidf']
        # One [word, score] pair per token, one inner list per sentence.
        sentence_maps = []
        for sentence in data['sentences']:
            pairs = [[word, scores[word.lower()] * multiplier]  # lowercased! (!important)
                     for word in tokenizer.tokenize(sentence)]
            sentence_maps.append(pairs)
        mappings[document] = sentence_maps

    return mappings, filenames