# cross-reader/readings.py

import os, json, re
from random import randint
from flask import Markup  # on Flask >= 2.3, use: from markupsafe import Markup
import nltk
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')  # word tokenizer: splits sentences on non-word characters
import pprint
pp = pprint.PrettyPrinter(indent=4)
import tfidf

# TF-IDF visualisation multiplier: scales the very small TF-IDF
# values up into a range the templates can use for display
multiplier = 25000

def load_index():
    # (Re)build the index first if it does not exist yet
    if not os.path.isfile('index.json'):
        tfidf.create_index()
    with open('index.json') as f:
        index = json.load(f)
    return index
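
# Illustrative shape of index.json, inferred from the lookups below;
# the actual contents are produced by tfidf.create_index():
#
#   {
#       '2000_Feminist_manifesto': {
#           'name': 'Feminist manifesto (2000)',
#           'sentences': ['This is a first sentence.', '...'],
#           'tfidf': {'feminism': 0.00041, '...': 0.0}
#       }
#   }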

def get_random(x, y):
    # Random integer in the inclusive range [x, y]
    return randint(x, y)

def generate_random_rgb():
    # One random RGB colour, used to tint a manifesto's matches
    r = get_random(0, 255)
    g = get_random(0, 255)
    b = get_random(0, 255)
    return r, g, b

def request_mappings_all():
    # Map each manifesto to its words, sorted by scaled TF-IDF value
    index = load_index()
    mappings = {}
    for manifesto in index:
        words = []
        for sentence in index[manifesto]['sentences']:
            for word in tokenizer.tokenize(sentence):
                # Note: 'value' avoids shadowing the imported tfidf module
                value = index[manifesto]['tfidf'][word] * multiplier
                if [value, word] not in words:
                    words.append([value, word])
        words.sort(reverse=True)
        mappings[manifesto] = words
    # pp.pprint(mappings)
    return mappings
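
# Illustrative return value (hypothetical numbers), one sorted
# [value, word] list per manifesto:
#
#   {'2000_Feminist_manifesto': [[10.25, 'feminism'], [9.8, 'cyborg'], '...']}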

def request_mappings(name):
    # Per-sentence [word, value] pairs for a single manifesto
    index = load_index()
    filenames = list(index.keys())
    mappings = {}
    for manifesto in index:
        if manifesto == name:
            sentences = []
            for sentence in index[manifesto]['sentences']:
                words = []
                for word in tokenizer.tokenize(sentence):
                    value = index[manifesto]['tfidf'][word] * multiplier
                    words.append([word, value])
                sentences.append(words)
            mappings[manifesto] = sentences
    # pp.pprint(mappings)
    return mappings, filenames
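
# Illustrative return value (hypothetical numbers): one [word, value]
# list per sentence, plus the list of all manifesto filenames:
#
#   ({'2000_Feminist_manifesto': [[['this', 1.2], ['is', 0.8], '...']]},
#    ['2000_Feminist_manifesto', '...'])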

def insert_query_highlight(query, sentence, r, g, b):
    # Wrap each occurrence of the query in a coloured <strong> element.
    # The pattern matches the query surrounded by non-word characters,
    # or at the start/end of the sentence; re.escape() keeps any regex
    # metacharacters in the query from breaking the pattern.
    escaped = re.escape(query)
    pattern = r'[\s\W_]' + escaped + r'[\s\W_]|^' + escaped + '|' + escaped + '$'
    match = re.search(pattern, sentence, flags=re.IGNORECASE)
    if match:
        match = match.group()
        sentence = re.sub(pattern, ' <strong class="query" style="color:rgba({r},{g},{b},1); background-image: radial-gradient(ellipse, rgba({r},{g},{b},0.4), rgba({r},{g},{b},0.2), transparent, transparent);">{match}</strong> '.format(match=match, r=r, b=b, g=g), sentence, flags=re.IGNORECASE)
    return sentence
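
# Illustrative call (hypothetical colour values). Note that the matched
# span includes its surrounding delimiter characters, so those end up
# inside the <strong> element as well:
#
#   insert_query_highlight('feminism', 'A text on feminism.', 200, 40, 120)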

def insert_suggestion_links(query, sentence):
    # Turn occurrences of suggested words (from words.txt) into
    # further-reading links, skipping the current query itself
    with open('words.txt', 'r') as f:
        suggestions = f.readlines()
    for suggestion in suggestions:
        suggestion = suggestion.strip()
        if suggestion and suggestion != query:
            escaped = re.escape(suggestion)
            pattern = r'[\s\W_]' + escaped + r'[\s\W_]|^' + escaped + '|' + escaped + '$'
            match = re.search(pattern, sentence, flags=re.IGNORECASE)
            if match:
                match = match.group()
                # Replace the suggestion inside the match case-insensitively,
                # so capitalised occurrences are linked as well
                match = re.sub(escaped, '<a href="?q={0}">{0}</a>'.format(suggestion), match, flags=re.IGNORECASE)
                sentence = re.sub(pattern, '<strong>{}</strong>'.format(match), sentence, flags=re.IGNORECASE)
    return sentence
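
# words.txt is assumed to hold one suggestion word per line
# (the example words here are hypothetical), e.g.:
#
#   feminism
#   cyborg
#   manifesto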

def generate_analytics(query, results, index):
    analytics = {}
    # Word mappings for the top result's manifesto
    mappings = request_mappings_all()
    top = results[0]['filename']
    if top in mappings:
        analytics['mappings'] = mappings[top]
    # Stemmer: collect words that share the query's stem (very similar words)
    analytics['stemmer'] = []
    porter = nltk.PorterStemmer()
    basequery = porter.stem(query)
    for manifesto in index:
        words = index[manifesto]['tfidf'].keys()
        bases = [[porter.stem(word), word] for word in words]
        # print('Stemmer bases', bases)
        for base, word in bases:
            if base == basequery:
                analytics['stemmer'].append(word)
    analytics['stemmer'] = set(analytics['stemmer'])
    analytics['stemmer'].discard(query)
    # print('Stemmer:', analytics['stemmer'])
    print('*analytics information returned*')
    # pp.pprint(analytics)
    return analytics
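
# Illustrative analytics shape (hypothetical values):
#
#   {
#       'mappings': [[10.25, 'feminism'], '...'],
#       'stemmer': {'feminist', 'feminists'}
#   }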

def request_results(query):
    print('\n*results request started*')
    query = query.strip().lower()
    print('Query:', query)
    index = load_index()
    filenames = list(index.keys())
    results = {}
    # results = {
    #     0 : {
    #         'name' : 'Feminist manifesto (2000)',
    #         'filename' : '2000_Feminist_manifesto',
    #         'tfidf' : 0.00041,
    #         'matches' : [
    #             'This is a first matching sentence.',
    #             'This is a second matching sentence.',
    #             'This is a third matching sentence.'
    #         ]
    #     }
    # }
    # First, sort the matching manifestos on TF-IDF values
    order = []
    for manifesto in index:
        for key in index[manifesto]['tfidf'].keys():
            if query == key.lower():
                # print('Query match:', query)
                order.append([index[manifesto]['tfidf'][key], manifesto])
                break
    order.sort(reverse=True)
    # print('Order:', order)
    # Loop through the sorted matches
    # and add all the data that is needed
    # (sentences, tfidf value, manifesto name)
    for x, (value, manifesto) in enumerate(order):
        # print('\n---', manifesto, '---')
        results[x] = {}
        results[x]['name'] = index[manifesto]['name']  # nicely readable name
        results[x]['filename'] = manifesto
        results[x]['tfidf'] = value
        results[x]['matches'] = []
        results[x]['html'] = []
        # Generate a random RGB colour for this manifesto
        r, g, b = generate_random_rgb()
        # All sentences from this manifesto
        sentences = index[manifesto]['sentences']
        # Collect matching sentences only
        for sentence in sentences:
            for word in tokenizer.tokenize(sentence):
                if word.lower() == query:
                    # Append sentence to the final set of matching results
                    results[x]['matches'].append(sentence)
                    # print('Matching sentence:', sentence.replace('\n', ' '))
                    # Transform the sentence into an HTML element
                    html = insert_query_highlight(query, sentence, r, g, b)
                    html = insert_suggestion_links(query, html)
                    html = Markup(html)
                    results[x]['html'].append(html)
                    break  # Append each sentence only once
    # pp.pprint(results)
    print('\n*results returned*')
    # Add analytics (only when there is at least one result)
    if results:
        analytics = generate_analytics(query, results, index)
    else:
        analytics = False
    # pp.pprint(analytics)
    return results, filenames, analytics
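
if __name__ == '__main__':
    # Minimal smoke test, assuming index.json and words.txt sit next to
    # this file; 'feminism' is a hypothetical query, not a fixture
    results, filenames, analytics = request_results('feminism')
    pp.pprint(results)
    pp.pprint(analytics)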