# various cross-reading prototypes

import os, json
from random import randint
from flask import Markup
from nltk import pos_tag, RegexpTokenizer
import tfidf  # local module that provides create_index() to build index.json

tokenizer = RegexpTokenizer(r'\w+')  # initialize word tokenizer

import pprint
pp = pprint.PrettyPrinter(indent=4)

def load_index():
    # build the index first if it does not exist yet
    if not os.path.isfile('index.json'):
        tfidf.create_index()
    with open('index.json') as f:
        index = json.load(f)
    return index

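# A minimal sketch of the index.json shape that get_pos() below assumes,
# inferred from how the index is read in this file; the actual structure
# is defined by tfidf.create_index():
#
#   {
#       "filename.txt": {
#           "sentences": [
#               "This is a sentence.",
#               "This is another sentence."
#           ]
#       }
#   }
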
def get_random(x, y):
    return randint(x, y)

def generate_random_rgb():
    # one random colour per source document, used to tint its highlighted words
    r = get_random(0, 255)
    g = get_random(0, 255)
    b = get_random(0, 255)
    return r, g, b

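# The ADJ and PRE branches in get_pos() below build the same highlight markup
# inline. A small sketch of that technique factored into a helper, assuming
# the same markup string; `highlight` is a hypothetical name and is not
# called by the original code:
def highlight(sentence, word, r, g, b):
    # wrap the matched word in a <strong> tinted with the document's colour,
    # with a radial-gradient halo fading to transparent
    tag = ('<strong class="query" style="color:rgba({r},{g},{b},1); '
           'background-image: radial-gradient(ellipse, rgba({r},{g},{b},0.4), '
           'rgba({r},{g},{b},0.2), transparent, transparent);">{word}</strong>'
           ).format(r=r, g=g, b=b, word=word)
    return Markup(sentence.replace(word, tag))
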
def get_pos():
    # ---
    # Note: NLTK provides documentation for each tag,
    # which can be queried using the tag, e.g.
    # nltk.help.upenn_tagset('RB'), or a regular expression,
    # e.g. nltk.help.upenn_tagset('NN.*'). Some corpora
    # have README files with tagset documentation,
    # see nltk.corpus.???.readme(), substituting in the name
    # of the corpus. -- http://www.nltk.org/book/ch05.html
    # ---
    # Returned structure:
    # data = {
    #     'word': {
    #         'count': 8,
    #         'sentences': {
    #             'filename': [
    #                 'This is a sentence.',
    #                 'This is another sentence.'
    #             ]
    #         }
    #     }
    # }
    index = load_index()
    filenames = list(index.keys())
    sentences_all = [index[filename]['sentences'] for filename in filenames]
    data = {}
    data['ADJ'] = {}  # adjectives (JJ tags)
    data['PRE'] = {}  # prepositions (TO/IN tags)
    # print(filenames)
    for i, sentences in enumerate(sentences_all):
        r, g, b = generate_random_rgb()
        for sentence in sentences:
            pos = pos_tag(tokenizer.tokenize(sentence))
            # print(pos)
            for word, tag in pos:
                if 'JJ' in tag:
                    # ---
                    # JJ: adjective or numeral, ordinal
                    # For example: third ill-mannered pre-war regrettable oiled calamitous first separable
                    # ectoplasmic battery-powered participatory fourth still-to-be-named
                    # multilingual multi-disciplinary ...
                    # ---
                    if word.lower() not in data['ADJ']:
                        data['ADJ'][word.lower()] = {}
                    if 'sentences' not in data['ADJ'][word.lower()]:
                        data['ADJ'][word.lower()]['sentences'] = {}
                    if filenames[i] not in data['ADJ'][word.lower()]['sentences'].keys():
                        data['ADJ'][word.lower()]['sentences'][filenames[i]] = []
                    s = Markup(sentence.replace(word, '<strong class="query" style="color:rgba({r},{g},{b},1); background-image: radial-gradient(ellipse, rgba({r},{g},{b},0.4), rgba({r},{g},{b},0.2), transparent, transparent);">{word}</strong>'.format(r=r, b=b, g=g, word=word)))
                    if s not in data['ADJ'][word.lower()]['sentences'][filenames[i]]:
                        data['ADJ'][word.lower()]['sentences'][filenames[i]].append(s)
                if 'TO' in tag or 'IN' in tag:
                    # ---
                    # TO: "to" as preposition or infinitive marker
                    # For example: to
                    # ---
                    # IN: preposition or conjunction, subordinating
                    # For example: astride among uppon whether out inside pro despite on by throughout
                    # below within for towards near behind atop around if like until below
                    # next into if beside ...
                    # ---
                    if word.lower() not in data['PRE']:
                        data['PRE'][word.lower()] = {}
                    if 'sentences' not in data['PRE'][word.lower()]:
                        data['PRE'][word.lower()]['sentences'] = {}
                    if filenames[i] not in data['PRE'][word.lower()]['sentences'].keys():
                        data['PRE'][word.lower()]['sentences'][filenames[i]] = []
                    s = Markup(sentence.replace(word, '<strong class="query" style="color:rgba({r},{g},{b},1); background-image: radial-gradient(ellipse, rgba({r},{g},{b},0.4), rgba({r},{g},{b},0.2), transparent, transparent);">{word}</strong>'.format(r=r, b=b, g=g, word=word)))
                    if s not in data['PRE'][word.lower()]['sentences'][filenames[i]]:
                        data['PRE'][word.lower()]['sentences'][filenames[i]].append(s)
    # count the number of results for each word
    for word_type, words in data.items():
        for word in words:
            count = 0
            for filename, sentences in data[word_type][word]['sentences'].items():
                # print(filename)
                count += len(sentences)
            data[word_type][word]['count'] = count
    # sort words of each type by descending result count
    data_sorted = {}
    for word_type, words in data.items():
        tmp = []
        for word in words:
            count = data[word_type][word]['count']
            tmp.append([count, word])
        tmp.sort(reverse=True)
        # print('tmp', tmp)
        i = 0
        for count, word in tmp:
            if word_type not in data_sorted:
                data_sorted[word_type] = {}
            data_sorted[word_type][i] = {}
            data_sorted[word_type][i][word] = data[word_type][word]
            i += 1
    # print(data_sorted)
    return data_sorted, index

# data, index = get_pos()
# pp.pprint(data)
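
# A minimal sketch of how get_pos() could feed a Flask view; `app` and
# 'pos.html' are hypothetical names, not part of this file:
#
#   from flask import Flask, render_template
#   app = Flask(__name__)
#
#   @app.route('/pos')
#   def pos_view():
#       data_sorted, index = get_pos()
#       return render_template('pos.html', data=data_sorted, index=index)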