annotation tools for making the iterations publication (Manetta & Jara) - https://iterations.space/
import os, json

from flask import Markup

import tfidf

from nltk import pos_tag, RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')  # initialize tokenizer: keep word characters only, dropping punctuation

import pprint

pp = pprint.PrettyPrinter(indent=4)
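
# Note (added): nltk.pos_tag() relies on the 'averaged_perceptron_tagger'
# model; if it is not installed yet, a one-time download is needed:
#
#   import nltk
#   nltk.download('averaged_perceptron_tagger')
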
def load_index():
    # build the index first if it does not exist yet
    if not os.path.isfile('index.json'):
        tfidf.create_index()
    with open('index.json') as f:
        index = json.load(f)
    return index
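
# (Added note, inferred from the code below) index.json is expected to map each
# document filename to a dict that includes a 'sentences' list, e.g.:
#
#   { 'document.txt': { 'sentences': ['First sentence.', 'Another one.'] } }
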
def get_random(x, y):
    from random import randint
    return randint(x, y)

def generate_random_rgb():
    # pick one random color, used below to tint a document's highlighted words
    r = get_random(0, 255)
    g = get_random(0, 255)
    b = get_random(0, 255)
    return r, g, b
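
# (Added note) randint() is inclusive on both ends, so each channel covers the
# full 8-bit range 0-255 expected by the CSS rgba() values built in get_pos().
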
def get_pos():
    # ---
    # Note: NLTK provides documentation for each tag,
    # which can be queried using the tag, e.g.
    # nltk.help.upenn_tagset('RB'), or a regular expression,
    # e.g. nltk.help.upenn_tagset('NN.*'). Some corpora
    # have README files with tagset documentation,
    # see nltk.corpus.???.readme(), substituting in the name
    # of the corpus. -- http://www.nltk.org/book/ch05.html
    # ---

    # The data dict built below holds, per tag group ('ADJ', 'PRE'):
    #
    # data[group] = {
    #     'word': {
    #         'count': 8,
    #         'sentences': {
    #             'filename': [
    #                 'This is a sentence.',
    #                 'This is another sentence.'
    #             ]
    #         }
    #     }
    # }

    index = load_index()
    sentences_all = [index[document]['sentences'] for document in index]
    data = {}
    data['ADJ'] = {}  # adjectives (JJ tags)
    data['PRE'] = {}  # prepositions and conjunctions (TO/IN tags)
    filenames = [filename for filename in index]
    # print(filenames)

    for i, sentences in enumerate(sentences_all):
        r, g, b = generate_random_rgb()  # one highlight color per document
        for sentence in sentences:
            pos = pos_tag(tokenizer.tokenize(sentence))
            # print(pos)
            for word, tag in pos:
                if 'JJ' in tag:
                    # ---
                    # JJ: adjective or numeral, ordinal
                    # For example: third ill-mannered pre-war regrettable oiled calamitous first separable
                    # ectoplasmic battery-powered participatory fourth still-to-be-named
                    # multilingual multi-disciplinary ...
                    # ---
                    if word.lower() not in data['ADJ']:
                        data['ADJ'][word.lower()] = {}
                    if 'sentences' not in data['ADJ'][word.lower()]:
                        data['ADJ'][word.lower()]['sentences'] = {}
                    if filenames[i] not in data['ADJ'][word.lower()]['sentences'].keys():
                        data['ADJ'][word.lower()]['sentences'][filenames[i]] = []
                    # wrap the word in a <strong> element tinted with the document's color
                    s = Markup(sentence.replace(word, '<strong class="query" style="color:rgba({r},{g},{b},1); background-image: radial-gradient(ellipse, rgba({r},{g},{b},0.4), rgba({r},{g},{b},0.2), transparent, transparent);">{word}</strong>'.format(r=r, b=b, g=g, word=word)))
                    if s not in data['ADJ'][word.lower()]['sentences'][filenames[i]]:
                        data['ADJ'][word.lower()]['sentences'][filenames[i]].append(s)

                if 'TO' in tag or 'IN' in tag:
                    # ---
                    # TO: "to" as preposition or infinitive marker
                    # For example: to
                    # ---
                    # IN: preposition or conjunction, subordinating
                    # For example: astride among uppon whether out inside pro despite on by throughout
                    # below within for towards near behind atop around if like until below
                    # next into if beside ...
                    # ---
                    if word.lower() not in data['PRE']:
                        data['PRE'][word.lower()] = {}
                    if 'sentences' not in data['PRE'][word.lower()]:
                        data['PRE'][word.lower()]['sentences'] = {}
                    if filenames[i] not in data['PRE'][word.lower()]['sentences'].keys():
                        data['PRE'][word.lower()]['sentences'][filenames[i]] = []
                    s = Markup(sentence.replace(word, '<strong class="query" style="color:rgba({r},{g},{b},1); background-image: radial-gradient(ellipse, rgba({r},{g},{b},0.4), rgba({r},{g},{b},0.2), transparent, transparent);">{word}</strong>'.format(r=r, b=b, g=g, word=word)))
                    if s not in data['PRE'][word.lower()]['sentences'][filenames[i]]:
                        data['PRE'][word.lower()]['sentences'][filenames[i]].append(s)
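                    # (Added note) In both branches above, str.replace()
                    # substitutes every occurrence of the matched substring,
                    # including inside longer words (e.g. 'on' within 'onto').
                    # A word-boundary regex would be stricter, e.g.:
                    #   re.sub(r'\b%s\b' % re.escape(word), replacement, sentence)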

    # count the number of matching sentences for each word
    for word_type, words in data.items():
        for word in words:
            # print(filenames)
            count = 0
            for filename, sentences in data[word_type][word]['sentences'].items():
                # print(filename)
                count += len(sentences)
            data[word_type][word]['count'] = count
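    # (Added note) 'count' is the number of highlighted sentences per word,
    # summed over all documents; duplicate markups of the same sentence are
    # skipped above, so repeats within one sentence are counted once.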

    # rank the words of each group by their counts
    data_sorted = {}
    for word_type, words in data.items():
        tmp = []
        for word in words:
            count = data[word_type][word]['count']
            tmp.append([count, word])
        i = 0
        tmp.sort(reverse=True)  # highest counts first
        print('tmp', tmp)
        for count, word in tmp:
            if word_type not in data_sorted:
                data_sorted[word_type] = {}
            data_sorted[word_type][i] = {}
            data_sorted[word_type][i][word] = data[word_type][word]
            i += 1

    print(data_sorted)
    return data_sorted, index

# data, index = get_pos()
# pp.pprint(data)
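
# Minimal usage sketch (added): run this module directly to print each word
# group ranked by frequency. Assumes index.json exists or tfidf.create_index()
# can build it; nothing below is part of the original interface.
if __name__ == '__main__':
    data_sorted, index = get_pos()
    for group, ranked in data_sorted.items():
        for rank in sorted(ranked):
            for word, entry in ranked[rank].items():
                print(group, rank, word, entry['count'])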