annotation tools for making the iterations publication (Manetta & Jara) - https://iterations.space/
import os, json

from flask import Markup

import tfidf

from nltk import pos_tag, RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')  # initialize tokenizer: keep word characters only, dropping punctuation

import pprint

pp = pprint.PrettyPrinter(indent=4)
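
# Note (added): nltk.pos_tag() relies on the 'averaged_perceptron_tagger'
# model; if it is not installed yet, a one-time download is needed:
#
#   import nltk
#   nltk.download('averaged_perceptron_tagger')
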
def load_index():
    # build the index first if it does not exist yet
    if not os.path.isfile('index.json'):
        tfidf.create_index()
    with open('index.json') as f:
        index = json.load(f)
    return index
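
# (Added note, inferred from the code below) index.json is expected to map each
# document filename to a dict that includes a 'sentences' list, e.g.:
#
#   { 'document.txt': { 'sentences': ['First sentence.', 'Another one.'] } }
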
def get_random(x, y):
    from random import randint
    return randint(x, y)

def generate_random_rgb():
    # pick one random color, used below to tint a document's highlighted words
    r = get_random(0, 255)
    g = get_random(0, 255)
    b = get_random(0, 255)
    return r, g, b
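
# (Added note) randint() is inclusive on both ends, so each channel covers the
# full 8-bit range 0-255 expected by the CSS rgba() values built in get_pos().
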
def get_pos():
    # ---
    # Note: NLTK provides documentation for each tag,
    # which can be queried using the tag, e.g.
    # nltk.help.upenn_tagset('RB'), or a regular expression,
    # e.g. nltk.help.upenn_tagset('NN.*'). Some corpora
    # have README files with tagset documentation,
    # see nltk.corpus.???.readme(), substituting in the name
    # of the corpus. -- http://www.nltk.org/book/ch05.html
    # ---

    # The data dict built below holds, per tag group ('ADJ', 'PRE'):
    #
    # data[group] = {
    #     'word': {
    #         'count': 8,
    #         'sentences': {
    #             'filename': [
    #                 'This is a sentence.',
    #                 'This is another sentence.'
    #             ]
    #         }
    #     }
    # }

    index = load_index()
    sentences_all = [index[document]['sentences'] for document in index]
    data = {}
    data['ADJ'] = {}  # adjectives (JJ tags)
    data['PRE'] = {}  # prepositions and conjunctions (TO/IN tags)
    filenames = [filename for filename in index]
    # print(filenames)

    for i, sentences in enumerate(sentences_all):
        r, g, b = generate_random_rgb()  # one highlight color per document
        for sentence in sentences:
            pos = pos_tag(tokenizer.tokenize(sentence))
            # print(pos)
            for word, tag in pos:
                if 'JJ' in tag:
                    # ---
                    # JJ: adjective or numeral, ordinal
                    # For example: third ill-mannered pre-war regrettable oiled calamitous first separable
                    # ectoplasmic battery-powered participatory fourth still-to-be-named
                    # multilingual multi-disciplinary ...
                    # ---
                    if word.lower() not in data['ADJ']:
                        data['ADJ'][word.lower()] = {}
                    if 'sentences' not in data['ADJ'][word.lower()]:
                        data['ADJ'][word.lower()]['sentences'] = {}
                    if filenames[i] not in data['ADJ'][word.lower()]['sentences'].keys():
                        data['ADJ'][word.lower()]['sentences'][filenames[i]] = []
                    # wrap the word in a <strong> element tinted with the document's color
                    s = Markup(sentence.replace(word, '<strong class="query" style="color:rgba({r},{g},{b},1); background-image: radial-gradient(ellipse, rgba({r},{g},{b},0.4), rgba({r},{g},{b},0.2), transparent, transparent);">{word}</strong>'.format(r=r, b=b, g=g, word=word)))
                    if s not in data['ADJ'][word.lower()]['sentences'][filenames[i]]:
                        data['ADJ'][word.lower()]['sentences'][filenames[i]].append(s)

                if 'TO' in tag or 'IN' in tag:
                    # ---
                    # TO: "to" as preposition or infinitive marker
                    # For example: to
                    # ---
                    # IN: preposition or conjunction, subordinating
                    # For example: astride among uppon whether out inside pro despite on by throughout
                    # below within for towards near behind atop around if like until below
                    # next into if beside ...
                    # ---
                    if word.lower() not in data['PRE']:
                        data['PRE'][word.lower()] = {}
                    if 'sentences' not in data['PRE'][word.lower()]:
                        data['PRE'][word.lower()]['sentences'] = {}
                    if filenames[i] not in data['PRE'][word.lower()]['sentences'].keys():
                        data['PRE'][word.lower()]['sentences'][filenames[i]] = []
                    s = Markup(sentence.replace(word, '<strong class="query" style="color:rgba({r},{g},{b},1); background-image: radial-gradient(ellipse, rgba({r},{g},{b},0.4), rgba({r},{g},{b},0.2), transparent, transparent);">{word}</strong>'.format(r=r, b=b, g=g, word=word)))
                    if s not in data['PRE'][word.lower()]['sentences'][filenames[i]]:
                        data['PRE'][word.lower()]['sentences'][filenames[i]].append(s)
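                    # (Added note) In both branches above, str.replace()
                    # substitutes every occurrence of the matched substring,
                    # including inside longer words (e.g. 'on' within 'onto').
                    # A word-boundary regex would be stricter, e.g.:
                    #   re.sub(r'\b%s\b' % re.escape(word), replacement, sentence)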

    # count the number of matching sentences for each word
    for word_type, words in data.items():
        for word in words:
            # print(filenames)
            count = 0
            for filename, sentences in data[word_type][word]['sentences'].items():
                # print(filename)
                count += len(sentences)
            data[word_type][word]['count'] = count
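    # (Added note) 'count' is the number of highlighted sentences per word,
    # summed over all documents; duplicate markups of the same sentence are
    # skipped above, so repeats within one sentence are counted once.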

    # rank the words of each group by their counts
    data_sorted = {}
    for word_type, words in data.items():
        tmp = []
        for word in words:
            count = data[word_type][word]['count']
            tmp.append([count, word])
        i = 0
        tmp.sort(reverse=True)  # highest counts first
        print('tmp', tmp)
        for count, word in tmp:
            if word_type not in data_sorted:
                data_sorted[word_type] = {}
            data_sorted[word_type][i] = {}
            data_sorted[word_type][i][word] = data[word_type][word]
            i += 1

    print(data_sorted)
    return data_sorted, index

# data, index = get_pos()
# pp.pprint(data)
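
# Minimal usage sketch (added): run this module directly to print each word
# group ranked by frequency. Assumes index.json exists or tfidf.create_index()
# can build it; nothing below is part of the original interface.
if __name__ == '__main__':
    data_sorted, index = get_pos()
    for group, ranked in data_sorted.items():
        for rank in sorted(ranked):
            for word, entry in ranked[rank].items():
                print(group, rank, word, entry['count'])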