import os, json
import pprint
from random import randint

from flask import Markup
from nltk import pos_tag, RegexpTokenizer

import tfidf

tokenizer = RegexpTokenizer(r'\w+')  # word tokenizer: splits on any non-alphanumeric character
pp = pprint.PrettyPrinter(indent=4)


def load_index():
    # build the index first if it does not exist yet
    if not os.path.isfile('index.json'):
        tfidf.create_index()
    with open('index.json') as f:
        index = json.load(f)
    return index


def get_random(x, y):
    return randint(x, y)


def generate_random_rgb():
    # one random color per document, used to highlight its words
    r = get_random(0, 255)
    g = get_random(0, 255)
    b = get_random(0, 255)
    return r, g, b


def get_pos():
    # ---
    # Note: NLTK provides documentation for each tag,
    # which can be queried using the tag, e.g.
    # nltk.help.upenn_tagset('RB'), or a regular expression,
    # e.g. nltk.help.upenn_tagset('NN.*'). Some corpora
    # have README files with tagset documentation,
    # see nltk.corpus.???.readme(), substituting in the name
    # of the corpus. -- http://www.nltk.org/book/ch05.html
    # ---
    # data {
    #     'word' : {
    #         'count' : 8,
    #         'sentences' : {
    #             'filename' : [
    #                 'This is a sentence.',
    #                 'This is another sentence.'
    #             ]
    #         }
    #     }
    # }
    index = load_index()
    filenames = list(index.keys())
    sentences_all = [index[filename]['sentences'] for filename in filenames]
    data = {}
    data['ADJ'] = {}
    data['PRE'] = {}
    # print(filenames)
    for i, sentences in enumerate(sentences_all):
        r, g, b = generate_random_rgb()
        for sentence in sentences:
            pos = pos_tag(tokenizer.tokenize(sentence))
            # print(pos)
            for word, tag in pos:
                if 'JJ' in tag:
                    # ---
                    # JJ: adjective or numeral, ordinal
                    # For example: third ill-mannered pre-war regrettable oiled
                    # calamitous first separable ectoplasmic battery-powered
                    # participatory fourth still-to-be-named multilingual
                    # multi-disciplinary ...
                    # ---
                    if word.lower() not in data['ADJ']:
                        data['ADJ'][word.lower()] = {}
                    if 'sentences' not in data['ADJ'][word.lower()]:
                        data['ADJ'][word.lower()]['sentences'] = {}
                    if filenames[i] not in data['ADJ'][word.lower()]['sentences'].keys():
                        data['ADJ'][word.lower()]['sentences'][filenames[i]] = []
                    # highlight the word in this document's color
                    # (the exact <span> template here is an assumption)
                    s = Markup(sentence.replace(word, '<span style="color: rgb({r}, {g}, {b})">{word}</span>'.format(r=r, g=g, b=b, word=word)))
                    if s not in data['ADJ'][word.lower()]['sentences'][filenames[i]]:
                        data['ADJ'][word.lower()]['sentences'][filenames[i]].append(s)
                if 'TO' in tag or 'IN' in tag:
                    # ---
                    # TO: "to" as preposition or infinitive marker
                    # For example: to
                    # ---
                    # IN: preposition or conjunction, subordinating
                    # For example: astride among uppon whether out inside pro despite on by throughout
                    # below within for towards near behind atop around if like until below
                    # next into if beside ...
                    # ---
                    if word.lower() not in data['PRE']:
                        data['PRE'][word.lower()] = {}
                    if 'sentences' not in data['PRE'][word.lower()]:
                        data['PRE'][word.lower()]['sentences'] = {}
                    if filenames[i] not in data['PRE'][word.lower()]['sentences'].keys():
                        data['PRE'][word.lower()]['sentences'][filenames[i]] = []
                    # same assumed <span> template as above
                    s = Markup(sentence.replace(word, '<span style="color: rgb({r}, {g}, {b})">{word}</span>'.format(r=r, g=g, b=b, word=word)))
                    if s not in data['PRE'][word.lower()]['sentences'][filenames[i]]:
                        data['PRE'][word.lower()]['sentences'][filenames[i]].append(s)

    # count the number of collected sentences for each word
    for word_type, words in data.items():
        for word in words:
            count = 0
            for filename, sentences in data[word_type][word]['sentences'].items():
                count += len(sentences)
            data[word_type][word]['count'] = count

    # order the words of each type by descending count
    data_sorted = {}
    for word_type, words in data.items():
        tmp = []
        for word in words:
            tmp.append([data[word_type][word]['count'], word])
        tmp.sort(reverse=True)
        # print('tmp', tmp)
        for i, (count, word) in enumerate(tmp):
            if word_type not in data_sorted:
                data_sorted[word_type] = {}
            data_sorted[word_type][i] = {}
            data_sorted[word_type][i][word] = data[word_type][word]
    # print(data_sorted)
    return data_sorted, index


# data, index = get_pos()
# pp.pprint(data)
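

# A minimal sketch of running this module directly, assuming index.json (or
# the tfidf module that builds it) is available in the working directory.
# It prints the five most frequent adjectives found across the indexed texts;
# data_sorted keys each word type by integer rank, so rank 0 is the most common.
if __name__ == '__main__':
    data, index = get_pos()
    adjectives = data.get('ADJ', {})
    for rank in range(min(5, len(adjectives))):
        for word, entry in adjectives[rank].items():
            print(rank, word, entry['count'])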