import os, json, re

from math import log, exp

import nltk
from nltk import sent_tokenize
from nltk.tokenize import RegexpTokenizer
# nltk.download('punkt')  # run once: sent_tokenize needs the 'punkt' sentence tokenizer data

tokenizer = RegexpTokenizer(r'\w+')  # initialize word tokenizer: keeps word characters only, drops punctuation

import pprint

pp = pprint.PrettyPrinter(indent=4)

def tfidf(query, words, corpus):
    """Return (term frequency, document frequency, tf-idf) for `query`,
    where `words` is the word list of one document and `corpus` is the
    list of word lists of all documents."""

    # Term Frequency: how often does the query occur in this document?
    tf_count = 0
    for word in words:
        if query == word:
            tf_count += 1
    tf = tf_count / len(words)
    # print('TF count:', tf_count)
    # print('Total number of words:', len(words))
    # print('TF - count/total', tf_count/len(words))

    # Inverse Document Frequency: in how many documents does the query occur?
    idf_count = 0
    for document_words in corpus:
        if query in document_words:
            idf_count += 1
    # print('count:', idf_count)
    idf = log(len(corpus) / idf_count)
    # print('Total number of documents:', len(corpus))
    # print('documents/count', len(corpus)/idf_count)
    # print('IDF - log(documents/count)', log(len(corpus)/idf_count))

    tfidf_value = tf * idf
    # print('TF-IDF:', tfidf_value)

    return tf, idf_count, tfidf_value

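# A commented-out sanity check for tfidf() with made-up toy data (the terms
# echo the placeholder words in the index sketch inside create_index below):
#
#   toy_corpus = [['aap', 'beer', 'beer'], ['citroen', 'aap']]
#   tf, df, score = tfidf('beer', toy_corpus[0], toy_corpus)
#   # tf = 2/3, df = 1 (only the first document contains 'beer'),
#   # score = (2/3) * log(2/1) ≈ 0.462
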
def get_language(document):
    """Extract a bracketed language tag (e.g. '[en]') from a document name;
    fall back to 'undefined' when no tag is present."""
    match = re.search(r'\[.*\]', document, flags=re.IGNORECASE)
    if match:
        language = match.group().replace('[', '').replace(']', '').lower()
    else:
        language = 'undefined'
    return language

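# Commented-out example with hypothetical document names:
#
#   get_language('some-document_[EN]')   # -> 'en'
#   get_language('untagged-document')    # -> 'undefined'
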
def load_text_files():
    files = []
    corpus = []
    sentences = {}
    wordlists = {}
    directory = 'txt'

    for document in sorted(os.listdir(directory)):
        document = document.replace('.txt', '')
        # print('document:', document)
        with open('{}/{}.txt'.format(directory, document), 'r') as f:
            lines = f.read()  # full text of the .txt file as one string
        lines = lines.replace(' •', '. ')  # turn custom linebreaks into full stops so the sentence tokenizer treats them as sentence ends
        words = [word.lower() for word in tokenizer.tokenize(lines)]  # all words of one document, in reading order and lowercased (important!)
        wordlists[document] = words
        corpus.append(words)
        s = sent_tokenize(lines)
        sentences[document] = s
        files.append(document)  # list of filenames (without the .txt extension)

    print('---------')
    print('*txt files loaded*')
    return files, corpus, sentences, wordlists

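# Assumed on-disk layout (not enforced here): a ./txt/ folder next to this
# script, one plain-text document per .txt file, with the language tag in the
# filename, e.g. 'some-document_[EN].txt'. The stripped filename becomes the
# document key used throughout the index.
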
def make_human_readable_name(document):
    name = document.replace('_', ' ').replace('-', ' ')
    return name

def create_index():
    files, corpus, sentences, wordlists = load_text_files()
    index = {}

    # index = {
    #     'Fem document' : {
    #         'sentences' : [],
    #         'tf' : {
    #             'aap': 4,
    #             'beer': 6,
    #             'citroen': 2
    #         },
    #         'idf' : {
    #             'aap': 2,
    #             'beer': 1,
    #             'citroen': 5
    #         },
    #         'tfidf' : {
    #             'aap': 39.2,
    #             'beer': 20.456,
    #             'citroen': 3.21
    #         },
    #         'name': 'Feminist document (2000)',
    #         'language': 'en'
    #     }
    # }

    for document in files:
        print('---------')
        print('document:', document)
        index[document] = {}
        index[document]['sentences'] = sentences[document]
        words = wordlists[document]
        for word in words:
            tf, idf_count, tfidf_value = tfidf(word, words, corpus)
            if 'tf' not in index[document]:
                index[document]['tf'] = {}
            index[document]['tf'][word] = tf
            if 'idf' not in index[document]:
                index[document]['idf'] = {}
            index[document]['idf'][word] = idf_count
            if 'tfidf' not in index[document]:
                index[document]['tfidf'] = {}
            index[document]['tfidf'][word] = tfidf_value
        index[document]['language'] = get_language(document)
        index[document]['name'] = make_human_readable_name(document)

    with open('index.json', 'w+') as out:
        out.write(json.dumps(index, indent=4, sort_keys=True))

    print('---------')
    print('*index created*')
    print('---------')


# create_index()
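# To build the index, uncomment the call above (or import this module and call
# create_index()); the result is written to index.json in the working directory.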