cyber/technofeminist cross-reader
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

124 lines
3.5 KiB

import os, json, re
from math import log, exp
import nltk
from nltk import sent_tokenize
from nltk.tokenize import RegexpTokenizer
# Shared word tokenizer built from the pattern \w+; load_text_files() calls
# tokenizer.tokenize() on each manifesto's text to obtain its word list.
tokenizer = RegexpTokenizer(r'\w+') # initialize tokenizer
import pprint
# Pretty-printer with a 4-space indent. It is not referenced anywhere else in
# the visible part of this file (presumably kept around for debugging).
pp = pprint.PrettyPrinter(indent=4)
def tfidf(query, words, corpus):
    """Compute the TF-IDF score of *query* for one document within a corpus.

    Args:
        query: The term to score.
        words: The tokens of the document being scored (a sized sequence).
        corpus: A sized collection of documents, each a collection of tokens.

    Returns:
        A tuple ``(tf_count, idf_count, tfidf_value)`` where ``tf_count`` is
        the number of occurrences of *query* in *words*, ``idf_count`` is the
        number of documents in *corpus* that contain *query*, and
        ``tfidf_value`` is ``(tf_count / len(words)) * log(len(corpus) / idf_count)``.
        When *words* is empty or no document contains *query*, the undefined
        factor is treated as 0.0, so ``tfidf_value`` is 0.0 instead of raising
        ZeroDivisionError.
    """
    # Term Frequency: share of the document's tokens that equal the query.
    tf_count = sum(1 for word in words if query == word)
    total_words = len(words)
    tf = tf_count / total_words if total_words else 0.0

    # Inverse Document Frequency: log of (documents / documents containing query).
    # A separate loop name is used so the `words` parameter is not shadowed.
    idf_count = sum(1 for document in corpus if query in document)
    idf = log(len(corpus) / idf_count) if idf_count else 0.0

    tfidf_value = tf * idf
    # The second element is the document frequency (it used to repeat tf_count).
    return tf_count, idf_count, tfidf_value
def get_language(manifesto):
    """Return the lower-cased language tag written in square brackets in *manifesto*.

    The regex match (an opening '[' through the last ']' reachable on the same
    line) is taken, every '[' and ']' is removed from it, and the remainder is
    lower-cased. If the name contains no bracketed part, ``re.search`` returns
    None and the ``.group()`` call raises AttributeError.
    """
    bracketed = re.search(r'\[.*\]', manifesto, flags=re.IGNORECASE)
    tag = bracketed.group()
    # Strip both bracket characters wherever they occur in the match.
    for bracket in ('[', ']'):
        tag = tag.replace(bracket, '')
    return tag.lower()
def load_text_files():
    """Read every ``.txt`` manifesto in the ``txt`` directory and tokenize it.

    Directory entries are processed in sorted order; entries that do not end
    in ``.txt`` are skipped. The manifesto name is the filename without its
    ``.txt`` suffix, and its language comes from ``get_language`` (the
    combined tag ``en+de+nl+fr`` is mapped to ``en``).

    Returns:
        A tuple ``(files, corpus, sentences, wordlists, languages)``:
        files: list of manifesto names, in processing order.
        corpus: dict mapping a language to the list of word lists of all
            manifestos in that language.
        sentences: dict mapping a manifesto name to the list returned by
            ``sent_tokenize`` for its text.
        wordlists: dict mapping a manifesto name to its words, in reading order.
        languages: dict mapping a manifesto name to its language.
    """
    files = []
    corpus = {}
    sentences = {}
    wordlists = {}
    languages = {}
    directory = 'txt'
    suffix = '.txt'
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(suffix):
            continue  # only .txt files are manifestos
        # Strip the suffix only at the end, never '.txt' occurring mid-name.
        manifesto = filename[:-len(suffix)]
        # print('Manifesto:', manifesto)
        language = get_language(manifesto)
        if language == 'en+de+nl+fr':  # exception for OBN manifesto
            language = 'en'
        languages[manifesto] = language
        # print('Language:', language)
        # Context manager so the file handle is closed right after reading.
        with open('{}/{}.txt'.format(directory, manifesto), "r") as handle:
            lines = handle.read()  # full text of the .txt file
        # NOTE(review): as this file currently reads, the search string below is
        # empty, which makes str.replace insert '. ' between every character.
        # The custom line-break marker it was meant to match appears to have
        # been lost -- restore the intended marker here.
        lines = lines.replace('', '. ')  # turn custom linebreaks into full-stops to let the tokenizer recognize them as end-of-lines
        words = list(tokenizer.tokenize(lines))  # all words of one manifesto, in reading order
        wordlists[manifesto] = words
        corpus.setdefault(language, []).append(words)
        sentences[manifesto] = sent_tokenize(lines)
        files.append(manifesto)  # list of filenames
    print('\n*txt files loaded*')
    return files, corpus, sentences, wordlists, languages
def make_human_readable_name(manifesto):
    """Format a manifesto name as ``'<name> (<year>)'``.

    The year is the four digits the name starts with. Every occurrence of that
    year string is removed from the name, and underscores and dashes become
    spaces. No trimming is done, so a separator that followed the year is
    left as a leading space. A name that does not start with four digits
    raises AttributeError (``re.match`` returns None).
    """
    year_match = re.match(r'^\d\d\d\d', manifesto)
    year = year_match.group()
    title = manifesto.replace(year, '')
    # Turn both separator characters into plain spaces.
    for separator in ('_', '-'):
        title = title.replace(separator, ' ')
    return '{} ({})'.format(title, year)
def create_index():
    """Build the per-manifesto index and write it to ``index.json``.

    Loads all manifestos through ``load_text_files``, scores each distinct
    word of a manifesto with ``tfidf`` against the corpus of the same
    language, and dumps the result as JSON (4-space indent, sorted keys).
    Progress is printed to stdout.
    """
    files, corpus, sentences, wordlists, languages = load_text_files()
    index = {}

    # index = {
    #     Fem manifesto : {
    #         'sentences' : [ ... ],
    #         'tfidf' : {
    #             'aap': 39.2,
    #             'beer': 20.456,
    #             'citroen': 3.21
    #         },
    #         'name': 'Feminist Manifesto (2000)',
    #         'language': 'en'
    #     }
    # }

    for manifesto in files:
        print('---------')
        print('Manifesto:', manifesto)
        language = languages[manifesto]
        words = wordlists[manifesto]
        documents = corpus[language]

        # Score each distinct word once: the score depends only on the word,
        # so repeating the computation per occurrence yields the same entry.
        scores = {}
        for word in set(words):
            _, _, tfidf_value = tfidf(word, words, documents)
            scores[word] = tfidf_value

        index[manifesto] = {
            'sentences': sentences[manifesto],
            'tfidf': scores,  # always present, even when the manifesto has no words
            'name': make_human_readable_name(manifesto),
            'language': language,
        }

    # The with-statement closes the file; no explicit close() is needed.
    with open('index.json', 'w+') as out:
        out.write(json.dumps(index, indent=4, sort_keys=True))
    print('*index created*')
# create_index()