various cross-reading prototypes
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

49 lines
1.2 KiB

import os, json, re
import nltk
from nltk import sent_tokenize
from nltk.tokenize import RegexpTokenizer
# Module-level word tokenizer built from the pattern \w+ (runs of word
# characters). NOTE(review): not referenced by the functions visible in this
# part of the file -- confirm it is used elsewhere before removing.
tokenizer = RegexpTokenizer(r'\w+') # initialize tokenizer
import pprint
# Module-level pretty-printer with a 4-space indent, for debugging output.
# NOTE(review): also not referenced by the functions visible here.
pp = pprint.PrettyPrinter(indent=4)
def load_text_files(directory='txt'):
    """Read every ``.txt`` file in *directory* and split it into sentences.

    Args:
        directory: Folder to scan for ``.txt`` files. Defaults to ``'txt'``,
            the location that used to be hard-coded.

    Returns:
        A ``(files, sentences)`` tuple. ``files`` is the list of document
        names (filenames without the ``.txt`` suffix) in sorted filename
        order; ``sentences`` maps each document name to the list produced
        by ``sent_tokenize`` for that file's text.

    Raises:
        OSError: If *directory* cannot be listed or a file cannot be read.
    """
    suffix = '.txt'
    files = []
    sentences = {}
    for filename in sorted(os.listdir(directory)):
        # Only .txt files are documents; other directory entries (hidden
        # files, editor backups, ...) are skipped instead of being opened
        # under a made-up '<name>.txt' path.
        if not filename.endswith(suffix):
            continue
        # Strip only the trailing suffix, so a name that contains '.txt'
        # somewhere in the middle is left intact.
        document = filename[:-len(suffix)]
        # The context manager closes the file handle as soon as it is read.
        with open(os.path.join(directory, filename), "r") as handle:
            text = handle.read()
        # turn custom linebreaks into full-stops to let the tokenizer
        # recognize them as end-of-lines
        # NOTE(review): the search string reads as empty in this copy of the
        # file; an invisible line-break marker may have been lost. With a
        # truly empty search string, str.replace inserts '. ' between every
        # character -- confirm the intended marker.
        text = text.replace('', '. ')
        sentences[document] = sent_tokenize(text)
        files.append(document) # list of filenames
    print('*txt files loaded*')
    return files, sentences
def create_index():
    """Build the per-document sentence index and save it as ``index.json``.

    Loads the documents through ``load_text_files()``, prints a short
    progress line for each one, and writes the resulting mapping to
    ``index.json`` (relative to the current working directory) as JSON
    with a 4-space indent and sorted keys.

    Returns:
        None. The result is the ``index.json`` file on disk.
    """
    files, sentences = load_text_files()
    # Shape of the index:
    # index = {
    #     document : {
    #         'sentences' : []
    #     }
    # }
    index = {}
    for document in files:
        print('---------')
        print('document:', document)
        index[document] = {'sentences': sentences[document]}
    # The with-statement closes the file on exit, so no explicit close() is
    # needed; plain 'w' is enough because the file is never read back here.
    with open('index.json', 'w') as out:
        out.write(json.dumps(index, indent=4, sort_keys=True))
    print('*index created*')
# create_index()