import os, json, re
import nltk
from nltk import sent_tokenize
from nltk.tokenize import RegexpTokenizer

# nltk.download('punkt')  # sent_tokenize needs the punkt tokenizer data (download once)

tokenizer = RegexpTokenizer(r'\w+')  # initialize word tokenizer
import pprint
pp = pprint.PrettyPrinter(indent=4)


def load_text_files():
    files = []
    corpus = []      # placeholder, not used in this step
    sentences = {}
    wordlists = {}   # placeholder, not used in this step
    directory = 'txt'
    for document in sorted(os.listdir(directory)):
        document = document.replace('.txt', '')
        with open('{}/{}.txt'.format(directory, document), 'r') as f:
            text = f.read()  # full contents of the .txt file as a single string
        # turn the custom bullet linebreaks into full stops, so the tokenizer
        # recognizes them as sentence boundaries
        text = text.replace(' •', '. ')
        sentences[document] = sent_tokenize(text)
        files.append(document)  # list of filenames (without extension)
    print('*txt files loaded*')
    return files, sentences


def create_index():
    files, sentences = load_text_files()
    index = {}
    # index = {
    #     document: {
    #         'sentences': []
    #     }
    # }
    for document in files:
        print('---------')
        print('document:', document)
        index[document] = {}
        index[document]['sentences'] = sentences[document]
    with open('index.json', 'w') as out:
        out.write(json.dumps(index, indent=4, sort_keys=True))
    print('*index created*')


# create_index()
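
# --- Usage sketch ---
# A minimal example of reading the generated index back in, assuming
# create_index() has been run and index.json sits next to this script.
# preview_index is a hypothetical helper added here for illustration,
# not part of the original script: it reports how many sentences were
# indexed per document and pretty-prints the first few as a sanity check.
def preview_index(path='index.json', n=3):
    with open(path) as f:
        index = json.load(f)
    for document, data in index.items():
        print('{}: {} sentences'.format(document, len(data['sentences'])))
        pp.pprint(data['sentences'][:n])

# preview_index()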