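# builds index.json: maps each .txt document in the txt/ folder to its list of sentences
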
import os, json, re

import nltk
from nltk import sent_tokenize
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+') # word-level tokenizer: matches runs of word characters
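
# sent_tokenize relies on NLTK's 'punkt' sentence model; if it is not yet
# present on this machine (an assumption about the local setup), download it once:
# nltk.download('punkt')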

import pprint
pp = pprint.PrettyPrinter(indent=4)

def load_text_files():
    files = []
    corpus = []     # initialized but not used yet
    sentences = {}
    wordlists = {}  # initialized but not used yet
    dir = 'txt'     # source folder (note: shadows the built-in dir())

    for document in sorted(os.listdir(dir)): # assumes txt/ contains only .txt files
        document = document.replace('.txt', '')
        with open('{}/{}.txt'.format(dir, document), 'r') as f:
            lines = f.read() # full text of the .txt file as a single string
        lines = lines.replace(' •', '. ') # turn the custom ' •' separators into full stops so sent_tokenize treats them as sentence boundaries
        s = sent_tokenize(lines)
        sentences[document] = s
        files.append(document) # collect the filenames (without extension)

    print('*txt files loaded*')
    return files, sentences
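
# return shape, e.g. (filenames here are hypothetical):
#   (['doc1', 'doc2'], {'doc1': ['First sentence.', 'Second one.'], 'doc2': [...]})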

def create_index():
    files, sentences = load_text_files()
    index = {}

    # index = {
    #     document : {
    #         'sentences' : []
    #     }
    # }

    for document in files:
        print('---------')
        print('document:', document)
        index[document] = {}
        index[document]['sentences'] = sentences[document]

    with open('index.json', 'w+') as out:
        out.write(json.dumps(index, indent=4, sort_keys=True)) # the with-block closes the file itself
    print('*index created*')

# create_index()
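
# minimal usage sketch, assuming a txt/ folder of .txt files next to this script:
# running the module directly builds index.json, then reads it back to show the result
# (the pp.pprint line stays commented out because 'example' is a hypothetical filename).
if __name__ == '__main__':
    create_index()
    with open('index.json') as f:
        index = json.load(f)
    print('indexed documents:', sorted(index.keys()))
    # pp.pprint(index['example']['sentences'][:3])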