import sys, os

from nltk import sent_tokenize, word_tokenize
from nltk import everygrams
from nltk import FreqDist

import json
import re

# Note: sys and everygrams are imported but never used in this script.
"""
|
||
PART 1
|
||
We create the dictionary and save it.
|
||
"""
|
||
|
||
# Tokens to discard: punctuation plus fragments of HTML markup that
# word_tokenize leaves behind.
stopws = [",", ".", "?", "!", ":", "(", ")", ">", "<", "@", "#", "``", "/", "–",
          "''", "‘", "-", "’", "DOCTYPE", "html", "!", "'", "<br>", "<br />",
          "/body", "/html", "/head", "h2", "/h2", "h1", "/h1", "”", "“"]
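
# A possible refinement (sketch, not used below): rather than listing tag
# fragments as stopwords, the imported re module could strip markup before
# tokenizing. The regex is an assumption about how the files are structured.
def strip_tags(html):
    """Replace anything that looks like an HTML tag with a space."""
    return re.sub(r'<[^>]+>', ' ', html)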
# Concatenate every HTML file under static/files/ into allhtml.txt.
# Opening with "w" (rather than the original "a") keeps a re-run from
# appending the whole corpus a second time.
path = "static/files/"
with open("allhtml.txt", "w") as total:
    for root, subdirs, files in os.walk(path):
        for name in files:
            if name.endswith('html'):
                file_path = os.path.join(root, name)
                with open(file_path, 'r') as f:
                    total.write(f.read())
# Tokenize the combined text and keep each distinct non-stopword token
# as a keyword.
with open('allhtml.txt') as f:
    content = f.read()
tokens = [token for token in word_tokenize(content) if token not in stopws]
keyword_list = list(set(tokens))
# print(tokens)
# print(keyword_list)
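
# The unused FreqDist import suggests a frequency cutoff may have been
# considered. A sketch (assumption: the 500 most common tokens would serve
# as keywords) is:
fdist = FreqDist(tokens)
top_keywords = [w for w, _count in fdist.most_common(500)]  # not used below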
"""
|
||
PART 2
|
||
We iterate through the entire collection of html files, tokenize the words, and check to see whether any of them is in the keyword_list. If they are, then we generate a json file.
|
||
"""
|
||
|
||
# wordlist = {}
# avoiding_repetition = []

# Maps each keyword to the list of sentences (with source ids) it occurs in.
sentences_w_word = {}
def analysis(the_word, file_name):
    """Record every sentence in file_name that contains the_word."""
    # Relies on the fixed layout of paths like "static/files/XX...":
    # characters 13-14 are taken as the document id.
    doc_id = file_name[13:15]
    with open(file_name, 'r') as f:
        content = f.read()
    new_sent_tokens = []
    for sent_token in sent_tokenize(content):
        if the_word in sent_token:  # substring match, not a token match
            new_sent_tokens.append({'id': doc_id,
                                    'sentence': sent_token.replace('\n', ' ').strip("'<>()“”")})
    # Append to any sentences already collected for this word from earlier files.
    sentences_w_word[the_word] = sentences_w_word.get(the_word, []) + new_sent_tokens
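
# Performance note: analysis() re-reads and re-tokenizes each file once per
# keyword. A faster sketch (assumption: identical output shape) tokenizes each
# file once and tests every keyword against each sentence:
def analyse_file(file_name):
    doc_id = file_name[13:15]
    with open(file_name, 'r') as f:
        sentences = sent_tokenize(f.read())
    for sent in sentences:
        clean = sent.replace('\n', ' ').strip("'<>()“”")
        for word in keyword_list:
            if word in sent:
                sentences_w_word.setdefault(word, []).append({'id': doc_id, 'sentence': clean})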
# maybe ISO-8859-1 instead of utf8??
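# One way to act on the note above (sketch; assumes the corpus mixes UTF-8 and
# ISO-8859-1 files): try UTF-8 first and fall back to Latin-1, which accepts
# any byte sequence.
def read_with_fallback(file_name):
    try:
        with open(file_name, 'r', encoding='utf-8') as f:
            return f.read()
    except UnicodeDecodeError:
        with open(file_name, 'r', encoding='ISO-8859-1') as f:
            return f.read()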
# Walk the collection again and collect sentences for every keyword.
path = "static/files/"
for root, subdirs, files in os.walk(path):
    for name in files:
        if name.endswith('html'):
            file_path = os.path.join(root, name)
            for word in keyword_list:
                analysis(word, file_path)
# ensure_ascii=False keeps the non-ASCII quotation marks readable in the
# output, so the file itself should be written as UTF-8.
with open('wordlist.json', 'w', encoding='utf-8') as outfile:
    json.dump(sentences_w_word, outfile, ensure_ascii=False)
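
# The resulting wordlist.json maps each keyword to the sentences it occurs in,
# e.g. (illustrative values only):
# {"archive": [{"id": "01", "sentence": "The archive opened in 1998."}, ...]}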