# PushingScores/textedit.py
import json
import os
import re
import sys

from nltk import FreqDist, everygrams, sent_tokenize
from tqdm import tqdm
2019-04-17 18:36:12 +02:00
2019-05-04 16:27:50 +02:00
"""
PART 1
We create the dictionary and save it.
"""
2019-04-17 18:36:12 +02:00
2019-07-12 17:31:34 +02:00
stopws = [",", ".", "?","!",":","(",")",">","<","@","#","``","/","","''","","-","", "DOCTYPE", "html", "!", "'", "<br>", "<br />", "/body", "/html", "/head", "h2", "/h2", "h1", "/h1","","", "o", "Ca", "/", "[", "]", "(", ")", "&", "%", "l", "n't", "t", "T", "S"]
2019-04-17 18:36:12 +02:00
path = "static/files/"
for path, subdirs, files in os.walk(path):
for name in files:
if name.endswith('html'):
file = os.path.join(path, name)
2019-05-04 16:27:50 +02:00
total = open("allhtml.txt", "a")
2019-05-21 11:54:11 +02:00
with open(file, 'r+') as f:
2019-04-17 18:36:12 +02:00
content = f.read()
2019-05-04 16:27:50 +02:00
total.write(content)
total.close()
keyword_list = []
with open('allhtml.txt') as f:
content = f.read()
2019-07-12 17:31:34 +02:00
# tokens = word_tokenize(content)
tokens = re.compile("(?!-)[\W]+").split(content)
tokens.remove("")
2019-05-04 16:27:50 +02:00
tokens = [token for token in tokens if token not in stopws]
keyword_list = list(set(tokens))
"""
PART 2
We iterate through the entire collection of html files, tokenize the words, and check to see whether any of them is in the keyword_list. If they are, then we generate a json file.
"""
sentences_w_word = {}
def analysis(the_word, file_name):
id = file_name[13:15]
2019-05-21 11:54:11 +02:00
with open(file_name, 'r+') as f:
2019-05-04 16:27:50 +02:00
content = f.read()
sent_tokens = sent_tokenize(content)
new_sent_tokens = []
2019-05-31 02:11:51 +02:00
re_word = r"\b" + re.escape(the_word) + r"\b"
2019-05-04 16:27:50 +02:00
for sent_token in sent_tokens:
2019-05-31 10:02:01 +02:00
if re.search(re_word, sent_token):
2019-05-21 11:54:11 +02:00
new_sent_tokens.append({'id': id, 'sentence': sent_token.replace('\n', ' ').strip("'<>()“”")})
2019-05-04 16:27:50 +02:00
if the_word in sentences_w_word: # if this is not the first iteration
previous_sent_tokens = sentences_w_word[the_word]
full_sent_tokens = previous_sent_tokens + new_sent_tokens
else:
full_sent_tokens = new_sent_tokens
2019-05-21 11:54:11 +02:00
sentences_w_word[the_word] = full_sent_tokens
2019-05-04 16:27:50 +02:00
path = "static/files/"
2019-07-12 17:31:34 +02:00
for path, subdirs, files in tqdm(os.walk(path)):
2019-05-04 16:27:50 +02:00
for name in files:
if name.endswith('html'):
file = os.path.join(path, name)
for word in keyword_list:
analysis(word, file)
2019-05-21 11:54:11 +02:00
with open('wordlist.json', 'w') as outfile:
2019-05-04 16:27:50 +02:00
json.dump(sentences_w_word, outfile, ensure_ascii=False)