new stuff

This commit is contained in:
parent 20a4c5e4d5
commit f4cfa816d2

textedit.py (21 lines changed)
```diff
@@ -10,7 +10,7 @@ PART 1
 We create the dictionary and save it.
 """
 
-stopws = [",", ".", "?","!",":","(",")",">","<","@","#","``","/","–","''","‘","-","’", "DOCTYPE", "html", "!", "'", "<br>", "<br />", "/body", "/html", "/head", "h2", "/h2", "h1", "/h1","”","“", "o", "Ca", "/"]
+stopws = [",", ".", "?","!",":","(",")",">","<","@","#","``","/","–","''","‘","-","’", "DOCTYPE", "html", "!", "'", "<br>", "<br />", "/body", "/html", "/head", "h2", "/h2", "h1", "/h1","”","“", "o", "Ca", "/", "[", "]", "(", ")", "&", "%", "l", "n't", "t", "T", "S", "—", "Ca", "M", "Q", "A"]
 
 path = "static/files/"
 for path, subdirs, files in os.walk(path):
```
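The only change in this hunk widens `stopws`, the token blacklist applied after NLTK tokenization; the new entries are mostly punctuation and single letters that survive `word_tokenize`. A minimal sketch of the PART 1 filtering step, assuming NLTK with its `punkt` data installed; the sample sentence and the abbreviated stopword list are illustrative, not from the repo:

```python
# Minimal sketch of the PART 1 filter; the text and short stopws list are made up.
from nltk.tokenize import word_tokenize  # requires nltk.download('punkt')

stopws = [",", ".", "?", "!", ":", "&", "%"]

content = "Hello, world! A test & a demo: 99%"
tokens = word_tokenize(content)                             # punctuation becomes separate tokens
tokens = [token for token in tokens if token not in stopws]
keyword_list = list(set(tokens))                            # de-duplicate; order not preserved
print(keyword_list)  # e.g. ['demo', '99', 'world', 'test', 'a', 'A', 'Hello']
```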
```diff
@@ -31,18 +31,12 @@ with open('allhtml.txt') as f:
     tokens = word_tokenize(content)
     tokens = [token for token in tokens if token not in stopws]
     keyword_list = list(set(tokens))
-    # print(tokens)
-    # print(keyword_list)
 
 """
 PART 2
 We iterate through the entire collection of html files, tokenize the words, and check to see whether any of them is in the keyword_list. If they are, then we generate a json file.
 """
 
-# wordlist = {}
-# avoiding_repetition = []
-
-
 sentences_w_word = {}
 
 def analysis(the_word, file_name):
```
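This hunk only removes commented-out debug lines; the PART 2 docstring describes the driver that walks the collection, tokenizes, checks membership in `keyword_list`, and emits JSON, but that loop body is not visible here. The following is a hedged sketch of that flow: the `.html` filter, the call into `analysis()`, and the output file name are assumptions, and `keyword_list`, `analysis`, and `sentences_w_word` come from the script above:

```python
# Hedged sketch of the PART 2 driver; only the os.walk() line is confirmed by
# the diff. The html filter, analysis() call, and json dump are assumptions.
import json
import os

path = "static/files/"
for path, subdirs, files in os.walk(path):   # reuses/shadows `path`, as the script does
    for name in files:
        if name.endswith(".html"):           # assumption: only the html collection
            file_name = os.path.join(path, name)
            for the_word in keyword_list:    # keywords built in PART 1
                analysis(the_word, file_name)  # fills sentences_w_word

with open("sentences_w_word.json", "w") as out:  # hypothetical output name
    json.dump(sentences_w_word, out, indent=2)
```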
```diff
@@ -51,10 +45,16 @@ def analysis(the_word, file_name):
         content = f.read()
     sent_tokens = sent_tokenize(content)
     new_sent_tokens = []
-    # the_word = the_word.lower()
+    if the_word[0] == "'":
+        the_word = the_word.replace(the_word[0], "")
+    for i in range(0, len(the_word)):
+        if the_word.endswith("."):
+            the_word = the_word[0:-1]
+        else:
+            break
+    if the_word == "Timidity++":
+        the_word = the_word[0:-2]
     re_word = r"\b" + re.escape(the_word) + r"\b"
-    # print(re_word)
-    # print(the_word)
     for sent_token in sent_tokens:
         if re.search(re_word, sent_token):
             new_sent_tokens.append({'id': id, 'sentence': sent_token.replace('\n', ' ').strip("'<>()“”")})
```
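The new normalisation block strips a leading apostrophe and trailing full stops, presumably because `word_tokenize` leaves such characters attached to tokens (note that `replace()` as written removes every apostrophe, not only the leading one). The `Timidity++` special case falls out of how `\b` works: a word boundary needs a word character on one side, so a pattern ending in an escaped `+` cannot match before a space. A small check:

```python
import re

sentence = "Timidity++ is a software synthesizer."

# \b after an escaped "+" fails here: "+" and the following space are both
# non-word characters, so there is no boundary for \b to match.
print(bool(re.search(r"\b" + re.escape("Timidity++") + r"\b", sentence)))  # False

# Stripping the trailing "++" first, as the hunk does, makes the search work.
print(bool(re.search(r"\b" + re.escape("Timidity") + r"\b", sentence)))    # True
```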
```diff
@@ -65,7 +65,6 @@ def analysis(the_word, file_name):
     full_sent_tokens = new_sent_tokens
     sentences_w_word[the_word] = full_sent_tokens
 
-# maybe ISO-8859-1 instead of utf8??
 
 path = "static/files/"
 for path, subdirs, files in os.walk(path):
```
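This last hunk only deletes the stale `# maybe ISO-8859-1 instead of utf8??` note. If some files in the collection really do fail to decode as UTF-8, a per-file fallback is one common pattern; this helper is an illustration, not part of the commit:

```python
# Illustrative fallback reader, not in the commit: try UTF-8 first, then fall
# back to ISO-8859-1, which maps every byte value and therefore never raises.
def read_text(file_name):
    try:
        with open(file_name, encoding="utf-8") as f:
            return f.read()
    except UnicodeDecodeError:
        with open(file_name, encoding="iso-8859-1") as f:
            return f.read()
```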
|
File diff suppressed because one or more lines are too long