|
|
@ -1,16 +1,17 @@ |
|
|
|
import sys, os |
|
|
|
from nltk import sent_tokenize, word_tokenize |
|
|
|
from nltk import sent_tokenize |
|
|
|
from nltk import everygrams |
|
|
|
from nltk import FreqDist |
|
|
|
import json |
|
|
|
import re |
|
|
|
from tqdm import tqdm |
|
|
|
|
|
|
|
""" |
|
|
|
PART 1 |
|
|
|
We create the dictionary and save it. |
|
|
|
""" |
|
|
|
|
|
|
|
# Tokens that are discarded before the keyword list is built: punctuation,
# leftover HTML tag fragments and stray one- or two-letter tokenizer
# artefacts.
# NOTE: this used to be assigned twice in a row; only the second assignment
# ever took effect, so that value is the one kept here, entry for entry.
stopws = [
    ",", ".", "?", "!", ":", "(", ")", ">", "<", "@", "#", "``", "/", "–",
    "''", "‘", "-", "’", "DOCTYPE", "html", "!", "'", "<br>", "<br />",
    "/body", "/html", "/head", "h2", "/h2", "h1", "/h1", "”", "“", "o",
    "Ca", "/", "[", "]", "(", ")", "&", "%", "l", "n't", "t", "T", "S",
]
|
|
|
|
|
|
|
path = "static/files/" |
|
|
|
for path, subdirs, files in os.walk(path): |
|
|
@ -28,7 +29,9 @@ keyword_list = [] |
|
|
|
|
|
|
|
# Build the list of unique keywords from the concatenated HTML dump.
with open('allhtml.txt') as f:
    content = f.read()

# Split on runs of non-word characters; the negative lookahead stops a split
# from starting at a hyphen. Raw string so that `\W` reaches the regex engine
# untouched instead of being parsed as a (invalid) string escape.
# NLTK's word_tokenize was previously run here as well, but its result was
# overwritten immediately, so that call has been dropped.
token_splitter = re.compile(r"(?!-)[\W]+")

# A set gives O(1) stop-word membership tests instead of scanning the list
# for every token.
stop_set = set(stopws)

# Drop *every* empty string produced by a separator at the start or end of
# the text. `list.remove("")` removed only the first one and raised
# ValueError when there was none.
tokens = [
    token
    for token in token_splitter.split(content)
    if token and token not in stop_set
]

# De-duplicate; the order of the resulting list is not defined.
keyword_list = list(set(tokens))
|
|
|
|
|
|
@ -45,15 +48,6 @@ def analysis(the_word, file_name): |
|
|
|
content = f.read() |
|
|
|
sent_tokens = sent_tokenize(content) |
|
|
|
new_sent_tokens = [] |
|
|
|
if the_word[0]=="'": |
|
|
|
the_word = the_word.replace(the_word[0], "") |
|
|
|
for i in range(0, len(the_word)): |
|
|
|
if the_word.endswith("."): |
|
|
|
the_word = the_word[0:-1] |
|
|
|
else: |
|
|
|
break |
|
|
|
if the_word=="Timidity++": |
|
|
|
the_word = the_word = the_word[0:-2] |
|
|
|
re_word = r"\b" + re.escape(the_word) + r"\b" |
|
|
|
for sent_token in sent_tokens: |
|
|
|
if re.search(re_word, sent_token): |
|
|
@ -67,7 +61,7 @@ def analysis(the_word, file_name): |
|
|
|
|
|
|
|
|
|
|
|
path = "static/files/" |
|
|
|
for path, subdirs, files in os.walk(path): |
|
|
|
for path, subdirs, files in tqdm(os.walk(path)): |
|
|
|
for name in files: |
|
|
|
if name.endswith('html'): |
|
|
|
file = os.path.join(path, name) |
|
|
|