
new stuff

master
Cristina Cochior, 6 years ago
commit f4cfa816d2

2 changed files:
  textedit.py (21 lines changed)
  wordlist.json (2 lines changed)

textedit.py:

@@ -10,7 +10,7 @@ PART 1
 We create the dictionary and save it.
 """
-stopws = [",", ".", "?","!",":","(",")",">","<","@","#","``","/","","''","","-","", "DOCTYPE", "html", "!", "'", "<br>", "<br />", "/body", "/html", "/head", "h2", "/h2", "h1", "/h1","","", "o", "Ca", "/"]
+stopws = [",", ".", "?","!",":","(",")",">","<","@","#","``","/","","''","","-","", "DOCTYPE", "html", "!", "'", "<br>", "<br />", "/body", "/html", "/head", "h2", "/h2", "h1", "/h1","","", "o", "Ca", "/", "[", "]", "(", ")", "&", "%", "l", "n't", "t", "T", "S", "", "Ca", "M", "Q", "A"]
 path = "static/files/"
 for path, subdirs, files in os.walk(path):
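
The bulk of this change extends stopws. NLTK's word_tokenize splits contractions, so fragments like "Ca" and "n't" (from "Can't") survive tokenization and pollute the keyword list, along with brackets and stray single letters. A minimal sketch of the Part 1 filtering step, assuming NLTK and its punkt tokenizer data are installed; the sample text and the shortened stopws are illustrative:

from nltk.tokenize import word_tokenize

stopws = ["n't", "Ca", "[", "]", "&", "%"]  # excerpt of the full list above

# word_tokenize splits contractions, which is why fragments like "Ca"
# and "n't" appear in stopws: word_tokenize("Can't") -> ['Ca', "n't"]
tokens = word_tokenize("Can't parse [this] & that")
keywords = list(set(t for t in tokens if t not in stopws))
print(keywords)  # e.g. ['parse', 'this', 'that'] (set order varies)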
@@ -31,18 +31,12 @@ with open('allhtml.txt') as f:
     tokens = word_tokenize(content)
     tokens = [token for token in tokens if token not in stopws]
     keyword_list = list(set(tokens))
-    # print(tokens)
-    # print(keyword_list)
 """
 PART 2
 We iterate through the entire collection of html files, tokenize the words, and check to see whether any of them is in the keyword_list. If they are, then we generate a json file.
 """
-# wordlist = {}
-# avoiding_repetition = []
 sentences_w_word = {}
 def analysis(the_word, file_name):
@@ -51,10 +45,16 @@ def analysis(the_word, file_name):
         content = f.read()
         sent_tokens = sent_tokenize(content)
         new_sent_tokens = []
-        # the_word = the_word.lower()
+        if the_word[0]=="'":
+            the_word = the_word.replace(the_word[0], "")
+        for i in range(0, len(the_word)):
+            if the_word.endswith("."):
+                the_word = the_word[0:-1]
+            else:
+                break
+        if the_word=="Timidity++":
+            the_word = the_word = the_word[0:-2]
         re_word = r"\b" + re.escape(the_word) + r"\b"
-        # print(re_word)
-        # print(the_word)
         for sent_token in sent_tokens:
             if re.search(re_word, sent_token):
                 new_sent_tokens.append({'id': id, 'sentence': sent_token.replace('\n', ' ').strip("'<>()“”")})
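
The block added here normalizes keywords before they become a regex: a leading apostrophe is stripped, trailing periods are trimmed, and "Timidity++" is special-cased because r"\b" can never match right after "+" (a word boundary needs a word character on one side), so the escaped pattern would match nothing. Two quirks of the committed version are worth flagging: str.replace removes every occurrence of the character rather than just the leading one, and the doubled "the_word = the_word =" assignment is redundant. A standalone sketch with those wrinkles smoothed out; clean_word is a hypothetical name:

def clean_word(the_word):
    # slicing drops only the leading apostrophe; the committed
    # the_word.replace(the_word[0], "") would drop every apostrophe
    if the_word.startswith("'"):
        the_word = the_word[1:]
    # same effect as the committed for/endswith loop: trim trailing periods
    while the_word.endswith("."):
        the_word = the_word[:-1]
    # r"\bTimidity\+\+\b" can never match, so drop the "++" up front
    if the_word == "Timidity++":
        the_word = the_word[:-2]
    return the_word

print(clean_word("'etc..."))     # -> etc
print(clean_word("Timidity++"))  # -> Timidity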
@@ -65,7 +65,6 @@ def analysis(the_word, file_name):
     full_sent_tokens = new_sent_tokens
     sentences_w_word[the_word] = full_sent_tokens
-# maybe ISO-8859-1 instead of utf8??
 path = "static/files/"
 for path, subdirs, files in os.walk(path):
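
The heart of analysis(), shown across the two hunks above, is the word-boundary search: re.escape neutralizes regex metacharacters in the keyword and the \b anchors restrict hits to whole words, so "MIDI" will not match inside a longer token. A small self-contained example of that core; the sample text is illustrative:

import re
from nltk.tokenize import sent_tokenize

content = "TiMidity++ is a software synthesizer. It plays MIDI files."
the_word = "MIDI"
re_word = r"\b" + re.escape(the_word) + r"\b"

# keep only the sentences in which the whole word occurs
matches = [s for s in sent_tokenize(content) if re.search(re_word, s)]
print(matches)  # ['It plays MIDI files.']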

wordlist.json:

File diff suppressed because one or more lines are too long.
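
The wordlist.json diff is suppressed, but from the append call in analysis() each keyword maps to a list of {'id': ..., 'sentence': ...} records. A hedged sketch of how the final pass could write that file; the .html filter and the json.dump target are assumptions, since neither the loop body nor the dump is visible in the diff:

import json
import os

sentences_w_word = {}  # filled per keyword by analysis(), as above

path = "static/files/"
for path, subdirs, files in os.walk(path):
    for name in files:
        if name.endswith(".html"):  # assumption: only the html files
            file_name = os.path.join(path, name)
            # analysis(word, file_name) would run here for each keyword

# each value: a list of {'id': ..., 'sentence': ...} dicts
with open("wordlist.json", "w") as f:
    json.dump(sentences_w_word, f, indent=2)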