diff --git a/generate_links.py b/generate_links.py
index c39fd89..d422054 100644
--- a/generate_links.py
+++ b/generate_links.py
@@ -13,11 +13,18 @@ for path, subdirs, files in os.walk(path):
             file = os.path.join(path, name)
             with open(file, 'r+', encoding="utf-8") as f:
                 textfile = f.read()
-                for word in wordlist_dict:
-                    word = re.escape(word)
-                    textfile = re.sub(r"(?<![\w-])(%s)(?![\w-])" % word, r"<a href='\1'>\1</a>", textfile)
+                words = re.compile("([\w-]+)").split(textfile)
+                words_to_search = wordlist_dict.keys()
+                for i, word in enumerate(words):
+                    if word in words_to_search:
+                        words[i] = "<a href='{}'>{}</a>".format(word, word)
+
+                textfile = "".join(words)
+                # for word in wordlist_dict:
+                #     word = re.escape(word)
+                #     textfile = re.sub(r"(?<![\w-])(%s)(?![\w-])" % word, r"<a href='\1'>\1</a>", textfile)
                 f.truncate(0)
                 f.write(textfile)
                 f.truncate()
-# print(textfile)
+print(textfile)
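
The change above replaces the per-word `re.sub` loop with a single split-and-rejoin pass, which also avoids substituting inside markup that an earlier iteration already inserted. A minimal, self-contained sketch of that technique (`link_words` is a helper name invented here; the dictionary contents and the href format are stand-ins, not the project's real data):

    import re

    wordlist_dict = {"nltk": "", "regex": ""}  # stand-in for the real dictionary

    def link_words(textfile, wordlist_dict):
        # Splitting with a capturing group keeps the separators in the result,
        # so "".join(words) reassembles the original text exactly.
        words = re.compile(r"([\w-]+)").split(textfile)
        words_to_search = wordlist_dict.keys()  # dict view: O(1) membership tests
        for i, word in enumerate(words):
            if word in words_to_search:
                words[i] = "<a href='{}'>{}</a>".format(word, word)
        return "".join(words)

    print(link_words("nltk and regex, combined.", wordlist_dict))
    # <a href='nltk'>nltk</a> and <a href='regex'>regex</a>, combined.

One caveat in the unchanged context lines: after `f.read()` the file position sits at the end of the file, so the rewrite needs `f.seek(0)` before `f.truncate(0)` and `f.write(textfile)`; otherwise the rewritten file can end up padded with null bytes up to the old offset.
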
", "
", "/body", "/html", "/head", "h2", "/h2", "h1", "/h1","”","“", "o", "Ca", "/", "[", "]", "(", ")", "&", "%", "l", "n't", "t", "T", "S", "—", "Ca", "M", "Q", "A"] +stopws = [",", ".", "?","!",":","(",")",">","<","@","#","``","/","–","''","‘","-","’", "DOCTYPE", "html", "!", "'", "
", "
", "/body", "/html", "/head", "h2", "/h2", "h1", "/h1","”","“", "o", "Ca", "/", "[", "]", "(", ")", "&", "%", "l", "n't", "t", "T", "S"] path = "static/files/" for path, subdirs, files in os.walk(path): @@ -28,7 +29,9 @@ keyword_list = [] with open('allhtml.txt') as f: content = f.read() - tokens = word_tokenize(content) + # tokens = word_tokenize(content) + tokens = re.compile("(?!-)[\W]+").split(content) + tokens.remove("") tokens = [token for token in tokens if token not in stopws] keyword_list = list(set(tokens)) @@ -45,15 +48,6 @@ def analysis(the_word, file_name): content = f.read() sent_tokens = sent_tokenize(content) new_sent_tokens = [] - if the_word[0]=="'": - the_word = the_word.replace(the_word[0], "") - for i in range(0, len(the_word)): - if the_word.endswith("."): - the_word = the_word[0:-1] - else: - break - if the_word=="Timidity++": - the_word = the_word = the_word[0:-2] re_word = r"\b" + re.escape(the_word) + r"\b" for sent_token in sent_tokens: if re.search(re_word, sent_token): @@ -67,7 +61,7 @@ def analysis(the_word, file_name): path = "static/files/" -for path, subdirs, files in os.walk(path): +for path, subdirs, files in tqdm(os.walk(path)): for name in files: if name.endswith('html'): file = os.path.join(path, name)