diff --git a/generate_links.py b/generate_links.py
index c39fd89..d422054 100644
--- a/generate_links.py
+++ b/generate_links.py
@@ -13,11 +13,18 @@ for path, subdirs, files in os.walk(path):
file = os.path.join(path, name)
with open(file, 'r+', encoding="utf-8") as f:
textfile = f.read()
- for word in wordlist_dict:
- word = re.escape(word)
- textfile = re.sub(r"(?<!\w)(" + word + r")(?!\w)", r'<a href="\1">\1</a>', textfile)
+ words = re.compile("([\w-]+)").split(textfile)
+ words_to_search = wordlist_dict.keys()
+ for i, word in enumerate(words):
+ if word in words_to_search:
+ words[i] = '<a href="{}">{}</a>'.format(word, word)
+
+ textfile = "".join(words)
+ # for word in wordlist_dict:
+ # word = re.escape(word)
+ # textfile = re.sub(r"(?<!\w)(" + word + r")(?!\w)", r'<a href="\1">\1</a>', textfile)
f.truncate(0)
f.write(textfile)
f.truncate()
-# print(textfile)
+print(textfile)
diff --git a/textedit.py b/textedit.py
index dc70951..214d0a6 100644
--- a/textedit.py
+++ b/textedit.py
@@ -1,16 +1,17 @@
import sys, os
-from nltk import sent_tokenize, word_tokenize
+from nltk import sent_tokenize
from nltk import everygrams
from nltk import FreqDist
import json
import re
+from tqdm import tqdm
"""
PART 1
We create the dictionary and save it.
"""
-stopws = [",", ".", "?","!",":","(",")",">","<","@","#","``","/","–","''","‘","-","’", "DOCTYPE", "html", "!", "'", "\n", "\n", "/body", "/html", "/head", "h2", "/h2", "h1", "/h1","”","“", "o", "Ca", "/", "[", "]", "(", ")", "&", "%", "l", "n't", "t", "T", "S", "—", "Ca", "M", "Q", "A"]
+stopws = [",", ".", "?","!",":","(",")",">","<","@","#","``","/","–","''","‘","-","’", "DOCTYPE", "html", "!", "'", "\n", "\n", "/body", "/html", "/head", "h2", "/h2", "h1", "/h1","”","“", "o", "Ca", "/", "[", "]", "(", ")", "&", "%", "l", "n't", "t", "T", "S"]
path = "static/files/"
for path, subdirs, files in os.walk(path):
@@ -28,7 +29,9 @@ keyword_list = []
with open('allhtml.txt') as f:
content = f.read()
- tokens = word_tokenize(content)
+ # tokens = word_tokenize(content)
+ tokens = re.compile("(?!-)[\W]+").split(content)
+ tokens.remove("")
tokens = [token for token in tokens if token not in stopws]
keyword_list = list(set(tokens))
@@ -45,15 +48,6 @@ def analysis(the_word, file_name):
content = f.read()
sent_tokens = sent_tokenize(content)
new_sent_tokens = []
- if the_word[0]=="'":
- the_word = the_word.replace(the_word[0], "")
- for i in range(0, len(the_word)):
- if the_word.endswith("."):
- the_word = the_word[0:-1]
- else:
- break
- if the_word=="Timidity++":
- the_word = the_word = the_word[0:-2]
re_word = r"\b" + re.escape(the_word) + r"\b"
for sent_token in sent_tokens:
if re.search(re_word, sent_token):
@@ -67,7 +61,7 @@ def analysis(the_word, file_name):
path = "static/files/"
-for path, subdirs, files in os.walk(path):
+for path, subdirs, files in tqdm(os.walk(path)):
for name in files:
if name.endswith('html'):
file = os.path.join(path, name)