changes to fix bugs in generate_links

Cristina Cochior 2019-07-12 17:31:34 +02:00
parent ff882b1e08
commit 5ebd19852c
2 changed files with 18 additions and 17 deletions

View File

@@ -13,11 +13,18 @@ for path, subdirs, files in os.walk(path):
         file = os.path.join(path, name)
         with open(file, 'r+', encoding="utf-8") as f:
             textfile = f.read()
-            for word in wordlist_dict:
-                word = re.escape(word)
-                textfile = re.sub(r"(?<!<)(?<!</)(?<!ge\?)\b(%s)\b" %word, r"<a href='/diverge?search=\1'>\1</a>", textfile)
+            words = re.compile("([\w-]+)").split(textfile)
+            words_to_search = wordlist_dict.keys()
+            for i, word in enumerate(words):
+                if word in words_to_search:
+                    words[i] = "<a href='/diverge?search={}'>{}</a>".format(word, word)
+            textfile = "".join(words)
+            # for word in wordlist_dict:
+            #     word = re.escape(word)
+            #     textfile = re.sub(r"(?<!<)(?<!</)(?<!ge\?)\b(%s)\b" %word, r"<a href='/diverge?search=\1'>\1</a>", textfile)
+            f.truncate(0)
             f.write(textfile)
-            f.truncate()
-            # print(textfile)
+            print(textfile)
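The lookbehinds in the old `re.sub` pass appear to be guards against matching text inside the `<a href='/diverge?search=...'>` markup already inserted for earlier words; the rewritten pass sidesteps the problem by splitting once and visiting each token exactly once. A minimal sketch of the same technique, with a made-up `wordlist_dict` standing in for the dictionary the script actually loads:

```python
import re

# Hypothetical stand-in for the script's wordlist_dict.
wordlist_dict = {"diverge": [], "fork": []}

textfile = "Projects fork and diverge over time."

# Splitting on a capturing group keeps the delimiters in the result
# list, so "".join(words) rebuilds the text exactly.
words = re.compile(r"([\w-]+)").split(textfile)

for i, word in enumerate(words):
    if word in wordlist_dict:  # membership test on the dict's keys
        words[i] = "<a href='/diverge?search={}'>{}</a>".format(word, word)

print("".join(words))
# Projects <a href='/diverge?search=fork'>fork</a> and
# <a href='/diverge?search=diverge'>diverge</a> over time.
```

Because replacement happens per token rather than over the whole string, a word that ends up inside an inserted tag can never be matched a second time.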

View File

@@ -1,16 +1,17 @@
 import sys, os
-from nltk import sent_tokenize, word_tokenize
+from nltk import sent_tokenize
 from nltk import everygrams
 from nltk import FreqDist
 import json
+import re
 from tqdm import tqdm

 """
 PART 1
 We create the dictionary and save it.
 """

-stopws = [",", ".", "?","!",":","(",")",">","<","@","#","``","/","","''","","-","", "DOCTYPE", "html", "!", "'", "<br>", "<br />", "/body", "/html", "/head", "h2", "/h2", "h1", "/h1","","", "o", "Ca", "/", "[", "]", "(", ")", "&", "%", "l", "n't", "t", "T", "S", "", "Ca", "M", "Q", "A"]
+stopws = [",", ".", "?","!",":","(",")",">","<","@","#","``","/","","''","","-","", "DOCTYPE", "html", "!", "'", "<br>", "<br />", "/body", "/html", "/head", "h2", "/h2", "h1", "/h1","","", "o", "Ca", "/", "[", "]", "(", ")", "&", "%", "l", "n't", "t", "T", "S"]

 path = "static/files/"
 for path, subdirs, files in os.walk(path):
@@ -28,7 +29,9 @@ keyword_list = []
 with open('allhtml.txt') as f:
     content = f.read()
-    tokens = word_tokenize(content)
+    # tokens = word_tokenize(content)
+    tokens = re.compile("(?!-)[\W]+").split(content)
+    tokens.remove("")
     tokens = [token for token in tokens if token not in stopws]
     keyword_list = list(set(tokens))
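`word_tokenize` leaves artifacts such as `n't` and lone letters, which is plausibly why the stopword list above had to collect them one by one. The replacement pattern splits on any run of non-word characters unless the run starts with a hyphen, so hyphenated terms come through as single tokens. A quick illustration of how the pattern behaves:

```python
import re

content = "Don't split free-software tools, e.g. grep and sed."

tokens = re.compile(r"(?!-)[\W]+").split(content)
# A delimiter at the very start or end of the text leaves an empty
# string in the result; remove("") drops the first one it finds.
if "" in tokens:
    tokens.remove("")

print(tokens)
# ['Don', 't', 'split', 'free-software', 'tools', 'e', 'g', 'grep', 'and', 'sed']
```

Note that the `(?!-)` lookahead only stops a hyphen from starting a delimiter run; a hyphen preceded by whitespace or punctuation is still consumed as part of the run.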
@@ -45,15 +48,6 @@ def analysis(the_word, file_name):
     content = f.read()
     sent_tokens = sent_tokenize(content)
     new_sent_tokens = []
-    if the_word[0]=="'":
-        the_word = the_word.replace(the_word[0], "")
-    for i in range(0, len(the_word)):
-        if the_word.endswith("."):
-            the_word = the_word[0:-1]
-        else:
-            break
-    if the_word=="Timidity++":
-        the_word = the_word = the_word[0:-2]
     re_word = r"\b" + re.escape(the_word) + r"\b"
     for sent_token in sent_tokens:
         if re.search(re_word, sent_token):
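The nine deleted lines were compensating for `word_tokenize` artifacts: keywords arriving with a leading `'`, trailing dots, or, in one hard-coded case, `Timidity++`. The new splitter only ever emits word characters and hyphens, so that cleanup is dead code and the whole-word search stands on its own. A stripped-down sketch of that search, using a hypothetical helper that takes sentences directly instead of a file name (assumes NLTK's punkt data is installed):

```python
import re
from nltk import sent_tokenize

def matching_sentences(the_word, sentences):
    # \b anchors make the match whole-word: "cat" will not match
    # inside "Concatenation".
    re_word = r"\b" + re.escape(the_word) + r"\b"
    return [s for s in sentences if re.search(re_word, s)]

content = "The cat sat on the mat. Concatenation joins strings."
print(matching_sentences("cat", sent_tokenize(content)))
# ['The cat sat on the mat.']
```

The `Timidity++` special case existed because `\b` after a `+` can never match (neither side is a word character); since the new tokenizer can no longer produce a keyword ending in `+`, the pattern is safe as-is.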
@@ -67,7 +61,7 @@ def analysis(the_word, file_name):
 path = "static/files/"
-for path, subdirs, files in os.walk(path):
+for path, subdirs, files in tqdm(os.walk(path)):
     for name in files:
         if name.endswith('html'):
             file = os.path.join(path, name)
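`os.walk` is a generator with no length, so `tqdm` here shows a running count of directories visited (with a rate) rather than a percentage bar; that is still enough to confirm the crawl is moving. The same wrapping in isolation:

```python
import os
from tqdm import tqdm

path = "static/files/"
# tqdm wraps any iterable; with no known total it prints an
# updating count instead of a percentage bar.
for path, subdirs, files in tqdm(os.walk(path)):
    for name in files:
        if name.endswith('html'):
            print(os.path.join(path, name))
```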