changes to fix bugs in generate_links

Cristina Cochior 2019-07-12 17:31:34 +02:00
parent ff882b1e08
commit 5ebd19852c
2 changed files with 18 additions and 17 deletions

View File

@@ -13,11 +13,18 @@ for path, subdirs, files in os.walk(path):
         file = os.path.join(path, name)
         with open(file, 'r+', encoding="utf-8") as f:
             textfile = f.read()
-            for word in wordlist_dict:
-                word = re.escape(word)
-                textfile = re.sub(r"(?<!<)(?<!</)(?<!ge\?)\b(%s)\b" %word, r"<a href='/diverge?search=\1'>\1</a>", textfile)
+            words = re.compile("([\w-]+)").split(textfile)
+            words_to_search = wordlist_dict.keys()
+            for i, word in enumerate(words):
+                if word in words_to_search:
+                    words[i] = "<a href='/diverge?search={}'>{}</a>".format(word, word)
+            textfile = "".join(words)
+            # for word in wordlist_dict:
+            #     word = re.escape(word)
+            #     textfile = re.sub(r"(?<!<)(?<!</)(?<!ge\?)\b(%s)\b" %word, r"<a href='/diverge?search=\1'>\1</a>", textfile)
+            f.truncate(0)
             f.write(textfile)
-            f.truncate()
-            # print(textfile)
+            print(textfile)
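The lookbehinds in the old `re.sub` pass appear to be guards against matching text inside the `<a href='/diverge?search=...'>` markup already inserted for earlier words; the rewritten pass sidesteps the problem by splitting once and visiting each token exactly once. A minimal sketch of the same technique, with a made-up `wordlist_dict` standing in for the dictionary the script actually loads:

```python
import re

# Hypothetical stand-in for the script's wordlist_dict.
wordlist_dict = {"diverge": [], "fork": []}

textfile = "Projects fork and diverge over time."

# Splitting on a capturing group keeps the delimiters in the result
# list, so "".join(words) rebuilds the text exactly.
words = re.compile(r"([\w-]+)").split(textfile)

for i, word in enumerate(words):
    if word in wordlist_dict:  # membership test on the dict's keys
        words[i] = "<a href='/diverge?search={}'>{}</a>".format(word, word)

print("".join(words))
# Projects <a href='/diverge?search=fork'>fork</a> and
# <a href='/diverge?search=diverge'>diverge</a> over time.
```

Because replacement happens per token rather than over the whole string, a word that ends up inside an inserted tag can never be matched a second time.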

View File

@@ -1,16 +1,17 @@
 import sys, os
-from nltk import sent_tokenize, word_tokenize
+from nltk import sent_tokenize
 from nltk import everygrams
 from nltk import FreqDist
 import json
+import re
 from tqdm import tqdm

 """
 PART 1
 We create the dictionary and save it.
 """

-stopws = [",", ".", "?","!",":","(",")",">","<","@","#","``","/","","''","","-","", "DOCTYPE", "html", "!", "'", "<br>", "<br />", "/body", "/html", "/head", "h2", "/h2", "h1", "/h1","","", "o", "Ca", "/", "[", "]", "(", ")", "&", "%", "l", "n't", "t", "T", "S", "", "Ca", "M", "Q", "A"]
+stopws = [",", ".", "?","!",":","(",")",">","<","@","#","``","/","","''","","-","", "DOCTYPE", "html", "!", "'", "<br>", "<br />", "/body", "/html", "/head", "h2", "/h2", "h1", "/h1","","", "o", "Ca", "/", "[", "]", "(", ")", "&", "%", "l", "n't", "t", "T", "S"]

 path = "static/files/"
 for path, subdirs, files in os.walk(path):
@@ -28,7 +29,9 @@ keyword_list = []
 with open('allhtml.txt') as f:
     content = f.read()
-    tokens = word_tokenize(content)
+    # tokens = word_tokenize(content)
+    tokens = re.compile("(?!-)[\W]+").split(content)
+    tokens.remove("")
     tokens = [token for token in tokens if token not in stopws]
     keyword_list = list(set(tokens))
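`word_tokenize` leaves artifacts such as `n't` and lone letters, which is plausibly why the stopword list above had to collect them one by one. The replacement pattern splits on any run of non-word characters unless the run starts with a hyphen, so hyphenated terms come through as single tokens. A quick illustration of how the pattern behaves:

```python
import re

content = "Don't split free-software tools, e.g. grep and sed."

tokens = re.compile(r"(?!-)[\W]+").split(content)
# A delimiter at the very start or end of the text leaves an empty
# string in the result; remove("") drops the first one it finds.
if "" in tokens:
    tokens.remove("")

print(tokens)
# ['Don', 't', 'split', 'free-software', 'tools', 'e', 'g', 'grep', 'and', 'sed']
```

Note that the `(?!-)` lookahead only stops a hyphen from starting a delimiter run; a hyphen preceded by whitespace or punctuation is still consumed as part of the run.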
@@ -45,15 +48,6 @@ def analysis(the_word, file_name):
     content = f.read()
     sent_tokens = sent_tokenize(content)
     new_sent_tokens = []
-    if the_word[0]=="'":
-        the_word = the_word.replace(the_word[0], "")
-    for i in range(0, len(the_word)):
-        if the_word.endswith("."):
-            the_word = the_word[0:-1]
-        else:
-            break
-    if the_word=="Timidity++":
-        the_word = the_word = the_word[0:-2]
     re_word = r"\b" + re.escape(the_word) + r"\b"
     for sent_token in sent_tokens:
         if re.search(re_word, sent_token):
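The nine deleted lines were compensating for `word_tokenize` artifacts: keywords arriving with a leading `'`, trailing dots, or, in one hard-coded case, `Timidity++`. The new splitter only ever emits word characters and hyphens, so that cleanup is dead code and the whole-word search stands on its own. A stripped-down sketch of that search, using a hypothetical helper that takes sentences directly instead of a file name (assumes NLTK's punkt data is installed):

```python
import re
from nltk import sent_tokenize

def matching_sentences(the_word, sentences):
    # \b anchors make the match whole-word: "cat" will not match
    # inside "Concatenation".
    re_word = r"\b" + re.escape(the_word) + r"\b"
    return [s for s in sentences if re.search(re_word, s)]

content = "The cat sat on the mat. Concatenation joins strings."
print(matching_sentences("cat", sent_tokenize(content)))
# ['The cat sat on the mat.']
```

The `Timidity++` special case existed because `\b` after a `+` can never match (neither side is a word character); since the new tokenizer can no longer produce a keyword ending in `+`, the pattern is safe as-is.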
@@ -67,7 +61,7 @@ def analysis(the_word, file_name):
 path = "static/files/"
-for path, subdirs, files in os.walk(path):
+for path, subdirs, files in tqdm(os.walk(path)):
     for name in files:
         if name.endswith('html'):
             file = os.path.join(path, name)
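`os.walk` is a generator with no length, so `tqdm` here shows a running count of directories visited (with a rate) rather than a percentage bar; that is still enough to confirm the crawl is moving. The same wrapping in isolation:

```python
import os
from tqdm import tqdm

path = "static/files/"
# tqdm wraps any iterable; with no known total it prints an
# updating count instead of a percentage bar.
for path, subdirs, files in tqdm(os.walk(path)):
    for name in files:
        if name.endswith('html'):
            print(os.path.join(path, name))
```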