changes to fix bugs in generate_links

Cristina Cochior 2019-07-12 17:31:34 +02:00
parent ff882b1e08
commit 5ebd19852c
2 changed files with 18 additions and 17 deletions

View File

@@ -13,11 +13,18 @@ for path, subdirs, files in os.walk(path):
             file = os.path.join(path, name)
             with open(file, 'r+', encoding="utf-8") as f:
                 textfile = f.read()
-                for word in wordlist_dict:
-                    word = re.escape(word)
-                    textfile = re.sub(r"(?<!<)(?<!</)(?<!ge\?)\b(%s)\b" %word, r"<a href='/diverge?search=\1'>\1</a>", textfile)
+                words = re.compile("([\w-]+)").split(textfile)
+                words_to_search = wordlist_dict.keys()
+                for i, word in enumerate(words):
+                    if word in words_to_search:
+                        words[i] = "<a href='/diverge?search={}'>{}</a>".format(word, word)
+                textfile = "".join(words)
+                # for word in wordlist_dict:
+                #     word = re.escape(word)
+                #     textfile = re.sub(r"(?<!<)(?<!</)(?<!ge\?)\b(%s)\b" %word, r"<a href='/diverge?search=\1'>\1</a>", textfile)
                 f.truncate(0)
                 f.write(textfile)
                 f.truncate()
-                # print(textfile)
+                print(textfile)
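For reference, a minimal standalone sketch of the new link-wrapping logic in this hunk; the sample text and the contents of wordlist_dict below are assumed for illustration only. Splitting on the grouped pattern ([\w-]+) keeps the separators in the resulting list, so joining it back together reproduces the file with only exact keyword tokens wrapped, instead of running one re.sub pass over the whole file per keyword.

import re

# assumed sample data; in the script wordlist_dict comes from the saved dictionary
wordlist_dict = {"diverge": {}, "software": {}}
textfile = "<p>Software projects diverge over time.</p>"

# split on words (hyphens stay inside them); the capture group makes split() keep the separators
words = re.compile(r"([\w-]+)").split(textfile)
words_to_search = wordlist_dict.keys()

for i, word in enumerate(words):
    if word in words_to_search:  # exact, case-sensitive token match against the keyword dictionary
        words[i] = "<a href='/diverge?search={}'>{}</a>".format(word, word)

textfile = "".join(words)
print(textfile)
# <p>Software projects <a href='/diverge?search=diverge'>diverge</a> over time.</p>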

View File

@@ -1,16 +1,17 @@
 import sys, os
-from nltk import sent_tokenize, word_tokenize
+from nltk import sent_tokenize
 from nltk import everygrams
 from nltk import FreqDist
 import json
 import re
+from tqdm import tqdm
 """
 PART 1
 We create the dictionary and save it.
 """
-stopws = [",", ".", "?","!",":","(",")",">","<","@","#","``","/","","''","","-","", "DOCTYPE", "html", "!", "'", "<br>", "<br />", "/body", "/html", "/head", "h2", "/h2", "h1", "/h1","","", "o", "Ca", "/", "[", "]", "(", ")", "&", "%", "l", "n't", "t", "T", "S", "", "Ca", "M", "Q", "A"]
+stopws = [",", ".", "?","!",":","(",")",">","<","@","#","``","/","","''","","-","", "DOCTYPE", "html", "!", "'", "<br>", "<br />", "/body", "/html", "/head", "h2", "/h2", "h1", "/h1","","", "o", "Ca", "/", "[", "]", "(", ")", "&", "%", "l", "n't", "t", "T", "S"]
 path = "static/files/"
 for path, subdirs, files in os.walk(path):
@@ -28,7 +29,9 @@ keyword_list = []
 with open('allhtml.txt') as f:
     content = f.read()
-    tokens = word_tokenize(content)
+    # tokens = word_tokenize(content)
+    tokens = re.compile("(?!-)[\W]+").split(content)
+    tokens.remove("")
     tokens = [token for token in tokens if token not in stopws]
 keyword_list = list(set(tokens))
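A small sketch of what the new tokenizer produces, on an invented sample string. The pattern (?!-)[\W]+ splits on runs of non-word characters but never starts a match at a hyphen, so hyphenated terms stay whole; word_tokenize, by contrast, also emitted the punctuation itself as tokens, which appears to be what much of the stopws list was filtering out.

import re

content = "free software, open-source tools."   # invented example

tokens = re.compile(r"(?!-)[\W]+").split(content)
print(tokens)      # ['free', 'software', 'open-source', 'tools', '']
tokens.remove("")  # the trailing "." leaves one empty string; remove() deletes the first occurrence
print(tokens)      # ['free', 'software', 'open-source', 'tools']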
@@ -45,15 +48,6 @@ def analysis(the_word, file_name):
     content = f.read()
     sent_tokens = sent_tokenize(content)
     new_sent_tokens = []
-    if the_word[0]=="'":
-        the_word = the_word.replace(the_word[0], "")
-    for i in range(0, len(the_word)):
-        if the_word.endswith("."):
-            the_word = the_word[0:-1]
-        else:
-            break
-    if the_word=="Timidity++":
-        the_word = the_word = the_word[0:-2]
     re_word = r"\b" + re.escape(the_word) + r"\b"
     for sent_token in sent_tokens:
         if re.search(re_word, sent_token):
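With the ad-hoc trimming of leading quotes, trailing dots, and the Timidity++ special case removed, the sentence search relies on re.escape plus word boundaries alone. A minimal sketch of that matching step on invented content (sent_tokenize needs NLTK's punkt data):

import re
from nltk import sent_tokenize

content = "Forks diverge. We measured divergence rates."   # invented example
the_word = "diverge"

re_word = r"\b" + re.escape(the_word) + r"\b"   # escape any regex metacharacters in the keyword
new_sent_tokens = [s for s in sent_tokenize(content) if re.search(re_word, s)]
print(new_sent_tokens)   # ['Forks diverge.'] -- "divergence" fails the trailing \b boundary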
@@ -67,7 +61,7 @@ def analysis(the_word, file_name):
 path = "static/files/"
-for path, subdirs, files in os.walk(path):
+for path, subdirs, files in tqdm(os.walk(path)):
     for name in files:
         if name.endswith('html'):
             file = os.path.join(path, name)
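Wrapping the walk in tqdm only adds a progress readout while the HTML files are processed. Since os.walk is a generator with no known length, tqdm shows a running count and rate rather than a percentage bar; a sketch of the pattern, using the same path as the script:

import os
from tqdm import tqdm

path = "static/files/"
for path, subdirs, files in tqdm(os.walk(path)):   # one tick per directory visited
    for name in files:
        if name.endswith('html'):
            file = os.path.join(path, name)
            # ... analyse each file, as above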