Fix bugs in generate_links: replace the per-word re.sub pass with a single split-and-rejoin pass that wraps known words in links
parent ff882b1e08
commit 5ebd19852c
@@ -13,11 +13,18 @@ for path, subdirs, files in os.walk(path):
             file = os.path.join(path, name)
             with open(file, 'r+', encoding="utf-8") as f:
                 textfile = f.read()
-                for word in wordlist_dict:
-                    word = re.escape(word)
-                    textfile = re.sub(r"(?<!<)(?<!</)(?<!ge\?)\b(%s)\b" %word, r"<a href='/diverge?search=\1'>\1</a>", textfile)
+                words = re.compile("([\w-]+)").split(textfile)
+                words_to_search = wordlist_dict.keys()
+                for i, word in enumerate(words):
+                    if word in words_to_search:
+                        words[i] = "<a href='/diverge?search={}'>{}</a>".format(word, word)
+
+                textfile = "".join(words)
+                # for word in wordlist_dict:
+                #     word = re.escape(word)
+                #     textfile = re.sub(r"(?<!<)(?<!</)(?<!ge\?)\b(%s)\b" %word, r"<a href='/diverge?search=\1'>\1</a>", textfile)
                 f.truncate(0)
                 f.write(textfile)
                 f.truncate()
 
-                # print(textfile)
+                print(textfile)
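For context, the new approach tokenizes the page once and rejoins it, instead of running one re.sub per dictionary word. A minimal standalone sketch of that technique, with a toy wordlist_dict standing in for the one the script builds elsewhere:

import re

# Toy stand-in for the project's wordlist_dict; only the keys matter here.
wordlist_dict = {"diverge": [], "fork": []}

textfile = "<p>Projects diverge and fork over time.</p>"

# The capturing group makes split() keep both the word tokens and the
# separators between them, so "".join(words) reproduces the page exactly.
words = re.compile(r"([\w-]+)").split(textfile)
words_to_search = wordlist_dict.keys()

for i, word in enumerate(words):
    if word in words_to_search:
        words[i] = "<a href='/diverge?search={}'>{}</a>".format(word, word)

textfile = "".join(words)
print(textfile)
# <p>Projects <a href='/diverge?search=diverge'>diverge</a> and <a href='/diverge?search=fork'>fork</a> over time.</p>

One trade-off worth noting: the old pattern's lookbehinds ((?<!<), (?<!</), (?<!ge\?)) kept matches out of tag names and already-inserted links, and the split-based pass drops those guards, so a second run over the same file would also wrap the words inside the hrefs it inserted earlier.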
textedit.py (20 changed lines)
@@ -1,16 +1,17 @@
 import sys, os
-from nltk import sent_tokenize, word_tokenize
+from nltk import sent_tokenize
 from nltk import everygrams
 from nltk import FreqDist
 import json
 import re
+from tqdm import tqdm
 
 """
 PART 1
 We create the dictionary and save it.
 """
 
-stopws = [",", ".", "?","!",":","(",")",">","<","@","#","``","/","–","''","‘","-","’", "DOCTYPE", "html", "!", "'", "<br>", "<br />", "/body", "/html", "/head", "h2", "/h2", "h1", "/h1","”","“", "o", "Ca", "/", "[", "]", "(", ")", "&", "%", "l", "n't", "t", "T", "S", "—", "Ca", "M", "Q", "A"]
+stopws = [",", ".", "?","!",":","(",")",">","<","@","#","``","/","–","''","‘","-","’", "DOCTYPE", "html", "!", "'", "<br>", "<br />", "/body", "/html", "/head", "h2", "/h2", "h1", "/h1","”","“", "o", "Ca", "/", "[", "]", "(", ")", "&", "%", "l", "n't", "t", "T", "S"]
 
 path = "static/files/"
 for path, subdirs, files in os.walk(path):
@@ -28,7 +29,9 @@ keyword_list = []
 
 with open('allhtml.txt') as f:
     content = f.read()
-    tokens = word_tokenize(content)
+    # tokens = word_tokenize(content)
+    tokens = re.compile("(?!-)[\W]+").split(content)
+    tokens.remove("")
     tokens = [token for token in tokens if token not in stopws]
     keyword_list = list(set(tokens))
 
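The replacement tokenizer is just a regex split; a small sketch of how it behaves on HTML-ish input (the sample string is mine, not from allhtml.txt):

import re

content = "self-hosted servers, <br /> and the like."

# Split on runs of non-word characters unless the run starts with a hyphen,
# which keeps hyphenated tokens such as "self-hosted" intact.
tokens = re.compile(r"(?!-)[\W]+").split(content)

# A trailing separator leaves one empty string at the end of the split;
# remove() drops exactly that one occurrence.
tokens.remove("")
print(tokens)  # ['self-hosted', 'servers', 'br', 'and', 'the', 'like']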
@@ -45,15 +48,6 @@ def analysis(the_word, file_name):
     content = f.read()
     sent_tokens = sent_tokenize(content)
     new_sent_tokens = []
-    if the_word[0]=="'":
-        the_word = the_word.replace(the_word[0], "")
-    for i in range(0, len(the_word)):
-        if the_word.endswith("."):
-            the_word = the_word[0:-1]
-        else:
-            break
-    if the_word=="Timidity++":
-        the_word = the_word = the_word[0:-2]
     re_word = r"\b" + re.escape(the_word) + r"\b"
     for sent_token in sent_tokens:
         if re.search(re_word, sent_token):
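The deleted lines hand-trimmed leading quotes, trailing dots, and a Timidity++ special case before building the pattern; the surviving code leans on re.escape plus \b anchors instead. A minimal sketch of that matching step (the helper name sentences_with is mine; the script's real function is analysis):

import re
from nltk import sent_tokenize

def sentences_with(the_word, content):
    # re.escape makes metacharacters like "+" match literally, and the
    # \b anchors keep "walk" from matching inside "walked".
    re_word = r"\b" + re.escape(the_word) + r"\b"
    return [s for s in sent_tokenize(content) if re.search(re_word, s)]

print(sentences_with("walk", "We walk daily. They walked home."))
# ['We walk daily.']

(With the regex tokenizer above, "+" never survives into keyword_list, which is presumably why the Timidity++ special case could go.)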
@@ -67,7 +61,7 @@ def analysis(the_word, file_name):
 
 
 path = "static/files/"
-for path, subdirs, files in os.walk(path):
+for path, subdirs, files in tqdm(os.walk(path)):
     for name in files:
         if name.endswith('html'):
             file = os.path.join(path, name)
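Finally, the tqdm wrapper changes only reporting, not traversal. A standalone sketch (using root instead of rebinding path, as the original loop does):

import os
from tqdm import tqdm

path = "static/files/"

# os.walk() is a generator with no length, so tqdm shows a running count
# of visited directories rather than a percentage bar.
for root, subdirs, files in tqdm(os.walk(path)):
    for name in files:
        if name.endswith('html'):
            print(os.path.join(root, name))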