changes to fix bugs in generate_links

commit 5ebd19852c  (parent: ff882b1e08)
@@ -13,11 +13,18 @@ for path, subdirs, files in os.walk(path):
             file = os.path.join(path, name)
             with open(file, 'r+', encoding="utf-8") as f:
                 textfile = f.read()
-                for word in wordlist_dict:
-                    word = re.escape(word)
-                    textfile = re.sub(r"(?<!<)(?<!</)(?<!ge\?)\b(%s)\b" %word, r"<a href='/diverge?search=\1'>\1</a>", textfile)
+                words = re.compile("([\w-]+)").split(textfile)
+                words_to_search = wordlist_dict.keys()
+                for i, word in enumerate(words):
+                    if word in words_to_search:
+                        words[i] = "<a href='/diverge?search={}'>{}</a>".format(word, word)
+
+                textfile = "".join(words)
+                # for word in wordlist_dict:
+                #     word = re.escape(word)
+                #     textfile = re.sub(r"(?<!<)(?<!</)(?<!ge\?)\b(%s)\b" %word, r"<a href='/diverge?search=\1'>\1</a>", textfile)
                 f.truncate(0)
                 f.write(textfile)
                 f.truncate()
 
-                # print(textfile)
+                print(textfile)
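Note on the change above: the rewritten loop splits the text once on word characters (the capturing group keeps the delimiters, so joining the pieces restores the original text) and only wraps whole tokens that are keys of wordlist_dict, instead of running one re.sub per dictionary word over the whole file. A standalone sketch of that idea, using an invented wordlist_dict rather than the repository's real data:

import re

# invented stand-in for the real wordlist_dict that the script loads elsewhere
wordlist_dict = {"diverge": {}, "tokenize": {}}

textfile = "Words can diverge; tokenize them first."

# the capturing group keeps delimiters, so "".join(words) reproduces the text exactly
words = re.compile(r"([\w-]+)").split(textfile)
words_to_search = wordlist_dict.keys()

for i, word in enumerate(words):
    if word in words_to_search:
        words[i] = "<a href='/diverge?search={}'>{}</a>".format(word, word)

print("".join(words))
# Words can <a href='/diverge?search=diverge'>diverge</a>; <a href='/diverge?search=tokenize'>tokenize</a> them first.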
textedit.py  (20 changed lines)
@@ -1,16 +1,17 @@
 import sys, os
-from nltk import sent_tokenize, word_tokenize
+from nltk import sent_tokenize
 from nltk import everygrams
 from nltk import FreqDist
 import json
 import re
+from tqdm import tqdm
 
 """
 PART 1
 We create the dictionary and save it.
 """
 
-stopws = [",", ".", "?","!",":","(",")",">","<","@","#","``","/","–","''","‘","-","’", "DOCTYPE", "html", "!", "'", "<br>", "<br />", "/body", "/html", "/head", "h2", "/h2", "h1", "/h1","”","“", "o", "Ca", "/", "[", "]", "(", ")", "&", "%", "l", "n't", "t", "T", "S", "—", "Ca", "M", "Q", "A"]
+stopws = [",", ".", "?","!",":","(",")",">","<","@","#","``","/","–","''","‘","-","’", "DOCTYPE", "html", "!", "'", "<br>", "<br />", "/body", "/html", "/head", "h2", "/h2", "h1", "/h1","”","“", "o", "Ca", "/", "[", "]", "(", ")", "&", "%", "l", "n't", "t", "T", "S"]
 
 path = "static/files/"
 for path, subdirs, files in os.walk(path):
@@ -28,7 +29,9 @@ keyword_list = []
 
 with open('allhtml.txt') as f:
     content = f.read()
-    tokens = word_tokenize(content)
+    # tokens = word_tokenize(content)
+    tokens = re.compile("(?!-)[\W]+").split(content)
+    tokens.remove("")
     tokens = [token for token in tokens if token not in stopws]
     keyword_list = list(set(tokens))
 
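Note on the change above: word_tokenize is swapped for a plain regex split that strips punctuation and markup characters but keeps hyphens inside tokens, and the empty string left by a leading or trailing delimiter is removed afterwards. An illustrative run on a made-up snippet rather than the real allhtml.txt:

import re

content = "A well-known example: non-word characters split tokens, hyphens do not!"
tokens = re.compile(r"(?!-)[\W]+").split(content)
tokens.remove("")   # drop the empty string left by the trailing delimiter
print(tokens)
# ['A', 'well-known', 'example', 'non-word', 'characters', 'split', 'tokens', 'hyphens', 'do', 'not']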
@@ -45,15 +48,6 @@ def analysis(the_word, file_name):
         content = f.read()
         sent_tokens = sent_tokenize(content)
         new_sent_tokens = []
-        if the_word[0]=="'":
-            the_word = the_word.replace(the_word[0], "")
-        for i in range(0, len(the_word)):
-            if the_word.endswith("."):
-                the_word = the_word[0:-1]
-            else:
-                break
-        if the_word=="Timidity++":
-            the_word = the_word = the_word[0:-2]
         re_word = r"\b" + re.escape(the_word) + r"\b"
         for sent_token in sent_tokens:
             if re.search(re_word, sent_token):
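Note on the change above: the per-word cleanup (stripping a leading apostrophe, trailing dots, and the Timidity++ special case) is dropped; what remains is the escaped, word-boundary-anchored search over the sentence tokens. A small self-contained illustration with invented inputs:

import re

the_word = "node.js"   # invented example containing a regex metacharacter
sent_tokens = ["People write node.js services every day.", "Others prefer Go."]

re_word = r"\b" + re.escape(the_word) + r"\b"   # -> \bnode\.js\b
hits = [s for s in sent_tokens if re.search(re_word, s)]
print(hits)
# ['People write node.js services every day.']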
@@ -67,7 +61,7 @@ def analysis(the_word, file_name):
 
 
 path = "static/files/"
-for path, subdirs, files in os.walk(path):
+for path, subdirs, files in tqdm(os.walk(path)):
     for name in files:
         if name.endswith('html'):
             file = os.path.join(path, name)
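Note on the change above: tqdm(os.walk(path)) only adds a progress readout to the existing loop; since os.walk has no known length, tqdm falls back to a running count of directories visited. Minimal usage sketch:

import os
from tqdm import tqdm

# tqdm wraps any iterable; iteration behaviour is unchanged
for path, subdirs, files in tqdm(os.walk(".")):
    pass  # per-directory work goes here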