
changes to fix bugs in generate_links

Branch: master
Cristina Cochior, 5 years ago
Commit 5ebd19852c

Changed files:
  1. generate_links.py (15 lines changed)
  2. textedit.py (20 lines changed)

generate_links.py (15 lines changed)

@@ -13,11 +13,18 @@ for path, subdirs, files in os.walk(path):
             file = os.path.join(path, name)
             with open(file, 'r+', encoding="utf-8") as f:
                 textfile = f.read()
-                for word in wordlist_dict:
-                    word = re.escape(word)
-                    textfile = re.sub(r"(?<!<)(?<!</)(?<!ge\?)\b(%s)\b" %word, r"<a href='/diverge?search=\1'>\1</a>", textfile)
+                words = re.compile("([\w-]+)").split(textfile)
+                words_to_search = wordlist_dict.keys()
+                for i, word in enumerate(words):
+                    if word in words_to_search:
+                        words[i] = "<a href='/diverge?search={}'>{}</a>".format(word, word)
+                textfile = "".join(words)
+
+                # for word in wordlist_dict:
+                #     word = re.escape(word)
+                #     textfile = re.sub(r"(?<!<)(?<!</)(?<!ge\?)\b(%s)\b" %word, r"<a href='/diverge?search=\1'>\1</a>", textfile)

                 f.truncate(0)
                 f.write(textfile)
                 f.truncate()
-                # print(textfile)
+                print(textfile)
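
The rewritten pass splits each file once and only rewrites tokens that are keys of wordlist_dict, instead of running one re.sub over the whole file per dictionary word. A minimal sketch of that logic on an in-memory string (the wordlist_dict entries and the sample text below are placeholders; the real script builds wordlist_dict elsewhere and walks the HTML files on disk):

    import re

    wordlist_dict = {"diverge": None, "software": None}   # hypothetical stand-in contents
    textfile = "Software projects diverge over time."     # stands in for one HTML file

    # Splitting with a capturing group keeps the separators in the list,
    # so "".join(words) reproduces the original text exactly.
    words = re.compile(r"([\w-]+)").split(textfile)
    words_to_search = wordlist_dict.keys()

    for i, word in enumerate(words):
        if word in words_to_search:   # exact, case-sensitive match against dictionary keys
            words[i] = "<a href='/diverge?search={}'>{}</a>".format(word, word)

    textfile = "".join(words)
    print(textfile)
    # Software projects <a href='/diverge?search=diverge'>diverge</a> over time.

One pass over the token list replaces the old per-word loop, which re-scanned the full file with re.sub once for every dictionary entry.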

textedit.py (20 lines changed)

@@ -1,16 +1,17 @@
 import sys, os
-from nltk import sent_tokenize, word_tokenize
+from nltk import sent_tokenize
 from nltk import everygrams
 from nltk import FreqDist
 import json
 import re
+from tqdm import tqdm

 """
 PART 1
 We create the dictionary and save it.
 """

-stopws = [",", ".", "?","!",":","(",")",">","<","@","#","``","/","","''","","-","", "DOCTYPE", "html", "!", "'", "<br>", "<br />", "/body", "/html", "/head", "h2", "/h2", "h1", "/h1","","", "o", "Ca", "/", "[", "]", "(", ")", "&", "%", "l", "n't", "t", "T", "S", "", "Ca", "M", "Q", "A"]
+stopws = [",", ".", "?","!",":","(",")",">","<","@","#","``","/","","''","","-","", "DOCTYPE", "html", "!", "'", "<br>", "<br />", "/body", "/html", "/head", "h2", "/h2", "h1", "/h1","","", "o", "Ca", "/", "[", "]", "(", ")", "&", "%", "l", "n't", "t", "T", "S"]

 path = "static/files/"
 for path, subdirs, files in os.walk(path):

@@ -28,7 +29,9 @@ keyword_list = []
 with open('allhtml.txt') as f:
     content = f.read()
-    tokens = word_tokenize(content)
+    # tokens = word_tokenize(content)
+    tokens = re.compile("(?!-)[\W]+").split(content)
+    tokens.remove("")
     tokens = [token for token in tokens if token not in stopws]
     keyword_list = list(set(tokens))

@@ -45,15 +48,6 @@ def analysis(the_word, file_name):
         content = f.read()
         sent_tokens = sent_tokenize(content)
         new_sent_tokens = []
-        if the_word[0]=="'":
-            the_word = the_word.replace(the_word[0], "")
-        for i in range(0, len(the_word)):
-            if the_word.endswith("."):
-                the_word = the_word[0:-1]
-            else:
-                break
-        if the_word=="Timidity++":
-            the_word = the_word = the_word[0:-2]
         re_word = r"\b" + re.escape(the_word) + r"\b"
         for sent_token in sent_tokens:
             if re.search(re_word, sent_token):

@@ -67,7 +61,7 @@ def analysis(the_word, file_name):
 path = "static/files/"
-for path, subdirs, files in os.walk(path):
+for path, subdirs, files in tqdm(os.walk(path)):
     for name in files:
         if name.endswith('html'):
             file = os.path.join(path, name)
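
The tokenizer change in PART 1 swaps nltk's word_tokenize for a regex split that leaves hyphenated terms intact before the stopword filter runs. A minimal sketch on a sample string (content and stopws here are trimmed stand-ins; the real script reads allhtml.txt and uses the full stopws list):

    import re

    content = "<p>Open-source software, re-used and re-written.</p>"   # stand-in for allhtml.txt
    stopws = ["", "p"]                                                 # trimmed stopword list

    # Split on runs of non-word characters, except where the run would start
    # at a hyphen, so hyphenated terms like "Open-source" stay in one token.
    tokens = re.compile(r"(?!-)[\W]+").split(content)
    tokens.remove("")   # drop the leading empty string the split leaves behind
    tokens = [token for token in tokens if token not in stopws]

    print(tokens)
    # ['Open-source', 'software', 're-used', 'and', 're-written']

The negative lookahead only blocks a split that would begin at a hyphen, which is enough to keep hyphenated keywords whole; stray markup tokens are what the long stopws list is there to filter out. The tqdm wrapper around os.walk in the last hunk just adds a progress bar over the same directory walk.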
