# PushingScores/textedit.py
import json
import os
import re
import sys

from nltk import FreqDist, everygrams, sent_tokenize
from tqdm import tqdm
2019-04-17 18:36:12 +02:00
2019-05-04 16:27:50 +02:00
"""
PART 1
We create the dictionary and save it.
"""
2019-04-17 18:36:12 +02:00
2019-07-12 17:31:34 +02:00
stopws = [",", ".", "?","!",":","(",")",">","<","@","#","``","/","","''","","-","", "DOCTYPE", "html", "!", "'", "<br>", "<br />", "/body", "/html", "/head", "h2", "/h2", "h1", "/h1","","", "o", "Ca", "/", "[", "]", "(", ")", "&", "%", "l", "n't", "t", "T", "S"]
2019-04-17 18:36:12 +02:00
path = "static/files/"
for path, subdirs, files in os.walk(path):
for name in files:
if name.endswith('html'):
file = os.path.join(path, name)
2019-05-04 16:27:50 +02:00
total = open("allhtml.txt", "a")
2019-05-21 11:54:11 +02:00
with open(file, 'r+') as f:
2019-04-17 18:36:12 +02:00
content = f.read()
2019-05-04 16:27:50 +02:00
total.write(content)
total.close()
keyword_list = []
with open('allhtml.txt') as f:
content = f.read()
2019-07-12 17:31:34 +02:00
# tokens = word_tokenize(content)
tokens = re.compile("(?!-)[\W]+").split(content)
tokens.remove("")
2019-05-04 16:27:50 +02:00
tokens = [token for token in tokens if token not in stopws]
keyword_list = list(set(tokens))
"""
PART 2
We iterate through the entire collection of html files, tokenize the words, and check to see whether any of them is in the keyword_list. If they are, then we generate a json file.
"""
sentences_w_word = {}
def analysis(the_word, file_name):
id = file_name[13:15]
2019-05-21 11:54:11 +02:00
with open(file_name, 'r+') as f:
2019-05-04 16:27:50 +02:00
content = f.read()
sent_tokens = sent_tokenize(content)
new_sent_tokens = []
2019-05-31 02:11:51 +02:00
re_word = r"\b" + re.escape(the_word) + r"\b"
2019-05-04 16:27:50 +02:00
for sent_token in sent_tokens:
2019-05-31 10:02:01 +02:00
if re.search(re_word, sent_token):
2019-05-21 11:54:11 +02:00
new_sent_tokens.append({'id': id, 'sentence': sent_token.replace('\n', ' ').strip("'<>()“”")})
2019-05-04 16:27:50 +02:00
if the_word in sentences_w_word: # if this is not the first iteration
previous_sent_tokens = sentences_w_word[the_word]
full_sent_tokens = previous_sent_tokens + new_sent_tokens
else:
full_sent_tokens = new_sent_tokens
2019-05-21 11:54:11 +02:00
sentences_w_word[the_word] = full_sent_tokens
2019-05-04 16:27:50 +02:00
path = "static/files/"
2019-07-12 17:31:34 +02:00
for path, subdirs, files in tqdm(os.walk(path)):
2019-05-04 16:27:50 +02:00
for name in files:
if name.endswith('html'):
file = os.path.join(path, name)
for word in keyword_list:
analysis(word, file)
2019-05-21 11:54:11 +02:00
with open('wordlist.json', 'w') as outfile:
2019-05-04 16:27:50 +02:00
json.dump(sentences_w_word, outfile, ensure_ascii=False)