PushingScores/textedit.py


								#!/usr/bin/env python

								# -*- coding: utf-8 -*-

								import sys, os

								from nltk import sent_tokenize, word_tokenize

								from nltk import everygrams

								from nltk import FreqDist

								import json

								import re


								"""

								PART 1

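								We concatenate every HTML file under static/files/ into allhtml.txt, then build the keyword list: the distinct tokens of that text, minus the stop tokens.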

								"""


								# Tokens that are filtered out before the keyword list is built.
								# Order and duplicates (the second "!") are kept exactly as they were.
								stopws = [
								    # punctuation and symbols
								    ",", ".", "?", "!", ":", "(", ")", ">", "<", "@", "#",
								    # quote, slash and dash tokens
								    "``", "/", "–", "''", "‘", "-", "’",
								    # markup leftovers
								    "DOCTYPE", "html", "!", "'",
								    "<br>", "<br />", "/body", "/html", "/head",
								    "h2", "/h2", "h1", "/h1",
								    # curly double quotes
								    "”", "“",
								]


								# Concatenate the text of every file whose name ends in 'html' found
								# under static/files/ into allhtml.txt.
								# The output is opened in append mode, so each file's content is added
								# to whatever allhtml.txt already holds.
								path = "static/files/"
								for path, subdirs, files in os.walk(path):
								    for name in files:
								        if name.endswith('html'):
								            file = os.path.join(path, name)
								            # Both handles live in one `with`, so the output file is
								            # closed even when opening or reading the source raises.
								            with open("allhtml.txt", "a") as total, open(file) as f:
								                content = f.read()
								                total.write(content)


								# Starts empty; rebound further down to the list of distinct tokens.
								keyword_list = []


								# with open('allhtml.txt') as f:

								#     content = f.read()

								#     tokens = word_tokenize(content)

								#     tokens = [token for token in tokens if token not in stopws]

								#     freq_file=FreqDist(tokens)

								#     print(tokens)

								#     keyword_list.append(freq_file.most_common(50))

								#     print(keyword_list[0])


								# Load the concatenated corpus; the handle is only needed for the read.
								with open('allhtml.txt') as f:
								    content = f.read()

								# Tokenize, drop the stop tokens, then keep one copy of each token.
								tokens = [tok for tok in word_tokenize(content) if tok not in stopws]
								keyword_list = list(set(tokens))


								"""

								PART 2

								We iterate through the entire collection of HTML files, split each file into sentences, and, for every word in keyword_list, collect the sentences that contain it. The result, keyed by word, is written to wordlist.json.

								"""


								# wordlist = {}

								# avoiding_repetition = []


								# Maps a word to a list of {'id': ..., 'sentence': ...} dicts; filled by analysis().
								sentences_w_word = {}


								def analysis(the_word, file_name):
								    """Record every sentence of one file that contains ``the_word``.

								    The file is read and split into sentences with ``sent_tokenize``.
								    Each sentence containing ``the_word`` (plain substring test) is
								    stored, with newlines replaced by spaces and the characters
								    ``'<>()`` stripped from both ends, in the module-level
								    ``sentences_w_word`` dict under the key ``the_word``.

								    Results accumulate across calls, so calling this once per file
								    builds the full list for a word. The key is created even when the
								    file has no matching sentence.

								    Args:
								        the_word: Word to look for.
								        file_name: Path of the file to scan.

								    Returns:
								        None. The matches are stored in ``sentences_w_word``.
								    """
								    # 13 == len("static/files/"), so this takes the two characters that
								    # follow that prefix. Assumes paths look like static/files/NN/...
								    # with a two-character directory id -- TODO confirm.
								    doc_id = file_name[13:15]

								    with open(file_name) as f:
								        content = f.read()

								    matches = [
								        {'id': doc_id, 'sentence': sentence.replace('\n', ' ').strip("'<>()")}
								        for sentence in sent_tokenize(content)
								        if the_word in sentence
								    ]

								    # Key on the parameter itself. The previous code keyed on a global
								    # named `word`, which only worked when the caller's loop variable
								    # happened to have that name.
								    sentences_w_word.setdefault(the_word, []).extend(matches)


								# Run analysis() for every (keyword, file) pair, over each file under
								# static/files/ whose name ends in 'html'.
								path = "static/files/"
								for path, subdirs, files in os.walk(path):
								    for name in files:
								        if not name.endswith('html'):
								            continue
								        file = os.path.join(path, name)
								        for word in keyword_list:
								            analysis(word, file)


								# Write the collected sentences as UTF-8 JSON; ensure_ascii=False keeps
								# non-ASCII characters as they are instead of \u escapes.
								with open('wordlist.json', 'w', encoding="utf8") as outfile:
								    outfile.write(json.dumps(sentences_w_word, ensure_ascii=False))


								# def analysis(file, id):

								#     sent_tokens = sent_tokenize(file) # sentence tokenizing

								#     for sent_token in sent_tokens:

								#         tokens = word_tokenize(sent_token) # word tokenizing

								#         print(tokens)

								#         for token in tokens:

								#             for first in keyword_list:

								#                 if token == first: # if token is in keyword_list

								#                     if token not in wordlist:

								#                         wordlist[token] = []

								#                         sent_dict = {}

								#                         sent_dict["id"]=id

								#                         sent_dict["sentence"] = sent_token.replace('\n', ' ')

								#                         wordlist[token].append(sent_dict)

								#                     elif token not in avoiding_repetition:

								#                         # print(wordlist[token])

								#                         sent_dict = {}

								#                         sent_dict["id"]=id

								#                         sent_dict["sentence"] = sent_token.replace('\n', ' ')

								#                         wordlist[token].append(sent_dict)

								#                         avoiding_repetition.append(token)


								# with open('static/files/17/17.blurb.html') as f:

								#     content = f.read()

								#     analysis(content, '17')


								# # reading each individual html file

								# path = "static/files/"

								# for path, subdirs, files in os.walk(path):

								#     for name in files:

								#         if name.endswith('html'):

								#             file = os.path.join(path, name)

								#             with open(file) as f:

								#                 content = f.read()

								#             id=name[:2]

								#             analysis(content, id)


								# json_wordlist = json.dumps(wordlist)

								# for item in wordlist:

								#     for item2 in wordlist[item]:

								#         print(item)

								#         print(item2["sentence"])

								#         print("\n")