PushingScores/textedit.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys, os
from nltk import sent_tokenize, word_tokenize
from nltk import everygrams
from nltk import FreqDist
import json
import re

"""
PART 1
We concatenate every html file into allhtml.txt and build the keyword list from its tokens.
"""

# Tokens that are never treated as keywords: punctuation, quote characters
# and fragments of html markup.
stopws = [
    ",", ".", "?", "!", ":", "(", ")", ">", "<", "@", "#", "``", "/", "–",
    "''", "‘", "-", "’", "DOCTYPE", "html", "!", "'", "<br>", "<br />",
    "/body", "/html", "/head", "h2", "/h2", "h1", "/h1", "”", "“",
]

# Append the text of every html file below static/files/ to allhtml.txt.
# The output file is opened in append mode once per input file, so content
# from earlier runs is kept.
for dirpath, _subdirs, filenames in os.walk("static/files/"):
    for filename in filenames:
        if not filename.endswith('html'):
            continue
        source_path = os.path.join(dirpath, filename)
        with open("allhtml.txt", "a") as combined, open(source_path) as source:
            combined.write(source.read())

# Build the keyword list: every distinct token of the combined html text
# that is not a stop word.  (An earlier, disabled variant kept only the 50
# most common tokens via FreqDist; the full vocabulary is used instead.)
with open('allhtml.txt') as combined:
    all_tokens = word_tokenize(combined.read())

# Going through a set removes duplicates; the resulting order is arbitrary.
keyword_list = list(set(token for token in all_tokens if token not in stopws))

"""
PART 2
We iterate over every html file in the collection, split it into sentences, and record each sentence that contains a word from keyword_list. The collected sentences are then written to wordlist.json.
"""

# wordlist = {}
# avoiding_repetition = []


# Maps each keyword to a list of {'id': ..., 'sentence': ...} records.
sentences_w_word = {}


def analysis(the_word, file_name):
    """Record the sentences of one file that contain ``the_word``.

    The file is read and split into sentences with ``sent_tokenize``. Every
    sentence that contains ``the_word`` as a substring is appended to
    ``sentences_w_word[the_word]`` as ``{'id': ..., 'sentence': ...}``, with
    newlines replaced by spaces and leading/trailing ``'<>()`` characters
    stripped.

    Args:
        the_word: Keyword to look for (plain, case-sensitive substring match).
        file_name: Path of the file to scan.

    Returns:
        None. Results accumulate in the module-level ``sentences_w_word``;
        an entry is created for ``the_word`` even when nothing matches.
    """
    # Characters 13-14 of the path serve as the document id. Presumably the
    # path looks like "static/files/<id>/..." (the prefix is 13 characters
    # long) -- TODO confirm against the directory layout.
    doc_id = file_name[13:15]
    with open(file_name) as f:
        content = f.read()

    matches = [
        {'id': doc_id, 'sentence': sentence.replace('\n', ' ').strip("'<>()")}
        for sentence in sent_tokenize(content)
        if the_word in sentence
    ]

    # Store under the parameter, not under a global named `word`: the old
    # code only worked when the caller's loop variable had that exact name.
    sentences_w_word[the_word] = sentences_w_word.get(the_word, []) + matches


# Run the sentence analysis for every keyword against every html file below
# static/files/, then dump the collected sentences as JSON.
for dirpath, _subdirs, filenames in os.walk("static/files/"):
    for filename in filenames:
        if not filename.endswith('html'):
            continue
        html_path = os.path.join(dirpath, filename)
        # The loop variable must stay named `word`: analysis() reads it.
        for word in keyword_list:
            analysis(word, html_path)

# ensure_ascii=False keeps non-ASCII characters readable in the output file.
with open('wordlist.json', 'w', encoding="utf8") as json_out:
    json.dump(sentences_w_word, json_out, ensure_ascii=False)


# def analysis(file, id):
#     sent_tokens = sent_tokenize(file) # sentence tokenizing
#     for sent_token in sent_tokens:
#         tokens = word_tokenize(sent_token) # word tokenizing
#         print(tokens)
#         for token in tokens:
#             for first in keyword_list:
#                 if token == first: # if token is in keyword_list
#                     if token not in wordlist:
#                         wordlist[token] = []
#                         sent_dict = {}
#                         sent_dict["id"]=id
#                         sent_dict["sentence"] = sent_token.replace('\n', ' ')
#                         wordlist[token].append(sent_dict)
#                     elif token not in avoiding_repetition:
#                         # print(wordlist[token])
#                         sent_dict = {}
#                         sent_dict["id"]=id
#                         sent_dict["sentence"] = sent_token.replace('\n', ' ')
#                         wordlist[token].append(sent_dict)
#                         avoiding_repetition.append(token)


# with open('static/files/17/17.blurb.html') as f:
#     content = f.read()
#     analysis(content, '17')


# # reading each individual html file
# path = "static/files/"
# for path, subdirs, files in os.walk(path):
#     for name in files:
#         if name.endswith('html'):
#             file = os.path.join(path, name)
#             with open(file) as f:
#                 content = f.read()
#             id=name[:2]
#             analysis(content, id)

# json_wordlist = json.dumps(wordlist)
# for item in wordlist:
#     for item2 in wordlist[item]:
#         print(item)
#         print(item2["sentence"])
#         print("\n")
so many changes 6 years ago			`#!/usr/bin/env python`
			`# -- coding: utf-8 --`
added the new python file 6 years ago			`import sys, os`
so many changes 6 years ago			`from nltk import sent_tokenize, word_tokenize`
			`from nltk import everygrams`
			`from nltk import FreqDist`
			`import json`
			`import re`
added the new python file 6 years ago
so many changes 6 years ago			`"""`
			`PART 1`
			`We create the dictionary and save it.`
			`"""`
added the new python file 6 years ago
so many changes 6 years ago			stopws = [",", ".", "?","!",":","(",")",">","<","@","#","``","/","–","''","‘","-","’", "DOCTYPE", "html", "!", "'", "<br>", "<br />", "/body", "/html", "/head", "h2", "/h2", "h1", "/h1","”","“"]
added the new python file 6 years ago
			`path = "static/files/"`
			`for path, subdirs, files in os.walk(path):`
			`for name in files:`
			`if name.endswith('html'):`
			`file = os.path.join(path, name)`
so many changes 6 years ago			`total = open("allhtml.txt", "a")`
added the new python file 6 years ago			`with open(file) as f:`
			`content = f.read()`
so many changes 6 years ago			`total.write(content)`
			`total.close()`

			`keyword_list = []`

			`# with open('allhtml.txt') as f:`
			`# content = f.read()`
			`# tokens = word_tokenize(content)`
			`# tokens = [token for token in tokens if token not in stopws]`
			`# freq_file=FreqDist(tokens)`
			`# print(tokens)`
			`# keyword_list.append(freq_file.most_common(50))`
			`# print(keyword_list[0])`

			`with open('allhtml.txt') as f:`
			`content = f.read()`
			`tokens = word_tokenize(content)`
			`tokens = [token for token in tokens if token not in stopws]`
			`keyword_list = list(set(tokens))`
			`# print(tokens)`
			`# print(keyword_list)`

			`"""`
			`PART 2`
			`We iterate through the entire collection of html files, tokenize the words, and check to see whether any of them is in the keyword_list. If they are, then we generate a json file.`
			`"""`

			`# wordlist = {}`
			`# avoiding_repetition = []`


			`sentences_w_word = {}`

			`def analysis(the_word, file_name):`
			`id = file_name[13:15]`
			`with open(file_name) as f:`
			`content = f.read()`
			`sent_tokens = sent_tokenize(content)`
			`new_sent_tokens = []`
			`for sent_token in sent_tokens:`
			`if the_word in sent_token:`
			`new_sent_tokens.append({'id': id, 'sentence': sent_token.replace('\n', ' ').strip("'<>()")})`
			`if the_word in sentences_w_word: # if this is not the first iteration`
			`previous_sent_tokens = sentences_w_word[the_word]`
			`full_sent_tokens = previous_sent_tokens + new_sent_tokens`
			`else:`
			`full_sent_tokens = new_sent_tokens`
			`sentences_w_word[word] = full_sent_tokens`



			`path = "static/files/"`
			`for path, subdirs, files in os.walk(path):`
			`for name in files:`
			`if name.endswith('html'):`
			`file = os.path.join(path, name)`
			`for word in keyword_list:`
			`analysis(word, file)`

			`with open('wordlist.json', 'w', encoding="utf8") as outfile:`
			`json.dump(sentences_w_word, outfile, ensure_ascii=False)`


			`# def analysis(file, id):`
			`# sent_tokens = sent_tokenize(file) # sentence tokenizing`
			`# for sent_token in sent_tokens:`
			`# tokens = word_tokenize(sent_token) # word tokenizing`
			`# print(tokens)`
			`# for token in tokens:`
			`# for first in keyword_list:`
			`# if token == first: # if token is in keyword_list`
			`# if token not in wordlist:`
			`# wordlist[token] = []`
			`# sent_dict = {}`
			`# sent_dict["id"]=id`
			`# sent_dict["sentence"] = sent_token.replace('\n', ' ')`
			`# wordlist[token].append(sent_dict)`
			`# elif token not in avoiding_repetition:`
			`# # print(wordlist[token])`
			`# sent_dict = {}`
			`# sent_dict["id"]=id`
			`# sent_dict["sentence"] = sent_token.replace('\n', ' ')`
			`# wordlist[token].append(sent_dict)`
			`# avoiding_repetition.append(token)`


			`# with open('static/files/17/17.blurb.html') as f:`
			`# content = f.read()`
			`# analysis(content, '17')`


			`# # reading each individual html file`
			`# path = "static/files/"`
			`# for path, subdirs, files in os.walk(path):`
			`# for name in files:`
			`# if name.endswith('html'):`
			`# file = os.path.join(path, name)`
			`# with open(file) as f:`
			`# content = f.read()`
			`# id=name[:2]`
			`# analysis(content, id)`

			`# json_wordlist = json.dumps(wordlist)`
			`# for item in wordlist:`
			`# for item2 in wordlist[item]:`
			`# print(item)`
			`# print(item2["sentence"])`
			`# print("\n")`