PushingScores/textedit.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys, os
from nltk import sent_tokenize, word_tokenize
from nltk import everygrams
from nltk import FreqDist
import json
import re
"""
PART 1
We create the dictionary and save it.
"""
stopws = [",", ".", "?","!",":","(",")",">","<","@","#","``","/","","''","","-","", "DOCTYPE", "html", "!", "'", "<br>", "<br />", "/body", "/html", "/head", "h2", "/h2", "h1", "/h1","",""]
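# Note: allhtml.txt is opened in append mode below, so its contents accumulate
# across runs. A minimal sketch of one way to start from a clean file each run
# (kept commented out; whether overwriting the file is acceptable is an assumption):
# if os.path.exists("allhtml.txt"):
#     os.remove("allhtml.txt")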
path = "static/files/"
for path, subdirs, files in os.walk(path):
    for name in files:
        if name.endswith('html'):
            file = os.path.join(path, name)
            total = open("allhtml.txt", "a")
            with open(file) as f:
                content = f.read()
                total.write(content)
            total.close()
keyword_list = []
# with open('allhtml.txt') as f:
#     content = f.read()
#     tokens = word_tokenize(content)
#     tokens = [token for token in tokens if token not in stopws]
#     freq_file = FreqDist(tokens)
#     print(tokens)
#     keyword_list.append(freq_file.most_common(50))
#     print(keyword_list[0])
with open('allhtml.txt') as f:
    content = f.read()
    tokens = word_tokenize(content)
    tokens = [token for token in tokens if token not in stopws]
    keyword_list = list(set(tokens))
    # print(tokens)
    # print(keyword_list)
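# FreqDist is imported above but only used in the commented-out experiment. A
# minimal sketch of how it could narrow keyword_list to the most frequent tokens
# (the cutoff of 50 is an assumption; kept commented out so the behaviour above
# is unchanged):
# freq = FreqDist(tokens)
# keyword_list = [token for token, count in freq.most_common(50)]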
"""
PART 2
We iterate through the entire collection of html files, tokenize the words, and check to see whether any of them is in the keyword_list. If they are, then we generate a json file.
"""
# wordlist = {}
# avoiding_repetition = []
# maps each keyword to a list of {'id': ..., 'sentence': ...} entries
sentences_w_word = {}

def analysis(the_word, file_name):
    # the two characters after "static/files/" give the two-digit folder id
    id = file_name[13:15]
    with open(file_name) as f:
        content = f.read()
        sent_tokens = sent_tokenize(content)
        new_sent_tokens = []
        for sent_token in sent_tokens:
            if the_word in sent_token:
                new_sent_tokens.append({'id': id, 'sentence': sent_token.replace('\n', ' ').strip("'<>()")})
    if the_word in sentences_w_word:  # if this is not the first file for this word
        previous_sent_tokens = sentences_w_word[the_word]
        full_sent_tokens = previous_sent_tokens + new_sent_tokens
    else:
        full_sent_tokens = new_sent_tokens
    sentences_w_word[the_word] = full_sent_tokens
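# Note: the loop below calls analysis() once per (file, keyword) pair, so every
# file is re-read and re-tokenized len(keyword_list) times. A minimal sketch of
# tokenizing each file once and scanning its sentences for all keywords (a
# possible optimisation, not what this script does; analyse_file is a
# hypothetical name and the sketch is kept commented out):
# def analyse_file(file_name):
#     id = file_name[13:15]
#     with open(file_name) as f:
#         sentences = sent_tokenize(f.read())
#     for sentence in sentences:
#         for kw in keyword_list:
#             if kw in sentence:
#                 sentences_w_word.setdefault(kw, []).append(
#                     {'id': id, 'sentence': sentence.replace('\n', ' ').strip("'<>()")})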
path = "static/files/"
for path, subdirs, files in os.walk(path):
    for name in files:
        if name.endswith('html'):
            file = os.path.join(path, name)
            for word in keyword_list:
                analysis(word, file)

with open('wordlist.json', 'w', encoding="utf8") as outfile:
    json.dump(sentences_w_word, outfile, ensure_ascii=False)
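# wordlist.json maps each keyword to a list of {"id": ..., "sentence": ...}
# objects. A minimal sketch of reading it back for a quick check (kept commented
# out; the keyword "score" is only an example and may not exist in the data):
# with open('wordlist.json', encoding="utf8") as infile:
#     wordlist_check = json.load(infile)
# for entry in wordlist_check.get("score", []):
#     print(entry["id"], entry["sentence"])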
# def analysis(file, id):
#     sent_tokens = sent_tokenize(file)  # sentence tokenizing
#     for sent_token in sent_tokens:
#         tokens = word_tokenize(sent_token)  # word tokenizing
#         print(tokens)
#         for token in tokens:
#             for first in keyword_list:
#                 if token == first:  # if token is in keyword_list
#                     if token not in wordlist:
#                         wordlist[token] = []
#                         sent_dict = {}
#                         sent_dict["id"] = id
#                         sent_dict["sentence"] = sent_token.replace('\n', ' ')
#                         wordlist[token].append(sent_dict)
#                     elif token not in avoiding_repetition:
#                         # print(wordlist[token])
#                         sent_dict = {}
#                         sent_dict["id"] = id
#                         sent_dict["sentence"] = sent_token.replace('\n', ' ')
#                         wordlist[token].append(sent_dict)
#                     avoiding_repetition.append(token)

# with open('static/files/17/17.blurb.html') as f:
#     content = f.read()
#     analysis(content, '17')

# # reading each individual html file
# path = "static/files/"
# for path, subdirs, files in os.walk(path):
#     for name in files:
#         if name.endswith('html'):
#             file = os.path.join(path, name)
#             with open(file) as f:
#                 content = f.read()
#                 id = name[:2]
#                 analysis(content, id)

# json_wordlist = json.dumps(wordlist)
# for item in wordlist:
#     for item2 in wordlist[item]:
#         print(item)
#         print(item2["sentence"])
#         print("\n")