2019-05-04 16:27:50 +02:00
|
|
|
|
#!/usr/bin/env python
|
|
|
|
|
# -*- coding: utf-8 -*-
|
2019-04-17 18:36:12 +02:00
|
|
|
|
import sys, os
|
2019-05-04 16:27:50 +02:00
|
|
|
|
from nltk import sent_tokenize, word_tokenize
|
|
|
|
|
from nltk import everygrams
|
|
|
|
|
from nltk import FreqDist
|
|
|
|
|
import json
|
|
|
|
|
import re
|
2019-04-17 18:36:12 +02:00
|
|
|
|
|
2019-05-04 16:27:50 +02:00
|
|
|
|
"""
|
|
|
|
|
PART 1
|
|
|
|
|
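We concatenate every html file into allhtml.txt and build the keyword list: every distinct token of that text that is not in the stop list.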
|
|
|
|
|
"""
|
2019-04-17 18:36:12 +02:00
|
|
|
|
|
2019-05-04 16:27:50 +02:00
|
|
|
|
# Tokens that are never kept as keywords: punctuation and stray html fragments.
stopws = [",", ".", "?","!",":","(",")",">","<","@","#","``","/","–","''","‘","-","’", "DOCTYPE", "html", "!", "'", "<br>", "<br />", "/body", "/html", "/head", "h2", "/h2", "h1", "/h1","”","“"]

path = "static/files/"

# Append the text of every html file found under `path` to allhtml.txt.
# The output file is opened once and closed by the `with` block even if a read
# fails. Mode "a" is kept on purpose: existing content of allhtml.txt is
# preserved and new text is added after it.
with open("allhtml.txt", "a") as total:
    # `dirpath` (rather than re-using `path`) keeps the root path intact
    # while walking.
    for dirpath, subdirs, files in os.walk(path):
        for name in files:
            if name.endswith('html'):
                file = os.path.join(dirpath, name)
                with open(file) as f:
                    content = f.read()
                total.write(content)

# Filled in further down, once allhtml.txt has been tokenized.
keyword_list = []
|
|
|
|
|
|
|
|
|
|
# with open('allhtml.txt') as f:
|
|
|
|
|
# content = f.read()
|
|
|
|
|
# tokens = word_tokenize(content)
|
|
|
|
|
# tokens = [token for token in tokens if token not in stopws]
|
|
|
|
|
# freq_file=FreqDist(tokens)
|
|
|
|
|
# print(tokens)
|
|
|
|
|
# keyword_list.append(freq_file.most_common(50))
|
|
|
|
|
# print(keyword_list[0])
|
|
|
|
|
|
|
|
|
|
# Read the concatenated text back; the handle is only needed for the read.
with open('allhtml.txt') as f:
    content = f.read()

# Tokenize, drop the stop tokens, then keep one copy of each remaining token.
tokens = [token for token in word_tokenize(content) if token not in stopws]
keyword_list = list(set(tokens))
|
|
|
|
|
# print(tokens)
|
|
|
|
|
# print(keyword_list)
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
PART 2
|
|
|
|
|
We iterate through the entire collection of html files, split each one into sentences, and record every sentence that contains a word from keyword_list, together with an id taken from the file path. The result is saved as a json file.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
# wordlist = {}
|
|
|
|
|
# avoiding_repetition = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Maps each keyword to the list of {'id': ..., 'sentence': ...} records
# collected for it so far.
sentences_w_word = {}


def analysis(the_word, file_name):
    """Record the sentences of one file that contain ``the_word``.

    The file is read and split into sentences with ``sent_tokenize``. Every
    sentence in which ``the_word`` occurs (plain, case-sensitive substring
    test) is appended to ``sentences_w_word[the_word]`` as a dict with the
    keys ``'id'`` and ``'sentence'``. Newlines inside the sentence are
    replaced by spaces and leading/trailing ``'``, ``<``, ``>``, ``(``, ``)``
    characters are stripped.

    Args:
        the_word: keyword to look for.
        file_name: path of the file to scan.

    Returns:
        None. Results accumulate in the module-level ``sentences_w_word``;
        the key is created (possibly with an empty list) on the first call
        for a given word.
    """
    # Characters 13 and 14 of the path are used as the document id.
    # NOTE(review): presumably relies on paths shaped like
    # "static/files/NN/..." -- confirm against the directory layout.
    doc_id = file_name[13:15]

    with open(file_name) as f:
        content = f.read()

    new_sent_tokens = [
        {'id': doc_id, 'sentence': sent_token.replace('\n', ' ').strip("'<>()")}
        for sent_token in sent_tokenize(content)
        if the_word in sent_token
    ]

    # Store under the parameter itself, so the function does not depend on a
    # global named `word` existing (and matching) at call time.
    sentences_w_word.setdefault(the_word, []).extend(new_sent_tokens)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Second pass over the collection: run analysis() for every keyword on every
# html file, filling sentences_w_word.
path = "static/files/"
for path, subdirs, files in os.walk(path):
    for name in files:
        if not name.endswith('html'):
            continue  # only html files belong to the collection
        file = os.path.join(path, name)
        for word in keyword_list:
            analysis(word, file)

# Save the keyword -> sentences mapping; ensure_ascii=False writes non-ASCII
# characters as-is instead of \u escapes.
with open('wordlist.json', 'w', encoding="utf8") as outfile:
    json.dump(sentences_w_word, outfile, ensure_ascii=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# def analysis(file, id):
|
|
|
|
|
# sent_tokens = sent_tokenize(file) # sentence tokenizing
|
|
|
|
|
# for sent_token in sent_tokens:
|
|
|
|
|
# tokens = word_tokenize(sent_token) # word tokenizing
|
|
|
|
|
# print(tokens)
|
|
|
|
|
# for token in tokens:
|
|
|
|
|
# for first in keyword_list:
|
|
|
|
|
# if token == first: # if token is in keyword_list
|
|
|
|
|
# if token not in wordlist:
|
|
|
|
|
# wordlist[token] = []
|
|
|
|
|
# sent_dict = {}
|
|
|
|
|
# sent_dict["id"]=id
|
|
|
|
|
# sent_dict["sentence"] = sent_token.replace('\n', ' ')
|
|
|
|
|
# wordlist[token].append(sent_dict)
|
|
|
|
|
# elif token not in avoiding_repetition:
|
|
|
|
|
# # print(wordlist[token])
|
|
|
|
|
# sent_dict = {}
|
|
|
|
|
# sent_dict["id"]=id
|
|
|
|
|
# sent_dict["sentence"] = sent_token.replace('\n', ' ')
|
|
|
|
|
# wordlist[token].append(sent_dict)
|
|
|
|
|
# avoiding_repetition.append(token)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# with open('static/files/17/17.blurb.html') as f:
|
|
|
|
|
# content = f.read()
|
|
|
|
|
# analysis(content, '17')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# # reading each individual html file
|
|
|
|
|
# path = "static/files/"
|
|
|
|
|
# for path, subdirs, files in os.walk(path):
|
|
|
|
|
# for name in files:
|
|
|
|
|
# if name.endswith('html'):
|
|
|
|
|
# file = os.path.join(path, name)
|
|
|
|
|
# with open(file) as f:
|
|
|
|
|
# content = f.read()
|
|
|
|
|
# id=name[:2]
|
|
|
|
|
# analysis(content, id)
|
|
|
|
|
|
|
|
|
|
# json_wordlist = json.dumps(wordlist)
|
|
|
|
|
# for item in wordlist:
|
|
|
|
|
# for item2 in wordlist[item]:
|
|
|
|
|
# print(item)
|
|
|
|
|
# print(item2["sentence"])
|
|
|
|
|
# print("\n")
|