#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import json

from nltk import sent_tokenize, word_tokenize
from nltk import FreqDist  # only used in the commented-out frequency-based variant below

"""
PART 1
We create the dictionary: every html file is concatenated into allhtml.txt,
the text is tokenized, stopwords are removed, and the remaining distinct
tokens become the keyword list.
"""

# Tokens that should never become keywords: punctuation, quotes, and
# leftover HTML markup from the source files.
stopws = [",", ".", "?", "!", ":", "(", ")", ">", "<", "@", "#", "``", "/", "–", "''",
          "‘", "-", "’", "DOCTYPE", "html", "!", "'", "\n", "\n",
          "/body", "/html", "/head", "h2", "/h2", "h1", "/h1", "”", "“"]

# Concatenate every html file under static/files/ into a single text file.
# allhtml.txt is opened once in write mode so re-running the script does not
# append duplicate content.
path = "static/files/"
with open("allhtml.txt", "w") as total:
    for root, subdirs, files in os.walk(path):
        for name in files:
            if name.endswith('html'):
                file = os.path.join(root, name)
                with open(file) as f:
                    total.write(f.read())

# keyword_list = []
# with open('allhtml.txt') as f:
#     content = f.read()
#     tokens = word_tokenize(content)
#     tokens = [token for token in tokens if token not in stopws]
#     freq_file = FreqDist(tokens)
#     print(tokens)
#     keyword_list.append(freq_file.most_common(50))
# print(keyword_list[0])

# Tokenize the concatenated text, drop the stopwords, and keep every
# remaining token once: this is the keyword list.
with open('allhtml.txt') as f:
    content = f.read()
    tokens = word_tokenize(content)
    tokens = [token for token in tokens if token not in stopws]
    keyword_list = list(set(tokens))
    # print(tokens)
    # print(keyword_list)

"""
PART 2
We iterate through the entire collection of html files, split each one into
sentences, and check whether any keyword from keyword_list appears in them.
Every match is stored and finally written to wordlist.json, which maps each
keyword to the sentences (and file ids) that contain it.
"""

# wordlist = {}
# avoiding_repetition = []

sentences_w_word = {}


def analysis(the_word, file_name):
    # The file id is the two-character directory name that follows
    # "static/files/", e.g. "17" in "static/files/17/17.blurb.html".
    id = file_name[13:15]
    with open(file_name) as f:
        content = f.read()
        sent_tokens = sent_tokenize(content)
        new_sent_tokens = []
        for sent_token in sent_tokens:
            if the_word in sent_token:
                new_sent_tokens.append({'id': id,
                                        'sentence': sent_token.replace('\n', ' ').strip("'<>()")})
        if the_word in sentences_w_word:  # not the first file containing this word
            previous_sent_tokens = sentences_w_word[the_word]
            full_sent_tokens = previous_sent_tokens + new_sent_tokens
        else:
            full_sent_tokens = new_sent_tokens
        sentences_w_word[the_word] = full_sent_tokens


# Note: each file is re-read and re-tokenized once per keyword.
path = "static/files/"
for root, subdirs, files in os.walk(path):
    for name in files:
        if name.endswith('html'):
            file = os.path.join(root, name)
            for word in keyword_list:
                analysis(word, file)

with open('wordlist.json', 'w', encoding="utf8") as outfile:
    json.dump(sentences_w_word, outfile, ensure_ascii=False)


# Earlier, per-token approach (kept commented out):
# def analysis(file, id):
#     sent_tokens = sent_tokenize(file)  # sentence tokenizing
#     for sent_token in sent_tokens:
#         tokens = word_tokenize(sent_token)  # word tokenizing
#         print(tokens)
#         for token in tokens:
#             for first in keyword_list:
#                 if token == first:  # if token is in keyword_list
#                     if token not in wordlist:
#                         wordlist[token] = []
#                         sent_dict = {}
#                         sent_dict["id"] = id
#                         sent_dict["sentence"] = sent_token.replace('\n', ' ')
#                         wordlist[token].append(sent_dict)
#                     elif token not in avoiding_repetition:
#                         # print(wordlist[token])
#                         sent_dict = {}
#                         sent_dict["id"] = id
#                         sent_dict["sentence"] = sent_token.replace('\n', ' ')
#                         wordlist[token].append(sent_dict)
#                         avoiding_repetition.append(token)

# with open('static/files/17/17.blurb.html') as f:
#     content = f.read()
#     analysis(content, '17')

# # reading each individual html file
# path = "static/files/"
# for path, subdirs, files in os.walk(path):
#     for name in files:
#         if name.endswith('html'):
#             file = os.path.join(path, name)
#             with open(file) as f:
#                 content = f.read()
#                 id = name[:2]
#                 analysis(content, id)

# json_wordlist = json.dumps(wordlist)
# for item in wordlist:
#     for item2 in wordlist[item]:
#         print(item)
#         print(item2["sentence"])
#         print("\n")