import os
import json

from nltk import sent_tokenize, word_tokenize

"""
PART 1
We create the dictionary and save it.
"""

# Tokens that should not become dictionary keywords: punctuation, quotation
# marks, and HTML leftovers such as tag names.
stopws = [",", ".", "?", "!", ":", "(", ")", ">", "<", "@", "#", "``", "/", "–", "''", "‘", "-", "’",
          "DOCTYPE", "html", "!", "'", "\n", "/body", "/html", "/head", "h2", "/h2", "h1", "/h1",
          "”", "“"]

path = "static/files/"

# Concatenate every html file under static/files/ into one text file.
with open("allhtml.txt", "a") as total:
    for root, subdirs, files in os.walk(path):
        for name in files:
            if name.endswith('html'):
                file_path = os.path.join(root, name)
                with open(file_path, 'r') as f:
                    total.write(f.read())

# Tokenize the concatenated text and keep every distinct token that is not a stopword.
# NLTK's tokenizers need the 'punkt' data package (nltk.download('punkt')).
with open('allhtml.txt') as f:
    content = f.read()
tokens = word_tokenize(content)
tokens = [token for token in tokens if token not in stopws]
keyword_list = list(set(tokens))
# print(tokens)
# print(keyword_list)

"""
PART 2
We iterate through the entire collection of html files, tokenize the words, and check
whether any of them is in the keyword_list. If it is, we collect the sentences that
contain it and generate a json file.
"""

# wordlist = {}
# avoiding_repetition = []

sentences_w_word = {}

def analysis(the_word, file_name):
    # The document id is taken from the file path, assuming paths of the form
    # "static/files/XX...", so that characters 13-14 hold the id.
    doc_id = file_name[13:15]
    with open(file_name, 'r') as f:
        content = f.read()
    sent_tokens = sent_tokenize(content)
    new_sent_tokens = []
    for sent_token in sent_tokens:
        if the_word in sent_token:
            new_sent_tokens.append({'id': doc_id,
                                    'sentence': sent_token.replace('\n', ' ').strip("'<>()“”")})
    if the_word in sentences_w_word:  # if this is not the first file containing the word
        full_sent_tokens = sentences_w_word[the_word] + new_sent_tokens
    else:
        full_sent_tokens = new_sent_tokens
    sentences_w_word[the_word] = full_sent_tokens

# maybe ISO-8859-1 instead of utf8??
path = "static/files/"
# For every html file, look up every keyword. Note that this re-reads and
# re-tokenizes each file once per keyword, which is slow for large collections.
for root, subdirs, files in os.walk(path):
    for name in files:
        if name.endswith('html'):
            file_path = os.path.join(root, name)
            for word in keyword_list:
                analysis(word, file_path)

with open('wordlist.json', 'w') as outfile:
    json.dump(sentences_w_word, outfile, ensure_ascii=False)