import os
import json
import re

from nltk import sent_tokenize
from tqdm import tqdm

"""
PART 1
We create the dictionary of keywords: every html file in the collection is concatenated
into allhtml.txt, the text is tokenized, and each unique token that is not a stopword
becomes a keyword.
"""

# Punctuation, stray characters and html leftovers that should not become keywords.
stopws = [",", ".", "?", "!", ":", "(", ")", ">", "<", "@", "#", "``", "/", "–", "''", "‘", "-", "’",
          "DOCTYPE", "html", "!", "'", "\n", "\n", "/body", "/html", "/head", "h2", "/h2",
          "h1", "/h1", "”", "“", "o", "Ca", "/", "[", "]", "(", ")", "&", "%", "l", "n't",
          "t", "T", "S"]

path = "static/files/"

# Concatenate the whole collection of html files into a single text file.
for root, subdirs, files in os.walk(path):
    for name in files:
        if name.endswith('html'):
            file = os.path.join(root, name)
            with open(file, 'r') as f:
                content = f.read()
            with open("allhtml.txt", "a") as total:
                total.write(content)

# Tokenize the concatenated text and keep every unique token that is not a stopword.
with open('allhtml.txt') as f:
    content = f.read()
# tokens = word_tokenize(content)
tokens = re.compile(r"(?!-)[\W]+").split(content)
tokens = [token for token in tokens if token != "" and token not in stopws]
keyword_list = list(set(tokens))
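# For reference, a quick illustration of the tokenizer above (hypothetical input, kept as a
# comment so it does not run as part of the pipeline): the regex splits on runs of non-word
# characters while leaving in-word hyphens alone, so markup and punctuation fall away.
#
#   >>> re.compile(r"(?!-)[\W]+").split("<h1>Word-lists, keywords & sentences!</h1>")
#   ['', 'h1', 'Word-lists', 'keywords', 'sentences', 'h1', '']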
", "
", "/body", "/html", "/head", "h2", "/h2", "h1", "/h1","”","“", "o", "Ca", "/", "[", "]", "(", ")", "&", "%", "l", "n't", "t", "T", "S"] path = "static/files/" for path, subdirs, files in os.walk(path): for name in files: if name.endswith('html'): file = os.path.join(path, name) total = open("allhtml.txt", "a") with open(file, 'r+') as f: content = f.read() total.write(content) total.close() keyword_list = [] with open('allhtml.txt') as f: content = f.read() # tokens = word_tokenize(content) tokens = re.compile("(?!-)[\W]+").split(content) tokens.remove("") tokens = [token for token in tokens if token not in stopws] keyword_list = list(set(tokens)) """ PART 2 We iterate through the entire collection of html files, tokenize the words, and check to see whether any of them is in the keyword_list. If they are, then we generate a json file. """ sentences_w_word = {} def analysis(the_word, file_name): id = file_name[13:15] with open(file_name, 'r+') as f: content = f.read() sent_tokens = sent_tokenize(content) new_sent_tokens = [] re_word = r"\b" + re.escape(the_word) + r"\b" for sent_token in sent_tokens: if re.search(re_word, sent_token): new_sent_tokens.append({'id': id, 'sentence': sent_token.replace('\n', ' ').strip("'<>()“”")}) if the_word in sentences_w_word: # if this is not the first iteration previous_sent_tokens = sentences_w_word[the_word] full_sent_tokens = previous_sent_tokens + new_sent_tokens else: full_sent_tokens = new_sent_tokens sentences_w_word[the_word] = full_sent_tokens path = "static/files/" for path, subdirs, files in tqdm(os.walk(path)): for name in files: if name.endswith('html'): file = os.path.join(path, name) for word in keyword_list: analysis(word, file) with open('wordlist.json', 'w') as outfile: json.dump(sentences_w_word, outfile, ensure_ascii=False)