# This code is split in two parts:
# 1. going through the description html files and gathering the interesting
#    words in a json file;
# 2. going through the files again to replace words that also appear in the
#    json with an a href version.

import sys
import os

import nltk
from nltk import word_tokenize
from nltk.util import trigrams


# text analysis
def analysis(file):
    """Print and return the word trigrams of one document's text.

    Args:
        file: The text content of a file (the caller below passes the
            string read from disk, not a path or a file object).

    Returns:
        A list of 3-tuples of consecutive tokens; empty when the text has
        fewer than three tokens.
    """
    # Tokenise first: handing the raw string to trigrams() would slide over
    # single characters instead of words.
    # NOTE(review): word_tokenize needs the NLTK "punkt" tokenizer data to
    # be installed, and the text is raw HTML, so markup is tokenised too.
    tokens = word_tokenize(file)
    # Materialise the result so print shows the trigrams themselves; current
    # NLTK returns a lazy iterator here, which would print as an object repr.
    file_trigrams = list(trigrams(tokens))
    print(file_trigrams)
    return file_trigrams


# reading each individual html file
path = "static/files/"
# Use a separate name for the directory being visited so the root `path`
# is not overwritten by the walk.
for dirpath, _subdirs, files in os.walk(path):
    for name in files:
        if name.endswith('html'):
            file = os.path.join(dirpath, name)
            with open(file, encoding="utf-8") as f:
                content = f.read()
            analysis(content)