You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

26 lines
749 B

# this code is split in two parts:
# going through the description html files and gathering the interesting words in a json file;
# and going through the files again to replace words that also appear in the json with an <a href> link version
import sys, os
import nltk
from nltk import word_tokenize
from nltk.util import trigrams
# text analysis
def analysis(file):
    """Print and return the trigrams of the text passed in as ``file``.

    Args:
        file: The text to analyse. Despite the name, the caller in this
            script passes the already-read contents of an HTML file, not a
            path or a file object.

    Returns:
        The list of trigrams that was printed.
    """
    # Work on the argument itself instead of a module-level variable, so the
    # function gives the right result for any caller.
    # In current NLTK releases trigrams() returns a lazy iterator; turn it
    # into a list so print() shows the trigrams rather than an object repr.
    # NOTE(review): a str is iterated character by character, so these are
    # character trigrams. If word trigrams are intended, tokenise the text
    # first (word_tokenize is imported at the top of the file) -- confirm.
    file_trigrams = list(trigrams(file))
    print(file_trigrams)
    return file_trigrams
# Read each individual html file below the root directory and analyse it.
path = "static/files/"
# Use a dedicated name for the directory currently being visited: reusing
# ``path`` as the loop variable would overwrite the root directory, so any
# later pass over the same tree would start from the wrong place.
for dirpath, subdirs, files in os.walk(path):
    for name in files:
        if not name.endswith('html'):
            continue
        file = os.path.join(dirpath, name)
        # NOTE(review): no explicit encoding is given, so the platform default
        # is used -- confirm the html files match it (or pass encoding=...).
        with open(file) as f:
            content = f.read()
        analysis(content)