You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
26 lines
749 B
26 lines
749 B
# this code is split in two parts:
|
|
# going through the description html files and gathering the interesting words in a json file;
|
|
# and going through the files again to replace words that also appear in the JSON file with an <a href> link version
|
|
|
|
import sys, os
|
|
import nltk
|
|
from nltk import word_tokenize
|
|
from nltk.util import trigrams
|
|
|
|
# text analysis
|
|
def analysis(file):
    """Build and print the trigrams of the given text.

    Args:
        file: The sequence to analyse. The caller in this script passes the
            contents of an HTML file as a string (not a path or file object).

    Returns:
        The list of trigrams that was printed.
    """
    # Work on the argument itself rather than on a module-level variable, so
    # the function does not depend on what the caller named its data.
    # ``trigrams`` returns a lazy iterator; materialise it so that ``print``
    # shows the trigrams instead of the iterator's repr.
    # NOTE(review): for a plain string this yields character trigrams;
    # tokenise first if word trigrams are intended -- confirm.
    file_trigrams = list(trigrams(file))
    print(file_trigrams)
    return file_trigrams
|
|
|
|
|
|
|
|
# reading each individual html file
path = "static/files/"

# Distinct names for the walk variables keep ``path`` pointing at the root
# directory instead of being overwritten by every visited directory.
for dirpath, subdirs, files in os.walk(path):
    for name in files:
        # NOTE(review): this matches any name ending in "html" (so ".xhtml"
        # files too), not only the ".html" extension -- confirm intent.
        if not name.endswith('html'):
            continue

        file = os.path.join(dirpath, name)

        # Explicit encoding so decoding does not depend on the platform
        # locale. Assumes the HTML files are UTF-8 -- TODO confirm.
        with open(file, encoding="utf-8") as f:
            content = f.read()

        analysis(content)
|
|
|