diff --git a/static/files/.DS_Store b/static/files/.DS_Store
index b1d674c..b10cdea 100644
Binary files a/static/files/.DS_Store and b/static/files/.DS_Store differ
diff --git a/static/files/00. Pushing Scores/00.json b/static/files/00. Pushing Scores/00.json
deleted file mode 100644
index 9bc2eff..0000000
--- a/static/files/00. Pushing Scores/00.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
-  "id": "00.",
-  "name": "Yes",
-  "email": "sure@whynot.com",
-  "friend": "Y",
-  "content": "A sound file.",
-  "files":[
-    "files/00. Pushing Scores/Participants.txt",
-    "files/00. Pushing Scores/PushingScores.html",
-    "files/00. Pushing Scores/events.txt"
-
-  ]
-}
diff --git a/textedit.py b/textedit.py
new file mode 100644
index 0000000..dbd29b1
--- /dev/null
+++ b/textedit.py
@@ -0,0 +1,36 @@
+# this code is split in two parts:
+# going through the description html files and gathering the interesting words in a json file;
+# and going through the files again to replace words that also appear in the json with an a href version
+
+import sys, os
+import nltk
+from nltk import word_tokenize
+from nltk.util import trigrams
+
+
+# text analysis
+def analysis(file):
+    """Print and return the word trigrams found in an html document.
+
+    `file` is the text content of the document (not a path or file object).
+    Returns a list of 3-tuples of consecutive tokens; the list is empty when
+    the text has fewer than three tokens.
+    """
+    # tokenize first: trigrams() over a raw string would yield character
+    # triples instead of word triples (word_tokenize needs NLTK's punkt data)
+    tokens = word_tokenize(file)
+    # trigrams() is lazy, so build the list before printing it
+    file_trigrams = list(trigrams(tokens))
+    print(file_trigrams)
+    return file_trigrams
+
+
+# reading each individual html file
+path = "static/files/"
+for root, subdirs, files in os.walk(path):
+    for name in files:
+        if name.endswith('html'):
+            file = os.path.join(root, name)
+            with open(file, encoding="utf-8") as f:
+                content = f.read()
+            analysis(content)