Cristina Cochior
6 years ago
23 changed files with 41001 additions and 97 deletions
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
@ -0,0 +1,18 @@ |
|||
#!/usr/bin/env python |
|||
import sys, os |
|||
import json |
|||
import re |
|||
|
|||
# Splits HTML into alternating text / tag pieces; the capturing group keeps
# the tags in the result so they can be copied through untouched.
TAG_RE = re.compile(r"(<[^>]*>)")


def link_words(text, words):
    """Return *text* with every keyword wrapped in a link to its search page.

    text  -- HTML source as a string.
    words -- iterable of keyword strings (iterating a dict yields its keys).

    Differences from a naive ``re.sub`` per keyword:
    * keywords are escaped, so "2.0" or "(" are matched literally instead of
      being interpreted as regular expressions;
    * all keywords are replaced in one pass, so anchors that were just
      inserted are never scanned again (a keyword such as "a" or "href"
      can no longer corrupt them);
    * only text outside ``<...>`` tags is touched, so tag names and
      attributes stay intact;
    * a keyword only matches as a whole word, not inside a longer word;
    * the keyword is percent-encoded inside the ``href`` query string.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote

    # Longest keywords first so "avant-garde" wins over "avant".
    candidates = sorted({w for w in words if w}, key=lambda w: (-len(w), w))
    if not candidates:
        return text

    pattern = re.compile(
        r"(?<!\w)(?:" + "|".join(re.escape(w) for w in candidates) + r")(?!\w)"
    )

    def anchor(match):
        word = match.group(0)
        return "<a href='/diverge?search=" + quote(word) + "'>" + word + "</a>"

    pieces = TAG_RE.split(text)
    # Odd indices hold the captured tags; even indices hold plain text.
    return "".join(
        piece if index % 2 else pattern.sub(anchor, piece)
        for index, piece in enumerate(pieces)
    )


def main():
    """Load the keyword list and link its words in every HTML file."""
    with open('wordlist.json', 'r', encoding='utf-8') as f:
        wordlist_dict = json.load(f)

    for dirpath, _subdirs, files in os.walk("static/files/"):
        for name in files:
            if name.endswith('html'):
                file = os.path.join(dirpath, name)
                with open(file, encoding='utf-8') as f:
                    textfile = f.read()
                textfile = link_words(textfile, wordlist_dict)
                # NOTE(review): as in the original script, the linked text is
                # computed but never written back to disk -- TODO confirm
                # whether (and where) it should be saved.


if __name__ == "__main__":
    main()
@ -1,38 +0,0 @@ |
|||
import json |
|||
|
|||
|
|||
# # to iterate through existing json file and find the correct json file |
|||
# def find_json(id): |
|||
# get path/to/file |
|||
|
|||
# return file |
|||
|
|||
# # |
|||
# def save_json(id, name, email, friend, content): |
|||
# file |
|||
# data = {"id": "path/to/file", "name":,"email":,"friend":,"content":} |
|||
|
|||
# with open('file.json', 'w') as f: |
|||
# json.dump(data, f) |
|||
|
|||
|
|||
|
|||
|
|||
# def jaction(original, id, name, email, friend, content): |
|||
# f = find_json_file(id) |
|||
# data = make_dict(f) |
|||
|
|||
# updated = update_dict(data, name, email, friend, content) |
|||
# save_json_file(f, updated) |
|||
|
|||
# # to find the file with the correct id |
|||
# def find_json_file(): |
|||
# f = open('file.json', 'w') |
|||
# iterate files to find id |
|||
# return f |
|||
|
|||
# # saving the json file |
|||
# def save_json_file(name, email, friend, content): |
|||
# dict= request.args.get( |
|||
# write(file, json.dump(data)) |
|||
|
@ -0,0 +1 @@ |
|||
[('graphic', 540), ('sound', 510), ('Rotterdam', 480), ('nl', 480), ('music', 450), ('notation', 420), ('project', 420), ('de', 390), ('new', 360), ('The', 360), ('DE', 360), ('PLAYER', 360), ('TGC', 330), ('art', 300), ('3', 300), ('van', 270), ('performance', 270), ('Gamma', 270), ('Circulaire', 270), ('event', 240), ('Tetra', 240), ("'", 240), ('score', 210), ('release', 210), ('Kris', 210), ('2017', 180), ('artists', 180), ('scores', 180), ('Antwerp', 180), ('2.0', 180), ('George', 180), ('I', 180), ('Remco', 150), ('Bladel', 150), ('For', 150), ('publishing', 150), ('Score', 150), ('us', 150), ('XPUB', 150), ('magazine', 150), ('Media', 150), ('2018', 150), ('Paradiso', 150), ('This', 150), ('research', 150), ('Vaast', 150), ('Colson', 150), ('Art', 150), ('avant-garde', 150), ('Remörk', 150)] |
@ -0,0 +1,27 @@ |
|||
import sys, os |
|||
from nltk import word_tokenize |
|||
from nltk import everygrams |
|||
from nltk import FreqDist |
|||
|
|||
stopws = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't", ",", ".", "?","!",":","(",")",">","<","@","#","``","/","–","''","‘","-","’", "DOCTYPE", "html", "!"]

# Set form of the stop list for constant-time membership tests.
STOPWORD_SET = frozenset(stopws)


def remove_stopwords(tokens, stopwords=None):
    """Return the tokens that are not stop words, preserving their order.

    tokens    -- iterable of token strings.
    stopwords -- optional collection of stop words; defaults to ``stopws``.

    A token is dropped when it, or its lower-cased form, is in the stop
    list. The list is almost entirely lower-case, so an exact comparison
    let "The", "This", "For" and "I" through.
    """
    blocked = STOPWORD_SET if stopwords is None else frozenset(stopwords)
    return [
        token for token in tokens
        if token not in blocked and token.lower() not in blocked
    ]


def main():
    """Write the 50 most common non-stop-word tokens to mostcommon.txt."""
    # The corpus file is opened once in write mode. The original appended to
    # it on every run without truncating, so each rerun duplicated the corpus
    # and inflated every count.
    with open("allhtml.txt", "w", encoding="utf-8") as total:
        for dirpath, _subdirs, files in os.walk("static/files/"):
            for name in files:
                if name.endswith('html'):
                    file = os.path.join(dirpath, name)
                    with open(file, encoding="utf-8") as f:
                        total.write(f.read())

    with open('allhtml.txt', encoding="utf-8") as f:
        content = f.read()

    tokens = remove_stopwords(word_tokenize(content))
    freq_file = FreqDist(tokens)

    with open("mostcommon.txt", "w", encoding="utf-8") as listofwords:
        listofwords.write(str(freq_file.most_common(50)))


if __name__ == "__main__":
    main()
Binary file not shown.
@ -0,0 +1,5 @@ |
|||
Here is a description of the event / artist / work. |
|||
|
|||
For the event, it can describe what happened, where it happened, when it happened. |
|||
For the artist, it can introduce the artist. |
|||
For the work, it can describe the content. |
@ -0,0 +1 @@ |
|||
Why was the artist / piece / event included in the collection? |
@ -0,0 +1 @@ |
|||
Here you can go into detail about an artist’s practice, for example through interviews, personal statements, reviews, etc. |
@ -1,26 +1,132 @@ |
|||
# this code is split in two parts: |
|||
# going through the description html files and gathering the interesting words in a json file; |
|||
# and going through the files again to replace words that also appear in the json with an a href version |
|||
|
|||
#!/usr/bin/env python |
|||
# -*- coding: utf-8 -*- |
|||
import sys, os |
|||
import nltk |
|||
from nltk import word_tokenize |
|||
from nltk.util import trigrams |
|||
|
|||
# text analysis |
|||
def analysis(file):
    """Print the trigrams of *file*.

    file -- the sequence to analyse. The visible caller passes the text
            content of an HTML file, not a path.

    The argument itself is analysed; previously the function ignored it and
    read the module-level ``content`` instead. The trigrams are materialised
    into a list before printing so the output shows the trigrams rather than
    the repr of a lazy iterator.
    """
    # NOTE(review): when given a plain string, nltk's trigrams presumably
    # yields character trigrams, not word trigrams -- confirm that is intended.
    file_trigrams = list(trigrams(file))
    print(file_trigrams)
|||
from nltk import sent_tokenize, word_tokenize |
|||
from nltk import everygrams |
|||
from nltk import FreqDist |
|||
import json |
|||
import re |
|||
|
|||
""" |
|||
PART 1 |
|||
We create the dictionary and save it. |
|||
""" |
|||
|
|||
stopws = [",", ".", "?","!",":","(",")",">","<","@","#","``","/","–","''","‘","-","’", "DOCTYPE", "html", "!", "'", "<br>", "<br />", "/body", "/html", "/head", "h2", "/h2", "h1", "/h1","”","“"]

# Read each individual HTML file and concatenate them into allhtml.txt.
# The corpus file is opened once, in write mode. The original reopened it in
# append mode for every file and never truncated it, so each rerun duplicated
# the corpus and kept text from HTML files that no longer exist.
path = "static/files/"
with open("allhtml.txt", "w") as total:
    for dirpath, subdirs, files in os.walk(path):
        for name in files:
            if name.endswith('html'):
                file = os.path.join(dirpath, name)
                with open(file) as f:
                    content = f.read()
                analysis(content)
                total.write(content)
|||
|
|||
keyword_list = [] |
|||
|
|||
# with open('allhtml.txt') as f: |
|||
# content = f.read() |
|||
# tokens = word_tokenize(content) |
|||
# tokens = [token for token in tokens if token not in stopws] |
|||
# freq_file=FreqDist(tokens) |
|||
# print(tokens) |
|||
# keyword_list.append(freq_file.most_common(50)) |
|||
# print(keyword_list[0]) |
|||
|
|||
# Load the concatenated corpus, tokenize it, drop the stop tokens and keep
# each remaining token once as a keyword.
with open('allhtml.txt') as f:
    content = f.read()

tokens = [
    candidate
    for candidate in word_tokenize(content)
    if candidate not in stopws
]
keyword_list = list(set(tokens))
|||
# print(tokens) |
|||
# print(keyword_list) |
|||
|
|||
""" |
|||
PART 2 |
|||
We iterate through the entire collection of html files, tokenize the words, and check to see whether any of them is in the keyword_list. If they are, then we generate a json file. |
|||
""" |
|||
|
|||
# wordlist = {} |
|||
# avoiding_repetition = [] |
|||
|
|||
|
|||
# Maps each keyword to a list of {'id': ..., 'sentence': ...} dicts.
sentences_w_word = {}


def matching_sentences(the_word, sentences, doc_id):
    """Return one record per sentence that contains *the_word*.

    the_word  -- keyword to look for (plain substring test).
    sentences -- iterable of sentence strings.
    doc_id    -- identifier stored under the 'id' key of every record.

    Newlines inside a sentence become spaces, and leading/trailing
    ``' < > ( )`` characters are stripped.
    """
    return [
        {'id': doc_id, 'sentence': sentence.replace('\n', ' ').strip("'<>()")}
        for sentence in sentences
        if the_word in sentence
    ]


def analysis(the_word, file_name):
    """Collect the sentences of *file_name* that contain *the_word*.

    the_word  -- keyword to look for.
    file_name -- path of the HTML file to read.

    Matches are appended to ``sentences_w_word[the_word]``. The key is
    created even when the file has no match. Returns None.

    The result is stored under the ``the_word`` parameter. It used to be
    stored under the global name ``word``, which only worked because the
    caller's loop variable happened to have that name.
    """
    # Characters 13-14 of the path are used as the document id.
    # NOTE(review): presumably this relies on paths of the form
    # "static/files/NN/..." with two-character directory names -- confirm.
    doc_id = file_name[13:15]

    with open(file_name) as f:
        content = f.read()

    new_sent_tokens = matching_sentences(the_word, sent_tokenize(content), doc_id)
    sentences_w_word.setdefault(the_word, []).extend(new_sent_tokens)
|||
|
|||
|
|||
|
|||
# Run the sentence analysis for every keyword against every HTML file, then
# save the collected keyword -> sentences mapping as JSON.
# The loop variable names (in particular `word`) are kept unchanged because
# code elsewhere in this module may read them as globals.
path = "static/files/"
for path, subdirs, files in os.walk(path):
    for name in files:
        if not name.endswith('html'):
            continue
        file = os.path.join(path, name)
        for word in keyword_list:
            analysis(word, file)

with open('wordlist.json', 'w', encoding="utf8") as outfile:
    json.dump(sentences_w_word, outfile, ensure_ascii=False)
|||
|
|||
|
|||
# def analysis(file, id): |
|||
# sent_tokens = sent_tokenize(file) # sentence tokenizing |
|||
# for sent_token in sent_tokens: |
|||
# tokens = word_tokenize(sent_token) # word tokenizing |
|||
# print(tokens) |
|||
# for token in tokens: |
|||
# for first in keyword_list: |
|||
# if token == first: # if token is in keyword_list |
|||
# if token not in wordlist: |
|||
# wordlist[token] = [] |
|||
# sent_dict = {} |
|||
# sent_dict["id"]=id |
|||
# sent_dict["sentence"] = sent_token.replace('\n', ' ') |
|||
# wordlist[token].append(sent_dict) |
|||
# elif token not in avoiding_repetition: |
|||
# # print(wordlist[token]) |
|||
# sent_dict = {} |
|||
# sent_dict["id"]=id |
|||
# sent_dict["sentence"] = sent_token.replace('\n', ' ') |
|||
# wordlist[token].append(sent_dict) |
|||
# avoiding_repetition.append(token) |
|||
|
|||
|
|||
# with open('static/files/17/17.blurb.html') as f: |
|||
# content = f.read() |
|||
# analysis(content, '17') |
|||
|
|||
|
|||
# # reading each individual html file |
|||
# path = "static/files/" |
|||
# for path, subdirs, files in os.walk(path): |
|||
# for name in files: |
|||
# if name.endswith('html'): |
|||
# file = os.path.join(path, name) |
|||
# with open(file) as f: |
|||
# content = f.read() |
|||
# id=name[:2] |
|||
# analysis(content, id) |
|||
|
|||
# json_wordlist = json.dumps(wordlist) |
|||
# for item in wordlist: |
|||
# for item2 in wordlist[item]: |
|||
# print(item) |
|||
# print(item2["sentence"]) |
|||
# print("\n") |
|||
|
File diff suppressed because one or more lines are too long
@ -1,13 +0,0 @@ |
|||
{ |
|||
|
|||
"way" : [ |
|||
{"id": ["17", "He described his own art as a way of 'ensuring that the details of everyday life, the random constellations of objects that surround us, stop going unnoticed.'"]}, |
|||
{"id": ["00", "Our ambition, and that of our collaborating partners, is to emancipate graphic notation from the confines of the modernist tradition, in such a way that it may remain an innovative and provocative medium for decades to come."]} |
|||
], |
|||
|
|||
"artwork" : [ |
|||
{"id": ["17", "One of the originators of 'participatory' art, in which the artwork can only be experienced by the active involvement of the viewer, he is most famous for his Event Scores such as Drip Music 1962, and is widely seen as an important precursor to conceptual art."]}, |
|||
{"id": ["00", "It unfolds through a nomadic program which includes the creation of newly commissioned artworks and public events that address contemporary questions and issues in this particular field.", |
|||
"The discursive program for 2016–2017 will include lectures, presentations of newly commissioned artworks, concert evenings, and workshops."]} |
|||
] |
|||
} |
Loading…
Reference in new issue