mb@mb
6 years ago
commit
d56ddde273
10 changed files with 15804 additions and 0 deletions
@ -0,0 +1,2 @@ |
|||
txt/* |
|||
__pycache__/ |
@ -0,0 +1,33 @@ |
|||
# Grrrrrrrrrrls - search machine (prototype) |
|||
|
|||
A small Flask exercise, combining the TFIDF algorithm written in Python with a web interface.
|||
|
|||
Grrrrrrrrrrls is a project in progress for the [Computer Grrrls](https://hmkv.de/programm/programmpunkte/2018/Ausstellungen/2018_GRLS.php) exhibition at the HMKV & La Gaîté Lyrique. |
|||
|
|||
# Install |
|||
|
|||
$ pip3 install flask |
|||
|
|||
$ pip3 install nltk |
|||
|
|||
# Txt documents |
|||
|
|||
The search machine uses the index.json file to process results. The function 'create_index' can be called to generate this file. It uses a set of plain text files to index each word and its corresponding TFIDF value. The plain text files are not included in this repo, as I don't think I can publish them in that form.
|||
|
|||
If you want to work with another set of documents, make a 'txt/' folder, add a few txt files in it, and remove the index.json file (or rename it if you want to keep it with you). |
|||
|
|||
# Start |
|||
|
|||
Start the flask/python local server ... |
|||
|
|||
$ python3 start.py |
|||
|
|||
Browse to your localhost on port 5000 ... |
|||
|
|||
> 127.0.0.1:5000 |
|||
|
|||
## Notes |
|||
|
|||
This Grrrrrrrrrrls search machine cannot handle too much at once: it can only work with one word. |
|||
|
|||
This is a prototype :) |
File diff suppressed because it is too large
@ -0,0 +1,40 @@ |
|||
#!/usr/bin/env python3 |
|||
|
|||
import os |
|||
import flask |
|||
from flask import request |
|||
import tfidf |
|||
|
|||
def get_index():
    """Load and return the word/TF-IDF index from index.json."""
    return tfidf.load_index()
|||
|
|||
def get_results(query):
    """Look up *query* in the index and return (results, index)."""
    return tfidf.request_results(query)
|||
|
|||
# Create the Flask application instance used by the route decorator below.
APP = flask.Flask(__name__)
|||
|
|||
@APP.route('/', methods=['GET', 'POST'])
def index():
    """Display the index page accessible at '/'.

    With a non-empty ``?q=`` parameter, render the results page for
    that query; otherwise render the plain landing page.  Both variants
    pass the list of indexed text-file names for the sidebar.
    """
    # Read the query once instead of calling request.args.get twice.
    query = request.args.get('q', '')

    if query:
        results, index = get_results(query)
        # Only the manifesto names (the dict keys) are shown in the template.
        files = list(index)
        return flask.render_template('results.html', query=query,
                                     results=results, files=files)
    else:
        index = get_index()
        files = list(index)
        return flask.render_template('index.html', files=files)
|||
|
|||
if __name__ == '__main__':
    # Build the index on first run; later runs reuse the cached file.
    # os.path.exists is clearer (and cheaper) than scanning os.listdir('.').
    if not os.path.exists('index.json'):
        tfidf.create_index()
    APP.debug = True  # development server only; disable in production
    APP.run()
@ -0,0 +1,81 @@ |
|||
body{ |
|||
background-color: rgba(220,220,220,0.6); |
|||
margin:20px; |
|||
font-family: sans-serif; |
|||
font-size: 14px; |
|||
} |
|||
h1, h2, h3{ |
|||
font-size: 100%; |
|||
margin:30px 0 0 0; |
|||
} |
|||
h2{ |
|||
font-size: 12px; |
|||
font-weight: normal; |
|||
border-bottom:1px solid; |
|||
} |
|||
#logo, #search{ |
|||
display: inline-block; |
|||
} |
|||
#logo{ |
|||
margin:15px 0; |
|||
} |
|||
#search{ |
|||
position: relative; |
|||
width: 300px; |
|||
margin:0 0 0 20px; |
|||
top:-3px; |
|||
} |
|||
#search input#query{ |
|||
width: 100%; |
|||
height: 42px; |
|||
padding:0px 10px; |
|||
border:1px solid rgba(190,190,190,1); |
|||
vertical-align: baseline; |
|||
} |
|||
#search #submit{ |
|||
position: absolute; |
|||
width: 26px; |
|||
height: 26px; |
|||
right: -12px; |
|||
top:9px; |
|||
border:0; |
|||
border-radius: 100%; |
|||
background-color:transparent; |
|||
text-align: center; |
|||
} |
|||
#search #submit:hover{ |
|||
cursor: pointer; |
|||
} |
|||
#results, #intro{ |
|||
width:calc(100% - 371px); |
|||
margin:10px 0 0 0; |
|||
} |
|||
.result{ |
|||
margin:10px 0 0 0; |
|||
} |
|||
.sentence{ |
|||
margin:10px 0 0 0; |
|||
} |
|||
.sentence strong{ |
|||
color:#800000; |
|||
} |
|||
#txt-list{ |
|||
position: absolute; |
|||
width:200px; |
|||
right: 0px; |
|||
top:-7px; |
|||
margin:20px; |
|||
font-size: 12px; |
|||
color:#800000; |
|||
} |
|||
#txt-list ul{ |
|||
margin:0; |
|||
padding:0; |
|||
} |
|||
#txt-list ul li{ |
|||
margin:0; |
|||
padding:0; |
|||
text-indent: -38px; |
|||
list-style: none; |
|||
word-break: break-all; |
|||
} |
After Width: | Height: | Size: 8.4 KiB |
@ -0,0 +1,30 @@ |
|||
<!DOCTYPE html>
<html lang='en'>
	<head>
		<meta charset="utf-8" />
		<title>Grrrrrrrrrrls {% block title %}{% endblock %}</title>
		<link type="text/css" rel="stylesheet" href="{{ url_for('static', filename='css/stylesheet.css')}}" />
	</head>
	<body>
		<div id="logo">
			<a href="{{ url_for('index')}}"><img src="{{ url_for('static', filename='images/Grrrrrrrrrrls.svg')}}"></a>
		</div>
		<div id="search">
			<form action="" method="GET">
				<input id="query" name="q" value="{{query}}"/>
				<input id="submit" type="submit" value="➜"/>
			</form>
		</div>
		<div id="txt-list">
			<p>Searching through <em>and calculating words of</em> the following txt documents:</p>

			<ul>
				{% for txt in files %}
				<li>{{txt}}</li>
				{% endfor %}
			</ul> {# fixed: the closing tag was written as a second opening <ul> #}
		</div>
		{% block results %}
		{% endblock %}
	</body>
</html>
@ -0,0 +1,8 @@ |
|||
{# Landing page: shows a short usage note below the shared header from base.html. #}
{% extends "base.html" %}

{% block results %}
<div id="intro">
	<p>This Grrrrrrrrrrls search machine cannot handle too much at once: it can only work with <strong>one word</strong>.</p>
</div>

{% endblock %}
@ -0,0 +1,22 @@ |
|||
{# Results page: lists the manifestos containing the query word, best TF-IDF first. #}
{% extends "base.html" %}
{% block title %}{{query}}{% endblock %}
{% block results %}
<h1>The results for the query "{{query}}" are:</h1>
<div id="results">
	{% if results == {} %}
	<div>That word is not used in any of the manifesto's.</div>
	{% else %}
	{# results maps rank -> {name, tfidf, sentences}; built by tfidf.request_results #}
	{% for _, manifesto in results.items() %}

	<div class="result">
		<h2>{{manifesto.name}}</h2>
		<div class="sentences">
			{% for sentence in manifesto.sentences %}
			<div class="sentence">{{sentence}}</div>
			{% endfor %}
		</div>
	</div>
	{% endfor %}
	{% endif %}
</div>
{% endblock %}
@ -0,0 +1,145 @@ |
|||
import os, json
from math import log, exp
# NOTE(review): flask.Markup was removed in Flask 2.3 — newer installs need
# `from markupsafe import Markup` instead; confirm the deployed Flask version.
from flask import Markup

from nltk import sent_tokenize
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')  # word tokenizer: keeps \w+ runs, drops punctuation

import pprint
pp = pprint.PrettyPrinter(indent=4)  # used by the commented-out debugging pp.pprint calls
|||
|
|||
def tfidf(query, manifesto, corpus):
    """Compute the TF-IDF value of *query* for one document.

    Args:
        query: the word to score.
        manifesto: list of words of the document being scored.
        corpus: list of word-lists, one per document in the collection.

    Returns:
        (tf_count, idf_count, tfidf_value) where tf_count is the number
        of occurrences of query in manifesto, idf_count the number of
        documents containing query, and tfidf_value = TF * IDF.
    """
    # Term Frequency: occurrences of the word / length of the document.
    tf_count = manifesto.count(query)
    tf = tf_count / len(manifesto)

    # Inverse Document Frequency: log(total documents / documents with the word).
    idf_count = sum(1 for words in corpus if query in words)
    if idf_count == 0:
        # The word appears in no document at all: score it 0 instead of
        # raising ZeroDivisionError on the division below.
        return 0, 0, 0.0
    idf = log(len(corpus) / idf_count)

    return tf_count, idf_count, tf * idf
|||
|
|||
def load_text_files():
    """Read every file in the 'txt/' folder and tokenize it.

    Returns:
        files: sorted list of file names without the '.txt' extension.
        corpus: list of word-lists (tokenized, punctuation stripped), one
            per file, in the same order as *files*.
        sentences: dict mapping each file name to its list of sentences.
    """
    files = []
    corpus = []
    sentences = {}
    folder = 'txt'  # renamed: the original shadowed the builtin `dir`

    for filename in sorted(os.listdir(folder)):
        # `with` closes each file promptly instead of leaking the handle.
        with open(os.path.join(folder, filename), "r") as handle:
            text = handle.read()
        corpus.append(tokenizer.tokenize(text))  # words in reading order
        name = filename.replace('.txt', '')
        sentences[name] = sent_tokenize(text)
        files.append(name)

    print('*txt files loaded*')
    return files, corpus, sentences
|||
|
|||
def create_index():
    """Build index.json mapping each manifesto to its words' TF-IDF values.

    Output structure:
        { "<manifesto>": { "words": { "<word>": <tfidf value>, ... } } }
    """
    files, corpus, sentences = load_text_files()
    index = {}

    for i, words in enumerate(corpus):
        manifesto = files[i]
        index[manifesto] = {'words': {}}
        # Score each distinct word once; the original recomputed the full
        # TF-IDF for every *occurrence*, only to overwrite the same key.
        for word in set(words):
            _, _, tfidf_value = tfidf(word, words, corpus)
            index[manifesto]['words'][word] = tfidf_value

    # `with` closes the file; the explicit close() the original added
    # inside the with-block was redundant.
    with open('index.json', 'w+') as out:
        out.write(json.dumps(index, indent=4, sort_keys=True))
    print('*index created*')
|||
|
|||
def load_index():
    """Load and return the search index from index.json."""
    # `with` closes the handle; the original left the file open.
    with open('index.json') as f:
        return json.load(f)
|||
|
|||
def request_results(query):
    """Return (results, index) for a one-word *query*.

    results maps a rank (0 = best match) to a dict with the manifesto
    'name', its 'tfidf' value for the query, and up to three 'sentences'
    containing the word, each with the match wrapped in a <strong> tag
    whose font-size is scaled by the TF-IDF value.
    """
    query = query.strip()
    files, corpus, sentences = load_text_files()
    # `with` closes the index file; the original leaked the handle.
    with open('index.json') as f:
        index = json.load(f)
    results = {}

    # Collect [tfidf value, manifesto] pairs for manifestos using the word.
    result_matches = []
    for manifesto, data in index.items():
        for word, value in data['words'].items():
            if query == word:
                result_matches.append([value, manifesto])

    # Rank by descending TF-IDF value.
    result_matches.sort(reverse=True)
    for rank, (value, name) in enumerate(result_matches):
        results[rank] = {'tfidf': value, 'name': name}

    # Attach up to three sentences per manifesto that contain the word.
    for rank, manifesto in results.items():
        sents = sentences[manifesto['name']]
        value = manifesto['tfidf'] * 10000  # scale TF-IDF into a px font size
        result_sentences = []
        for s in sents:
            # Cap at 3 *sentences* per manifesto.  The original counted word
            # occurrences (so a sentence repeating the word ate the budget)
            # and compared strings with `is not`, which relies on CPython
            # interning and raises a SyntaxWarning on modern Pythons.
            if len(result_sentences) >= 3:
                break
            if query in tokenizer.tokenize(s):
                highlighted = s.replace(
                    query,
                    '<strong style="font-size:{}px;">{}</strong>'.format(value, query))
                result_sentences.append(Markup(highlighted))
        results[rank]['sentences'] = result_sentences

    print('*results returned*')
    return results, index
Loading…
Reference in new issue