mb@mb committed 6 years ago
commit d56ddde273
10 changed files with 15804 additions and 0 deletions
.gitignore
@@ -0,0 +1,2 @@
txt/*
__pycache__/
README.md
@@ -0,0 +1,33 @@
# Grrrrrrrrrrls - search machine (prototype)

A small Flask exercise, combining the TF-IDF algorithm, written in Python, with a web interface.

Grrrrrrrrrrls is a work-in-progress project for the [Computer Grrrls](https://hmkv.de/programm/programmpunkte/2018/Ausstellungen/2018_GRLS.php) exhibition at the HMKV & La Gaîté Lyrique.
# Install

    $ pip3 install flask

    $ pip3 install nltk
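
NLTK's `sent_tokenize`, which the indexer uses to split the texts into sentences, relies on the punkt data package. If NLTK complains about it missing, this standard download call (shown here as a suggestion) fetches it:

    $ python3 -c "import nltk; nltk.download('punkt')"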
# Txt documents

The search machine uses the index.json file to process results. The function 'create_index' can be called to generate this file. It uses a set of plain text files to index each word with its corresponding TF-IDF value. The plain text files are not included in this repo; I don't think I can publish them like that.

If you want to work with another set of documents, make a 'txt/' folder, add a few txt files to it, and remove the index.json file (or rename it if you want to keep it). The index can also be rebuilt by hand, as sketched below.
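
A one-liner that rebuilds the index without starting the server, assuming it is run from the repo root (next to tfidf.py) with a populated 'txt/' folder:

    $ python3 -c "import tfidf; tfidf.create_index()"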
# Start

Start the Flask/Python local server ...

    $ python3 start.py

Browse to your localhost on port 5000 ...

> 127.0.0.1:5000
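
Once the server runs, a query can also be fired from the command line; the search term `network` is only an illustrative word, not one guaranteed to appear in the corpus:

    $ curl '127.0.0.1:5000/?q=network'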
## Notes

This Grrrrrrrrrrls search machine cannot handle too much at once: it can only work with one word.

This is a prototype :)
index.json
File diff suppressed because it is too large
start.py
@@ -0,0 +1,40 @@
#!/usr/bin/env python3

import os

import flask
from flask import request

import tfidf


def get_index():
    index = tfidf.load_index()
    return index


def get_results(query):
    results, index = tfidf.request_results(query)
    return results, index


# Create the application.
APP = flask.Flask(__name__)


@APP.route('/', methods=['GET', 'POST'])
def index():
    """ Displays the index page accessible at '/' """
    query = None
    results = None

    if request.args.get('q', ''):
        query = request.args.get('q', '')
        results, index = get_results(query)
        files = [manifesto for manifesto, _ in index.items()]
        return flask.render_template('results.html', query=query, results=results, files=files)
    else:
        index = get_index()
        files = [manifesto for manifesto, _ in index.items()]
        return flask.render_template('index.html', files=files)


if __name__ == '__main__':
    # Build the index on first run, before serving any requests.
    if 'index.json' not in os.listdir('.'):
        tfidf.create_index()
    APP.debug = True
    APP.run()
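
# A minimal smoke-test sketch using Flask's built-in test client; it assumes
# index.json already exists, and 'cyborg' is just an illustrative query word:
#
#   >>> import start
#   >>> client = start.APP.test_client()
#   >>> client.get('/?q=cyborg').status_code
#   200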
static/css/stylesheet.css
@@ -0,0 +1,81 @@
body{
    background-color: rgba(220,220,220,0.6);
    margin:20px;
    font-family: sans-serif;
    font-size: 14px;
}
h1, h2, h3{
    font-size: 100%;
    margin:30px 0 0 0;
}
h2{
    font-size: 12px;
    font-weight: normal;
    border-bottom:1px solid;
}
#logo, #search{
    display: inline-block;
}
#logo{
    margin:15px 0;
}
#search{
    position: relative;
    width: 300px;
    margin:0 0 0 20px;
    top:-3px;
}
#search input#query{
    width: 100%;
    height: 42px;
    padding:0px 10px;
    border:1px solid rgba(190,190,190,1);
    vertical-align: baseline;
}
#search #submit{
    position: absolute;
    width: 26px;
    height: 26px;
    right: -12px;
    top:9px;
    border:0;
    border-radius: 100%;
    background-color:transparent;
    text-align: center;
}
#search #submit:hover{
    cursor: pointer;
}
#results, #intro{
    width:calc(100% - 371px);
    margin:10px 0 0 0;
}
.result{
    margin:10px 0 0 0;
}
.sentence{
    margin:10px 0 0 0;
}
.sentence strong{
    color:#800000;
}
#txt-list{
    position: absolute;
    width:200px;
    right: 0px;
    top:-7px;
    margin:20px;
    font-size: 12px;
    color:#800000;
}
#txt-list ul{
    margin:0;
    padding:0;
}
#txt-list ul li{
    margin:0;
    padding:0;
    text-indent: -38px;
    list-style: none;
    word-break: break-all;
}
static/images/Grrrrrrrrrrls.svg
(binary image added; size: 8.4 KiB)
templates/base.html
@@ -0,0 +1,30 @@
<!DOCTYPE html>
<html lang='en'>
<head>
    <meta charset="utf-8" />
    <title>Grrrrrrrrrrls {% block title %}{% endblock %}</title>
    <link type="text/css" rel="stylesheet" href="{{ url_for('static', filename='css/stylesheet.css')}}" />
</head>
<body>
    <div id="logo">
        <a href="{{ url_for('index')}}"><img src="{{ url_for('static', filename='images/Grrrrrrrrrrls.svg')}}"></a>
    </div>
    <div id="search">
        <form action="" method="GET">
            <input id="query" name="q" value="{{query}}"/>
            <input id="submit" type="submit" value="➜"/>
        </form>
    </div>
    <div id="txt-list">
        <p>Searching through <em>and calculating words of</em> the following txt documents:</p>

        <ul>
            {% for txt in files %}
            <li>{{txt}}</li>
            {% endfor %}
        </ul>
    </div>
    {% block results %}
    {% endblock %}
</body>
</html>
templates/index.html
@@ -0,0 +1,8 @@
{% extends "base.html" %}

{% block results %}
<div id="intro">
    <p>This Grrrrrrrrrrls search machine cannot handle too much at once: it can only work with <strong>one word</strong>.</p>
</div>

{% endblock %}
templates/results.html
@@ -0,0 +1,22 @@
{% extends "base.html" %}
{% block title %}{{query}}{% endblock %}
{% block results %}
<h1>The results for the query "{{query}}" are:</h1>
<div id="results">
    {% if results == {} %}
    <div>That word is not used in any of the manifestos.</div>
    {% else %}
    {% for _, manifesto in results.items() %}

    <div class="result">
        <h2>{{manifesto.name}}</h2>
        <div class="sentences">
            {% for sentence in manifesto.sentences %}
            <div class="sentence">{{sentence}}</div>
            {% endfor %}
        </div>
    </div>
    {% endfor %}
    {% endif %}
</div>
{% endblock %}
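{# results is a plain dict built by tfidf.request_results(); manifesto.name
   resolves here because Jinja's attribute lookup falls back to item lookup
   (manifesto['name']) when no such attribute exists. #}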
tfidf.py
@@ -0,0 +1,145 @@
import os, json
from math import log

from flask import Markup

from nltk import sent_tokenize
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')  # initialize tokenizer: word characters only, punctuation dropped

import pprint
pp = pprint.PrettyPrinter(indent=4)

def tfidf(query, manifesto, corpus):
    # Term Frequency: share of this manifesto's words that match the query
    tf_count = 0
    for word in manifesto:
        if query == word:
            tf_count += 1
    tf = tf_count/len(manifesto)
    # print('count:', tf_count)
    # print('total:', len(manifesto))
    # print('TF - count/total', tf_count/len(manifesto))

    # Inverse Document Frequency: rarity of the query across the corpus
    idf_count = 0
    for words in corpus:
        if query in words:
            idf_count += 1
    # print('count:', idf_count)
    idf = log(len(corpus)/idf_count)
    # print('documents:', len(corpus))
    # print('documents/count', len(corpus)/idf_count)
    # print('IDF - log(documents/count)', log(len(corpus)/idf_count))

    tfidf_value = tf * idf
    # print('TF-IDF:', tfidf_value)

    return tf_count, idf_count, tfidf_value
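
# A quick sanity check of tfidf() on a toy corpus (hypothetical data, not the
# exhibition texts): 'cat' appears once in a two-word manifesto and in one of
# two documents, so tf = 1/2, idf = log(2/1), and tfidf = 0.5 * log(2):
#
#   >>> tfidf('cat', ['cat', 'dog'], [['cat', 'dog'], ['dog']])
#   (1, 1, 0.34657359027997264)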

def load_text_files():
    files = []
    corpus = []
    sentences = {}
    dir = 'txt'

    for f in sorted(os.listdir(dir)):
        lines = open(dir+'/'+f, "r").read()  # full contents of the .txt file as one string
        words = [word for word in tokenizer.tokenize(lines)]  # tokenized words, without punctuation
        corpus.append(words)  # all words of one manifesto, in reading order
        s = sent_tokenize(lines)
        manifesto = f.replace('.txt', '')
        sentences[manifesto] = s
        files.append(manifesto)  # list of filenames, minus the .txt extension

    print('*txt files loaded*')
    return files, corpus, sentences
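
# Shapes returned by load_text_files(), for a hypothetical file 'a_manifesto.txt':
#   files     -> ['a_manifesto']
#   corpus    -> [['first', 'word', 'of', 'the', 'text', ...]]
#   sentences -> {'a_manifesto': ['First sentence.', 'Second sentence.', ...]}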

def create_index():
    files, corpus, sentences = load_text_files()
    index = {}

    # index = {
    #     'Fem manifesto': {
    #         'words': {
    #             'aap': 39.2,
    #             'beer': 20.456,
    #             'citroen': 3.21
    #         }
    #     }
    # }

    for i, words in enumerate(corpus):
        manifesto = files[i]
        index[manifesto] = {}
        for word in words:
            tf_count, idf_count, tfidf_value = tfidf(word, words, corpus)
            if 'words' not in index[manifesto]:
                index[manifesto]['words'] = {}
            index[manifesto]['words'][word] = tfidf_value

    with open('index.json', 'w+') as out:
        out.write(json.dumps(index, indent=4, sort_keys=True))
    print('*index created*')

def load_index():
    with open('index.json') as f:
        index = json.loads(f.read())
    return index

def request_results(query):
    query = query.strip()
    files, corpus, sentences = load_text_files()
    index = load_index()
    results = {}

    # results = {
    #     0: {
    #         'name': 'Fem_manifesto',
    #         'tfidf': 0.00041,
    #         'sentences': [
    #             'This is a first sentence.',
    #             'This is a second sentence.',
    #             'This is a third sentence.'
    #         ]
    #     }
    # }

    # make a list of manifestos that use the query word
    result_matches = []
    for manifesto, d in index.items():
        for word, value in d['words'].items():
            if query == word:
                result_matches.append([value, manifesto])

    # highest TF-IDF values first
    result_matches.sort(reverse=True)
    for x, result in enumerate(result_matches):
        results[x] = {}
        results[x]['tfidf'] = result[0]
        results[x]['name'] = result[1]

    # pp.pprint(results)

    # make a list of sentences that contain the query word
    # and shape the results object
    for x, manifesto in results.items():
        sents = sentences[manifesto['name']]
        value = manifesto['tfidf'] * 10000
        result_sentences = []
        count = 0
        for s in sents:
            done = False
            for word in tokenizer.tokenize(s):
                if word == query and not done:
                    if count < 3:  # include a max of 3 sentences per manifesto in the results list
                        count += 1
                        sentence = s.replace(query, '<strong style="font-size:{}px;">{}</strong>'.format(value, query))
                        html = Markup(sentence)
                        result_sentences.append(html)
                        done = True
        results[x]['sentences'] = result_sentences

    print('*results returned*')
    return results, index
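
# How the font-size scaling above plays out (numbers hypothetical): a match
# with TF-IDF 0.0021 gives value = 0.0021 * 10000 = 21.0, so the query word is
# wrapped as <strong style="font-size:21.0px;">...</strong>; the higher the
# score, the bigger the highlighted word renders.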