first commit

This commit is contained in:
mb@mb 2018-08-31 10:26:51 +02:00
commit d56ddde273
10 changed files with 15804 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
txt/*
__pycache__/

33
README.md Normal file
View File

@ -0,0 +1,33 @@
# Grrrrrrrrrrls - search machine (prototype)
A small flask exercise, combining the TFIDF algorithm written in python with a web interface.
Grrrrrrrrrrls is a project in progress for the [Computer Grrrls](https://hmkv.de/programm/programmpunkte/2018/Ausstellungen/2018_GRLS.php) exhibition at the HMKV & La Gaîté Lyrique.
# Install
$ pip3 install flask
$ pip3 install nltk
# Txt documents
The search machine uses the index.json file to process results. The function 'create_index' can be called to generate this file. It uses a set of plain text files to index each word and its corresponding TFIDF value. The plain text files are not included in this repo, because I don't think I can publish them like that.
If you want to work with another set of documents, make a 'txt/' folder, add a few txt files in it, and remove the index.json file (or rename it if you want to keep it with you).
# Start
Start the flask/python local server ...
$ python3 start.py
Browse to your localhost on port 5000 ...
> 127.0.0.1:5000
## Notes
This Grrrrrrrrrrls search machine cannot handle too much at once: it can only work with one word.
This is a prototype :)

15309
index.json Normal file

File diff suppressed because it is too large Load Diff

40
start.py Normal file
View File

@ -0,0 +1,40 @@
#!/usr/bin/env python3
import os
import flask
from flask import request
import tfidf
def get_index():
    """Return the TF-IDF index as loaded by ``tfidf.load_index``."""
    return tfidf.load_index()
def get_results(query):
    """Look up ``query`` through ``tfidf.request_results``.

    Returns the ``(results, index)`` pair produced by that call, unchanged.
    """
    matches, loaded_index = tfidf.request_results(query)
    return matches, loaded_index
# Create the application.
# Module-level Flask instance: the '/' route is registered on it and the
# __main__ guard at the bottom of this file runs it.
APP = flask.Flask(__name__)
@APP.route('/', methods=['GET', 'POST'])
def index():
    """Serve the page at '/'.

    When the request carries no non-empty ``q`` argument the intro page
    ('index.html') is rendered. Otherwise the results page ('results.html')
    is rendered for that query. Both templates receive ``files``, the list
    of document names found in the index.
    """
    query = request.args.get('q', '')

    # No search term: show the landing page with the document list only.
    if not query:
        files = list(get_index())
        return flask.render_template('index.html', files=files)

    results, loaded_index = get_results(query)
    files = list(loaded_index)
    return flask.render_template('results.html', query=query, results=results, files=files)
if __name__ == '__main__':
    # Build the index on first launch, i.e. when no index.json sits in the
    # current working directory yet.
    index_missing = 'index.json' not in os.listdir('.')
    if index_missing:
        tfidf.create_index()
    APP.debug = True
    APP.run()

81
static/css/stylesheet.css Normal file
View File

@ -0,0 +1,81 @@
body{
background-color: rgba(220,220,220,0.6);
margin:20px;
font-family: sans-serif;
font-size: 14px;
}
h1, h2, h3{
font-size: 100%;
margin:30px 0 0 0;
}
h2{
font-size: 12px;
font-weight: normal;
border-bottom:1px solid;
}
#logo, #search{
display: inline-block;
}
#logo{
margin:15px 0;
}
#search{
position: relative;
width: 300px;
margin:0 0 0 20px;
top:-3px;
}
#search input#query{
width: 100%;
height: 42px;
padding:0px 10px;
border:1px solid rgba(190,190,190,1);
vertical-align: baseline;
}
#search #submit{
position: absolute;
width: 26px;
height: 26px;
right: -12px;
top:9px;
border:0;
border-radius: 100%;
background-color:transparent;
text-align: center;
}
#search #submit:hover{
cursor: pointer;
}
#results, #intro{
width:calc(100% - 371px);
margin:10px 0 0 0;
}
.result{
margin:10px 0 0 0;
}
.sentence{
margin:10px 0 0 0;
}
.sentence strong{
color:#800000;
}
#txt-list{
position: absolute;
width:200px;
right: 0px;
top:-7px;
margin:20px;
font-size: 12px;
color:#800000;
}
#txt-list ul{
margin:0;
padding:0;
}
#txt-list ul li{
margin:0;
padding:0;
text-indent: -38px;
list-style: none;
word-break: break-all;
}

View File

@ -0,0 +1,134 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Created with Inkscape (http://www.inkscape.org/) -->
<svg
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:cc="http://creativecommons.org/ns#"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:svg="http://www.w3.org/2000/svg"
xmlns="http://www.w3.org/2000/svg"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
width="123"
height="20"
viewBox="0 0 32.543769 5.291667"
version="1.1"
id="svg6930"
inkscape:version="0.92.1 r15371"
sodipodi:docname="Grrrrrrrrrrls.svg">
<defs
id="defs6924" />
<sodipodi:namedview
id="base"
pagecolor="#ffffff"
bordercolor="#666666"
borderopacity="1.0"
inkscape:pageopacity="0.0"
inkscape:pageshadow="2"
inkscape:zoom="1.4"
inkscape:cx="119.50258"
inkscape:cy="137.20382"
inkscape:document-units="mm"
inkscape:current-layer="flowRoot6291"
showgrid="false"
units="px"
inkscape:window-width="1280"
inkscape:window-height="701"
inkscape:window-x="0"
inkscape:window-y="1052"
inkscape:window-maximized="1" />
<metadata
id="metadata6927">
<rdf:RDF>
<cc:Work
rdf:about="">
<dc:format>image/svg+xml</dc:format>
<dc:type
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
<dc:title></dc:title>
</cc:Work>
</rdf:RDF>
</metadata>
<g
inkscape:label="Layer 1"
inkscape:groupmode="layer"
id="layer1"
transform="translate(0,-291.70832)">
<g
aria-label="Grrrrrrrrrrls"
transform="matrix(0.21160622,0,0,0.21160622,-5.1292084,281.28732)"
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;text-align:end;letter-spacing:0px;word-spacing:0px;text-anchor:end;fill:#000000;fill-opacity:1;stroke:none;stroke-width:4.65293169;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
id="flowRoot6291">
<g
id="g105"
transform="matrix(0.72343932,0,0,0.72343932,2.5711799,25.676847)"
style="stroke-width:6.43168211">
<path
id="path65"
style="fill:#800000;stroke-width:6.43168211"
d="M 57.352636,47.991785 H 46.244337 v 4.755265 h 6.314992 c -0.152169,1.559727 -0.53259,2.51078 -1.369517,3.499876 -1.369516,1.673853 -3.461833,2.662948 -5.668276,2.662948 -4.603096,0 -7.912761,-4.032465 -7.912761,-9.738783 0,-5.972613 2.929243,-9.586615 7.760593,-9.586615 1.97819,0 3.652043,0.570632 4.907433,1.673854 0.798885,0.684758 1.217348,1.331474 1.711896,2.777074 h 5.363939 c -0.684758,-5.706318 -5.325897,-9.320319 -12.02131,-9.320319 -7.988846,0 -13.428869,5.896528 -13.428869,14.53209 0,8.407309 5.478065,14.532091 12.972363,14.532091 3.728128,0 6.238908,-1.331475 8.369267,-4.450929 l 0.684758,3.652044 h 3.423791 z"
inkscape:connector-curvature="0" />
<path
id="path67"
style="fill:#ff0000;stroke-width:6.43168211"
d="m 62.273147,42.361551 v 20.542746 h 5.325897 V 51.986208 c 0,-3.119454 1.559727,-4.679181 4.679181,-4.679181 0.570632,0 0.951053,0.03804 1.673854,0.152168 v -5.401981 c -0.304337,-0.03804 -0.494548,-0.03804 -0.646717,-0.03804 -2.434695,0 -4.565054,1.597769 -5.706318,4.374844 v -4.032465 z"
inkscape:connector-curvature="0" />
<path
id="path69"
style="fill:#808000;stroke-width:6.43168211"
d="m 77.059049,42.361551 v 20.542746 h 5.325897 V 51.986208 c 0,-3.119454 1.559727,-4.679181 4.679181,-4.679181 0.570632,0 0.951053,0.03804 1.673854,0.152168 v -5.401981 c -0.304337,-0.03804 -0.494548,-0.03804 -0.646716,-0.03804 -2.434696,0 -4.565055,1.597769 -5.706319,4.374844 v -4.032465 z"
inkscape:connector-curvature="0" />
<path
id="path71"
style="fill:#ffff00;stroke-width:6.43168211"
d="M 91.844953,42.361551 V 62.904297 H 97.17085 V 51.986208 c 0,-3.119454 1.559727,-4.679181 4.67918,-4.679181 0.57063,0 0.95105,0.03804 1.67385,0.152168 v -5.401981 c -0.30433,-0.03804 -0.49454,-0.03804 -0.64671,-0.03804 -2.4347,0 -4.565056,1.597769 -5.70632,4.374844 v -4.032465 z"
inkscape:connector-curvature="0" />
<path
id="path73"
style="fill:#008000;stroke-width:6.43168211"
d="m 106.63085,42.361551 v 20.542746 h 5.3259 V 51.986208 c 0,-3.119454 1.55973,-4.679181 4.67918,-4.679181 0.57063,0 0.95105,0.03804 1.67385,0.152168 v -5.401981 c -0.30433,-0.03804 -0.49454,-0.03804 -0.64671,-0.03804 -2.4347,0 -4.56506,1.597769 -5.70632,4.374844 v -4.032465 z"
inkscape:connector-curvature="0" />
<path
id="path75"
style="fill:#00ff00;stroke-width:6.43168211"
d="m 121.41676,42.361551 v 20.542746 h 5.32589 V 51.986208 c 0,-3.119454 1.55973,-4.679181 4.67919,-4.679181 0.57063,0 0.95105,0.03804 1.67385,0.152168 v -5.401981 c -0.30434,-0.03804 -0.49455,-0.03804 -0.64672,-0.03804 -2.43469,0 -4.56505,1.597769 -5.70632,4.374844 v -4.032465 z"
inkscape:connector-curvature="0" />
<path
id="path77"
style="fill:#008080;stroke-width:6.43168211"
d="m 136.20266,42.361551 v 20.542746 h 5.3259 V 51.986208 c 0,-3.119454 1.55973,-4.679181 4.67918,-4.679181 0.57063,0 0.95105,0.03804 1.67385,0.152168 v -5.401981 c -0.30433,-0.03804 -0.49455,-0.03804 -0.64671,-0.03804 -2.4347,0 -4.56506,1.597769 -5.70632,4.374844 v -4.032465 z"
inkscape:connector-curvature="0" />
<path
id="path79"
style="fill:#00ffff;stroke-width:6.43168211"
d="m 150.98856,42.361551 v 20.542746 h 5.3259 V 51.986208 c 0,-3.119454 1.55973,-4.679181 4.67918,-4.679181 0.57063,0 0.95106,0.03804 1.67386,0.152168 v -5.401981 c -0.30434,-0.03804 -0.49455,-0.03804 -0.64672,-0.03804 -2.4347,0 -4.56505,1.597769 -5.70632,4.374844 v -4.032465 z"
inkscape:connector-curvature="0" />
<path
id="path81"
style="fill:#000080;stroke-width:6.43168211"
d="m 165.77446,42.361551 v 20.542746 h 5.3259 V 51.986208 c 0,-3.119454 1.55973,-4.679181 4.67918,-4.679181 0.57063,0 0.95105,0.03804 1.67385,0.152168 v -5.401981 c -0.30433,-0.03804 -0.49454,-0.03804 -0.64671,-0.03804 -2.4347,0 -4.56506,1.597769 -5.70632,4.374844 v -4.032465 z"
inkscape:connector-curvature="0" />
<path
id="path83"
style="fill:#0000ff;stroke-width:6.43168211"
d="m 180.56037,42.361551 v 20.542746 h 5.32589 V 51.986208 c 0,-3.119454 1.55973,-4.679181 4.67918,-4.679181 0.57064,0 0.95106,0.03804 1.67386,0.152168 v -5.401981 c -0.30434,-0.03804 -0.49455,-0.03804 -0.64672,-0.03804 -2.4347,0 -4.56505,1.597769 -5.70632,4.374844 v -4.032465 z"
inkscape:connector-curvature="0" />
<path
id="path85"
style="fill:#800080;stroke-width:6.43168211"
d="m 195.34627,42.361551 v 20.542746 h 5.3259 V 51.986208 c 0,-3.119454 1.55972,-4.679181 4.67918,-4.679181 0.57063,0 0.95105,0.03804 1.67385,0.152168 v -5.401981 c -0.30434,-0.03804 -0.49455,-0.03804 -0.64672,-0.03804 -2.43469,0 -4.56505,1.597769 -5.70631,4.374844 v -4.032465 z"
inkscape:connector-curvature="0" />
<path
id="path87"
style="fill:#ff00ff;stroke-width:6.43168211"
d="m 215.61024,35.17159 h -5.3259 v 27.732707 h 5.3259 z"
inkscape:connector-curvature="0" />
<path
id="path89"
style="fill:#2b0000;stroke-width:6.43168211"
d="m 237.49754,48.98088 c -0.0761,-4.336802 -3.42379,-6.961708 -8.9399,-6.961708 -5.21177,0 -8.44535,2.624906 -8.44535,6.847582 0,1.369516 0.41847,2.548822 1.14127,3.347707 0.7228,0.7228 1.36951,1.065179 3.3477,1.711895 l 6.35304,1.97819 c 1.33147,0.418464 1.78798,0.836927 1.78798,1.673854 0,1.25539 -1.48365,2.016232 -3.95638,2.016232 -1.36952,0 -2.47274,-0.266295 -3.1575,-0.7228 -0.57063,-0.418463 -0.79888,-0.836927 -1.02714,-1.940148 h -5.21177 c 0.15217,4.48897 3.46183,6.847582 9.70074,6.847582 2.85316,0 5.02156,-0.608674 6.54325,-1.826022 1.52168,-1.217348 2.43469,-3.119454 2.43469,-5.135687 0,-2.662948 -1.33147,-4.374844 -4.03246,-5.173728 l -6.73346,-1.940148 c -1.48364,-0.456506 -1.86406,-0.760843 -1.86406,-1.59777 0,-1.141263 1.21735,-1.902106 3.08141,-1.902106 2.54882,0 3.80421,0.913011 3.84226,2.777075 z"
inkscape:connector-curvature="0" />
</g>
</g>
</g>
</svg>

After

Width:  |  Height:  |  Size: 8.4 KiB

30
templates/base.html Normal file
View File

@ -0,0 +1,30 @@
<!DOCTYPE html>
<html lang='en'>
<head>
<meta charset="utf-8" />
<title>Grrrrrrrrrrls {% block title %}{% endblock %}</title>
<link type="text/css" rel="stylesheet" href="{{ url_for('static', filename='css/stylesheet.css')}}" />
</head>
<body>
<div id="logo">
<a href="{{ url_for('index')}}"><img src="{{ url_for('static', filename='images/Grrrrrrrrrrls.svg')}}"></a>
</div>
<div id="search">
<form action="" method="GET">
<input id="query" name="q" value="{{query}}"/>
<input id="submit" type="submit" value="➜"/>
</form>
</div>
<div id="txt-list">
<p>Searching through <em>and calculating words of</em> the following txt documents:</p>
<ul>
{% for txt in files %}
<li>{{txt}}</li>
{% endfor %}
</ul>
</div>
{% block results %}
{% endblock %}
</body>
</html>

8
templates/index.html Normal file
View File

@ -0,0 +1,8 @@
{% extends "base.html" %}
{% block results %}
<div id="intro">
<p>This Grrrrrrrrrrls search machine cannot handle too much at once: it can only work with <strong>one word</strong>.</p>
</div>
{% endblock %}

22
templates/results.html Normal file
View File

@ -0,0 +1,22 @@
{% extends "base.html" %}
{% block title %}{{query}}{% endblock %}
{% block results %}
<h1>The results for the query "{{query}}" are:</h1>
<div id="results">
{% if results == {} %}
<div>That word is not used in any of the manifesto's.</div>
{% else %}
{% for _, manifesto in results.items() %}
<div class="result">
<h2>{{manifesto.name}}</h2>
<div class="sentences">
{% for sentence in manifesto.sentences %}
<div class="sentence">{{sentence}}</div>
{% endfor %}
</div>
</div>
{% endfor %}
{% endif %}
</div>
{% endblock %}

145
tfidf.py Normal file
View File

@ -0,0 +1,145 @@
import os, json
from math import log, exp
from flask import Markup
from nltk import sent_tokenize
from nltk.tokenize import RegexpTokenizer
# Word tokenizer shared by the functions in this module: the pattern r'\w+'
# keeps runs of word characters, so punctuation is dropped.
tokenizer = RegexpTokenizer(r'\w+') # initialize tokenizer
import pprint
# Pretty-printer for debugging; its only use in this file is a commented-out call.
pp = pprint.PrettyPrinter(indent=4)
def tfidf(query, manifesto, corpus):
    """Compute the TF-IDF score of ``query`` for one document of a corpus.

    Args:
        query: The word to score.
        manifesto: Sequence of the words of the document being scored.
        corpus: Sized collection of documents, each one a container of
            words that supports the ``in`` operator (list or set).

    Returns:
        A ``(tf_count, idf_count, tfidf_value)`` tuple where ``tf_count`` is
        the number of occurrences of ``query`` in ``manifesto``,
        ``idf_count`` is the number of corpus documents containing
        ``query``, and ``tfidf_value`` is
        ``(tf_count / len(manifesto)) * log(len(corpus) / idf_count)``.
        The score is ``0.0`` when ``manifesto`` is empty or when no corpus
        document contains ``query``; those cases used to raise
        ``ZeroDivisionError``.
    """
    # Term Frequency: share of the document's words that equal the query.
    tf_count = sum(1 for word in manifesto if query == word)
    total = len(manifesto)
    tf = tf_count / total if total else 0.0

    # Inverse Document Frequency: log(documents / documents containing query).
    idf_count = sum(1 for words in corpus if query in words)
    if idf_count == 0:
        # The query appears nowhere in the corpus, so it has no score.
        return tf_count, idf_count, 0.0
    idf = log(len(corpus) / idf_count)

    tfidf_value = tf * idf
    return tf_count, idf_count, tfidf_value
def load_text_files(directory='txt'):
    """Read and tokenize every file found in ``directory``.

    Args:
        directory: Folder holding the plain text documents. Defaults to
            'txt', the folder this function always read before the
            parameter existed.

    Returns:
        A ``(files, corpus, sentences)`` tuple:
        ``files`` is the list of document names (file name with '.txt'
        removed) in sorted file order; ``corpus`` is a parallel list with
        the words of each document in reading order, without punctuation;
        ``sentences`` maps each document name to its list of sentences.

    Raises:
        FileNotFoundError: If ``directory`` does not exist.
    """
    files = []
    corpus = []
    sentences = {}
    for filename in sorted(os.listdir(directory)):
        # The context manager closes the handle; the previous bare
        # open(...).read() left one open per document.
        with open(os.path.join(directory, filename), "r") as handle:
            text = handle.read()
        corpus.append(list(tokenizer.tokenize(text)))  # words in reading order, no punctuation
        # Same name derivation as before, so names keep matching the keys
        # of an already generated index.json.
        manifesto = filename.replace('.txt', '')
        sentences[manifesto] = sent_tokenize(text)
        files.append(manifesto)
    print('*txt files loaded*')
    return files, corpus, sentences
def create_index():
    """Compute the TF-IDF value of every word of every document and write
    the result to 'index.json' in the current working directory.

    The file has this shape (keys sorted, 4-space indent):

        {
            "Fem manifesto": {
                "words": {
                    "aap": 39.2,
                    "beer": 20.456,
                    "citroen": 3.21
                }
            }
        }

    Every document gets a 'words' mapping, even when the document is empty.
    Previously an empty document produced an entry without a 'words' key,
    which made later lookups of ``entry['words']`` fail with KeyError.
    """
    files, corpus, _sentences = load_text_files()

    # One set per document: the "is the word in this document" test done by
    # tfidf() becomes a hash lookup instead of a scan of a word list.
    vocabularies = [set(words) for words in corpus]

    index = {}
    for manifesto, words, vocabulary in zip(files, corpus, vocabularies):
        scores = {}
        # Score each distinct word once; repeated occurrences of a word
        # yield the same value and would only overwrite the same key.
        for word in vocabulary:
            _tf_count, _idf_count, tfidf_value = tfidf(word, words, vocabularies)
            scores[word] = tfidf_value
        index[manifesto] = {'words': scores}

    with open('index.json', 'w') as out:
        out.write(json.dumps(index, indent=4, sort_keys=True))
    print('*index created*')
def load_index():
    """Read 'index.json' from the current working directory.

    Returns:
        The parsed JSON content of the file.

    Raises:
        FileNotFoundError: If 'index.json' does not exist.
    """
    # The context manager closes the handle; the previous bare
    # open(...).read() left it open.
    with open('index.json') as handle:
        return json.load(handle)
def request_results(query, max_sentences=3):
    """Find the documents that use ``query`` and a few sentences showing it.

    Args:
        query: The single word to search for; surrounding whitespace is
            stripped. Matching is exact and case-sensitive against the
            words stored in 'index.json'.
        max_sentences: Maximum number of example sentences returned per
            document. A sentence counts once, however often the word
            occurs in it.

    Returns:
        A ``(results, index)`` tuple. ``index`` is the loaded content of
        'index.json'. ``results`` maps a rank (0 = highest TF-IDF value) to
        a dict of this shape:

            {
                'name': 'Fem_manifesto',
                'tfidf': 0.00041,
                'sentences': [Markup('This is a first sentence.'), ...]
            }

        ``results`` is an empty dict when no document uses the word.
    """
    query = query.strip()

    try:
        _files, _corpus, sentences = load_text_files()
    except FileNotFoundError:
        # An index.json can be present without the 'txt/' folder it was
        # built from; documents are then still listed, without sentences.
        sentences = {}

    index = load_index()

    # Make a list of manifestos that use the query word, best score first.
    # Tuples sort by value, then by name, exactly like the former lists.
    result_matches = sorted(
        (
            (d['words'][query], manifesto)
            for manifesto, d in index.items()
            # .get() tolerates index entries written without a 'words' key.
            if query in d.get('words', {})
        ),
        reverse=True,
    )

    results = {}
    for x, (tfidf_value, name) in enumerate(result_matches):
        # The TF-IDF value, scaled up, is used as the font size in px of
        # the highlighted word.
        value = tfidf_value * 10000
        highlight = '<strong style="font-size:{}px;">{}</strong>'.format(value, query)

        # Collect the first sentences that contain the query as a whole word.
        result_sentences = []
        for s in sentences.get(name, []):
            if len(result_sentences) >= max_sentences:
                break
            if query in tokenizer.tokenize(s):
                # NOTE(review): the sentence is marked as safe HTML without
                # escaping, which relies on the corpus being trusted plain
                # text. The query itself only gets here when it equals a
                # word stored in the index.
                result_sentences.append(Markup(s.replace(query, highlight)))

        results[x] = {
            'tfidf': tfidf_value,
            'name': name,
            'sentences': result_sentences,
        }

    print('*results returned*')
    return results, index