mb@mb committed 6 years ago
commit d56ddde273
10 changed files with 15804 additions and 0 deletions
.gitignore
@@ -0,0 +1,2 @@
txt/*
__pycache__/
README.md
@@ -0,0 +1,33 @@
# Grrrrrrrrrrls - search machine (prototype)

A small Flask exercise, combining the TF-IDF algorithm, written in Python, with a web interface.

Grrrrrrrrrrls is a work-in-progress project for the [Computer Grrrls](https://hmkv.de/programm/programmpunkte/2018/Ausstellungen/2018_GRLS.php) exhibition at the HMKV & La Gaîté Lyrique.
# Install

    $ pip3 install flask

    $ pip3 install nltk
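
NLTK's `sent_tokenize`, which the indexer uses to split the texts into sentences, relies on the punkt data package. If NLTK complains about it missing, this standard download call (shown here as a suggestion) fetches it:

    $ python3 -c "import nltk; nltk.download('punkt')"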
# Txt documents

The search machine uses the index.json file to process results. The function 'create_index' can be called to generate this file. It uses a set of plain text files to index each word with its corresponding TF-IDF value. The plain text files are not included in this repo; I don't think I can publish them like that.

If you want to work with another set of documents, make a 'txt/' folder, add a few txt files to it, and remove the index.json file (or rename it if you want to keep it). The index can also be rebuilt by hand, as sketched below.
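
A one-liner that rebuilds the index without starting the server, assuming it is run from the repo root (next to tfidf.py) with a populated 'txt/' folder:

    $ python3 -c "import tfidf; tfidf.create_index()"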
# Start

Start the Flask/Python local server ...

    $ python3 start.py

Browse to your localhost on port 5000 ...

> 127.0.0.1:5000
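
Once the server runs, a query can also be fired from the command line; the search term `network` is only an illustrative word, not one guaranteed to appear in the corpus:

    $ curl '127.0.0.1:5000/?q=network'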
## Notes

This Grrrrrrrrrrls search machine cannot handle too much at once: it can only work with one word.

This is a prototype :)
index.json
File diff suppressed because it is too large
start.py
@@ -0,0 +1,40 @@
#!/usr/bin/env python3

import os

import flask
from flask import request

import tfidf


def get_index():
    index = tfidf.load_index()
    return index


def get_results(query):
    results, index = tfidf.request_results(query)
    return results, index


# Create the application.
APP = flask.Flask(__name__)


@APP.route('/', methods=['GET', 'POST'])
def index():
    """ Displays the index page accessible at '/' """
    query = None
    results = None

    if request.args.get('q', ''):
        query = request.args.get('q', '')
        results, index = get_results(query)
        files = [manifesto for manifesto, _ in index.items()]
        return flask.render_template('results.html', query=query, results=results, files=files)
    else:
        index = get_index()
        files = [manifesto for manifesto, _ in index.items()]
        return flask.render_template('index.html', files=files)


if __name__ == '__main__':
    # Build the index on first run, before serving any requests.
    if 'index.json' not in os.listdir('.'):
        tfidf.create_index()
    APP.debug = True
    APP.run()
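
# A minimal smoke-test sketch using Flask's built-in test client; it assumes
# index.json already exists, and 'cyborg' is just an illustrative query word:
#
#   >>> import start
#   >>> client = start.APP.test_client()
#   >>> client.get('/?q=cyborg').status_code
#   200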
static/css/stylesheet.css
@@ -0,0 +1,81 @@
body{
    background-color: rgba(220,220,220,0.6);
    margin:20px;
    font-family: sans-serif;
    font-size: 14px;
}
h1, h2, h3{
    font-size: 100%;
    margin:30px 0 0 0;
}
h2{
    font-size: 12px;
    font-weight: normal;
    border-bottom:1px solid;
}
#logo, #search{
    display: inline-block;
}
#logo{
    margin:15px 0;
}
#search{
    position: relative;
    width: 300px;
    margin:0 0 0 20px;
    top:-3px;
}
#search input#query{
    width: 100%;
    height: 42px;
    padding:0px 10px;
    border:1px solid rgba(190,190,190,1);
    vertical-align: baseline;
}
#search #submit{
    position: absolute;
    width: 26px;
    height: 26px;
    right: -12px;
    top:9px;
    border:0;
    border-radius: 100%;
    background-color:transparent;
    text-align: center;
}
#search #submit:hover{
    cursor: pointer;
}
#results, #intro{
    width:calc(100% - 371px);
    margin:10px 0 0 0;
}
.result{
    margin:10px 0 0 0;
}
.sentence{
    margin:10px 0 0 0;
}
.sentence strong{
    color:#800000;
}
#txt-list{
    position: absolute;
    width:200px;
    right: 0px;
    top:-7px;
    margin:20px;
    font-size: 12px;
    color:#800000;
}
#txt-list ul{
    margin:0;
    padding:0;
}
#txt-list ul li{
    margin:0;
    padding:0;
    text-indent: -38px;
    list-style: none;
    word-break: break-all;
}
static/images/Grrrrrrrrrrls.svg
(binary image added; size: 8.4 KiB)
templates/base.html
@@ -0,0 +1,30 @@
<!DOCTYPE html>
<html lang='en'>
<head>
    <meta charset="utf-8" />
    <title>Grrrrrrrrrrls {% block title %}{% endblock %}</title>
    <link type="text/css" rel="stylesheet" href="{{ url_for('static', filename='css/stylesheet.css')}}" />
</head>
<body>
    <div id="logo">
        <a href="{{ url_for('index')}}"><img src="{{ url_for('static', filename='images/Grrrrrrrrrrls.svg')}}"></a>
    </div>
    <div id="search">
        <form action="" method="GET">
            <input id="query" name="q" value="{{query}}"/>
            <input id="submit" type="submit" value="➜"/>
        </form>
    </div>
    <div id="txt-list">
        <p>Searching through <em>and calculating words of</em> the following txt documents:</p>

        <ul>
            {% for txt in files %}
            <li>{{txt}}</li>
            {% endfor %}
        </ul>
    </div>
    {% block results %}
    {% endblock %}
</body>
</html>
templates/index.html
@@ -0,0 +1,8 @@
{% extends "base.html" %}

{% block results %}
<div id="intro">
    <p>This Grrrrrrrrrrls search machine cannot handle too much at once: it can only work with <strong>one word</strong>.</p>
</div>

{% endblock %}
templates/results.html
@@ -0,0 +1,22 @@
{% extends "base.html" %}
{% block title %}{{query}}{% endblock %}
{% block results %}
<h1>The results for the query "{{query}}" are:</h1>
<div id="results">
    {% if results == {} %}
    <div>That word is not used in any of the manifestos.</div>
    {% else %}
    {% for _, manifesto in results.items() %}

    <div class="result">
        <h2>{{manifesto.name}}</h2>
        <div class="sentences">
            {% for sentence in manifesto.sentences %}
            <div class="sentence">{{sentence}}</div>
            {% endfor %}
        </div>
    </div>
    {% endfor %}
    {% endif %}
</div>
{% endblock %}
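{# results is a plain dict built by tfidf.request_results(); manifesto.name
   resolves here because Jinja's attribute lookup falls back to item lookup
   (manifesto['name']) when no such attribute exists. #}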
tfidf.py
@@ -0,0 +1,145 @@
import os, json
from math import log

from flask import Markup

from nltk import sent_tokenize
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')  # initialize tokenizer: word characters only, punctuation dropped

import pprint
pp = pprint.PrettyPrinter(indent=4)

def tfidf(query, manifesto, corpus):
    # Term Frequency: share of this manifesto's words that match the query
    tf_count = 0
    for word in manifesto:
        if query == word:
            tf_count += 1
    tf = tf_count/len(manifesto)
    # print('count:', tf_count)
    # print('total:', len(manifesto))
    # print('TF - count/total', tf_count/len(manifesto))

    # Inverse Document Frequency: rarity of the query across the corpus
    idf_count = 0
    for words in corpus:
        if query in words:
            idf_count += 1
    # print('count:', idf_count)
    idf = log(len(corpus)/idf_count)
    # print('documents:', len(corpus))
    # print('documents/count', len(corpus)/idf_count)
    # print('IDF - log(documents/count)', log(len(corpus)/idf_count))

    tfidf_value = tf * idf
    # print('TF-IDF:', tfidf_value)

    return tf_count, idf_count, tfidf_value
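
# A quick sanity check of tfidf() on a toy corpus (hypothetical data, not the
# exhibition texts): 'cat' appears once in a two-word manifesto and in one of
# two documents, so tf = 1/2, idf = log(2/1), and tfidf = 0.5 * log(2):
#
#   >>> tfidf('cat', ['cat', 'dog'], [['cat', 'dog'], ['dog']])
#   (1, 1, 0.34657359027997264)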

def load_text_files():
    files = []
    corpus = []
    sentences = {}
    dir = 'txt'

    for f in sorted(os.listdir(dir)):
        lines = open(dir+'/'+f, "r").read()  # full contents of the .txt file as one string
        words = [word for word in tokenizer.tokenize(lines)]  # tokenized words, without punctuation
        corpus.append(words)  # all words of one manifesto, in reading order
        s = sent_tokenize(lines)
        manifesto = f.replace('.txt', '')
        sentences[manifesto] = s
        files.append(manifesto)  # list of filenames, minus the .txt extension

    print('*txt files loaded*')
    return files, corpus, sentences
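
# Shapes returned by load_text_files(), for a hypothetical file 'a_manifesto.txt':
#   files     -> ['a_manifesto']
#   corpus    -> [['first', 'word', 'of', 'the', 'text', ...]]
#   sentences -> {'a_manifesto': ['First sentence.', 'Second sentence.', ...]}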

def create_index():
    files, corpus, sentences = load_text_files()
    index = {}

    # index = {
    #     'Fem manifesto': {
    #         'words': {
    #             'aap': 39.2,
    #             'beer': 20.456,
    #             'citroen': 3.21
    #         }
    #     }
    # }

    for i, words in enumerate(corpus):
        manifesto = files[i]
        index[manifesto] = {}
        for word in words:
            tf_count, idf_count, tfidf_value = tfidf(word, words, corpus)
            if 'words' not in index[manifesto]:
                index[manifesto]['words'] = {}
            index[manifesto]['words'][word] = tfidf_value

    with open('index.json', 'w+') as out:
        out.write(json.dumps(index, indent=4, sort_keys=True))
    print('*index created*')

def load_index():
    with open('index.json') as f:
        index = json.loads(f.read())
    return index

def request_results(query):
    query = query.strip()
    files, corpus, sentences = load_text_files()
    index = load_index()
    results = {}

    # results = {
    #     0: {
    #         'name': 'Fem_manifesto',
    #         'tfidf': 0.00041,
    #         'sentences': [
    #             'This is a first sentence.',
    #             'This is a second sentence.',
    #             'This is a third sentence.'
    #         ]
    #     }
    # }

    # make a list of manifestos that use the query word
    result_matches = []
    for manifesto, d in index.items():
        for word, value in d['words'].items():
            if query == word:
                result_matches.append([value, manifesto])

    # highest TF-IDF values first
    result_matches.sort(reverse=True)
    for x, result in enumerate(result_matches):
        results[x] = {}
        results[x]['tfidf'] = result[0]
        results[x]['name'] = result[1]

    # pp.pprint(results)

    # make a list of sentences that contain the query word
    # and shape the results object
    for x, manifesto in results.items():
        sents = sentences[manifesto['name']]
        value = manifesto['tfidf'] * 10000
        result_sentences = []
        count = 0
        for s in sents:
            done = False
            for word in tokenizer.tokenize(s):
                if word == query and not done:
                    if count < 3:  # include a max of 3 sentences per manifesto in the results list
                        count += 1
                        sentence = s.replace(query, '<strong style="font-size:{}px;">{}</strong>'.format(value, query))
                        html = Markup(sentence)
                        result_sentences.append(html)
                        done = True
        results[x]['sentences'] = result_sentences

    print('*results returned*')
    return results, index
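
# How the font-size scaling above plays out (numbers hypothetical): a match
# with TF-IDF 0.0021 gives value = 0.0021 * 10000 = 21.0, so the query word is
# wrapped as <strong style="font-size:21.0px;">...</strong>; the higher the
# score, the bigger the highlighted word renders.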