mb@mb
6 years ago
commit
d56ddde273
10 changed files with 15804 additions and 0 deletions
@ -0,0 +1,2 @@ |
|||
txt/* |
|||
__pycache__/ |
@ -0,0 +1,33 @@ |
|||
# Grrrrrrrrrrls - search machine (prototype) |
|||
|
|||
A small Flask exercise, combining the TFIDF algorithm written in Python with a web interface.
|||
|
|||
Grrrrrrrrrrls is a project in progress for the [Computer Grrrls](https://hmkv.de/programm/programmpunkte/2018/Ausstellungen/2018_GRLS.php) exhibition at the HMKV & La Gaîté Lyrique. |
|||
|
|||
# Install |
|||
|
|||
$ pip3 install flask |
|||
|
|||
$ pip3 install nltk |
|||
|
|||
# Txt documents |
|||
|
|||
The search machine uses the index.json file to process results. The function 'create_index' can be called to generate this file. It uses a set of plain text files to index each word and its corresponding TFIDF value. The plain text files are not included in this repo, as I don't think I can publish them in that form.
|||
|
|||
If you want to work with another set of documents, make a 'txt/' folder, add a few txt files in it, and remove the index.json file (or rename it if you want to keep it with you). |
|||
|
|||
# Start |
|||
|
|||
Start the flask/python local server ... |
|||
|
|||
$ python3 start.py |
|||
|
|||
Browse to your localhost on port 5000 ... |
|||
|
|||
> 127.0.0.1:5000 |
|||
|
|||
## Notes |
|||
|
|||
This Grrrrrrrrrrls search machine cannot handle too much at once: it can only work with one word. |
|||
|
|||
This is a prototype :) |
File diff suppressed because it is too large
@ -0,0 +1,40 @@ |
|||
#!/usr/bin/env python3 |
|||
|
|||
import os |
|||
import flask |
|||
from flask import request |
|||
import tfidf |
|||
|
|||
def get_index():
    """Load and return the word/TF-IDF index from index.json."""
    return tfidf.load_index()
|||
|
|||
def get_results(query):
    """Look up *query* in the index and return (results, index)."""
    return tfidf.request_results(query)
|||
|
|||
# Create the Flask application instance used by the route decorator below.
APP = flask.Flask(__name__)
|||
|
|||
@APP.route('/', methods=['GET', 'POST'])
def index():
    """Display the index page accessible at '/'.

    With a non-empty ``?q=`` parameter, render the results page for
    that query; otherwise render the plain landing page.  Both variants
    pass the list of indexed text-file names for the sidebar.
    """
    # Read the query once instead of calling request.args.get twice.
    query = request.args.get('q', '')

    if query:
        results, index = get_results(query)
        # Only the manifesto names (the dict keys) are shown in the template.
        files = list(index)
        return flask.render_template('results.html', query=query,
                                     results=results, files=files)
    else:
        index = get_index()
        files = list(index)
        return flask.render_template('index.html', files=files)
|||
|
|||
if __name__ == '__main__':
    # Build the index on first run; later runs reuse the cached file.
    # os.path.exists is clearer (and cheaper) than scanning os.listdir('.').
    if not os.path.exists('index.json'):
        tfidf.create_index()
    APP.debug = True  # development server only; disable in production
    APP.run()
@ -0,0 +1,81 @@ |
|||
body{ |
|||
background-color: rgba(220,220,220,0.6); |
|||
margin:20px; |
|||
font-family: sans-serif; |
|||
font-size: 14px; |
|||
} |
|||
h1, h2, h3{ |
|||
font-size: 100%; |
|||
margin:30px 0 0 0; |
|||
} |
|||
h2{ |
|||
font-size: 12px; |
|||
font-weight: normal; |
|||
border-bottom:1px solid; |
|||
} |
|||
#logo, #search{ |
|||
display: inline-block; |
|||
} |
|||
#logo{ |
|||
margin:15px 0; |
|||
} |
|||
#search{ |
|||
position: relative; |
|||
width: 300px; |
|||
margin:0 0 0 20px; |
|||
top:-3px; |
|||
} |
|||
#search input#query{ |
|||
width: 100%; |
|||
height: 42px; |
|||
padding:0px 10px; |
|||
border:1px solid rgba(190,190,190,1); |
|||
vertical-align: baseline; |
|||
} |
|||
#search #submit{ |
|||
position: absolute; |
|||
width: 26px; |
|||
height: 26px; |
|||
right: -12px; |
|||
top:9px; |
|||
border:0; |
|||
border-radius: 100%; |
|||
background-color:transparent; |
|||
text-align: center; |
|||
} |
|||
#search #submit:hover{ |
|||
cursor: pointer; |
|||
} |
|||
#results, #intro{ |
|||
width:calc(100% - 371px); |
|||
margin:10px 0 0 0; |
|||
} |
|||
.result{ |
|||
margin:10px 0 0 0; |
|||
} |
|||
.sentence{ |
|||
margin:10px 0 0 0; |
|||
} |
|||
.sentence strong{ |
|||
color:#800000; |
|||
} |
|||
#txt-list{ |
|||
position: absolute; |
|||
width:200px; |
|||
right: 0px; |
|||
top:-7px; |
|||
margin:20px; |
|||
font-size: 12px; |
|||
color:#800000; |
|||
} |
|||
#txt-list ul{ |
|||
margin:0; |
|||
padding:0; |
|||
} |
|||
#txt-list ul li{ |
|||
margin:0; |
|||
padding:0; |
|||
text-indent: -38px; |
|||
list-style: none; |
|||
word-break: break-all; |
|||
} |
After Width: | Height: | Size: 8.4 KiB |
@ -0,0 +1,30 @@ |
|||
<!DOCTYPE html>
<html lang='en'>
	<head>
		<meta charset="utf-8" />
		<title>Grrrrrrrrrrls {% block title %}{% endblock %}</title>
		<link type="text/css" rel="stylesheet" href="{{ url_for('static', filename='css/stylesheet.css')}}" />
	</head>
	<body>
		<div id="logo">
			<a href="{{ url_for('index')}}"><img src="{{ url_for('static', filename='images/Grrrrrrrrrrls.svg')}}"></a>
		</div>
		<div id="search">
			<form action="" method="GET">
				<input id="query" name="q" value="{{query}}"/>
				<input id="submit" type="submit" value="➜"/>
			</form>
		</div>
		<div id="txt-list">
			<p>Searching through <em>and calculating words of</em> the following txt documents:</p>

			<ul>
				{% for txt in files %}
				<li>{{txt}}</li>
				{% endfor %}
			</ul> {# fixed: the closing tag was written as a second opening <ul> #}
		</div>
		{% block results %}
		{% endblock %}
	</body>
</html>
@ -0,0 +1,8 @@ |
|||
{# Landing page: shows a short usage note below the shared header from base.html. #}
{% extends "base.html" %}

{% block results %}
<div id="intro">
	<p>This Grrrrrrrrrrls search machine cannot handle too much at once: it can only work with <strong>one word</strong>.</p>
</div>

{% endblock %}
@ -0,0 +1,22 @@ |
|||
{# Results page: lists the manifestos containing the query word, best TF-IDF first. #}
{% extends "base.html" %}
{% block title %}{{query}}{% endblock %}
{% block results %}
<h1>The results for the query "{{query}}" are:</h1>
<div id="results">
	{% if results == {} %}
	<div>That word is not used in any of the manifesto's.</div>
	{% else %}
	{# results maps rank -> {name, tfidf, sentences}; built by tfidf.request_results #}
	{% for _, manifesto in results.items() %}

	<div class="result">
		<h2>{{manifesto.name}}</h2>
		<div class="sentences">
			{% for sentence in manifesto.sentences %}
			<div class="sentence">{{sentence}}</div>
			{% endfor %}
		</div>
	</div>
	{% endfor %}
	{% endif %}
</div>
{% endblock %}
@ -0,0 +1,145 @@ |
|||
import os, json
from math import log, exp
# NOTE(review): flask.Markup was removed in Flask 2.3 — newer installs need
# `from markupsafe import Markup` instead; confirm the deployed Flask version.
from flask import Markup

from nltk import sent_tokenize
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')  # word tokenizer: keeps \w+ runs, drops punctuation

import pprint
pp = pprint.PrettyPrinter(indent=4)  # used by the commented-out debugging pp.pprint calls
|||
|
|||
def tfidf(query, manifesto, corpus):
    """Compute the TF-IDF value of *query* for one document.

    Args:
        query: the word to score.
        manifesto: list of words of the document being scored.
        corpus: list of word-lists, one per document in the collection.

    Returns:
        (tf_count, idf_count, tfidf_value) where tf_count is the number
        of occurrences of query in manifesto, idf_count the number of
        documents containing query, and tfidf_value = TF * IDF.
    """
    # Term Frequency: occurrences of the word / length of the document.
    tf_count = manifesto.count(query)
    tf = tf_count / len(manifesto)

    # Inverse Document Frequency: log(total documents / documents with the word).
    idf_count = sum(1 for words in corpus if query in words)
    if idf_count == 0:
        # The word appears in no document at all: score it 0 instead of
        # raising ZeroDivisionError on the division below.
        return 0, 0, 0.0
    idf = log(len(corpus) / idf_count)

    return tf_count, idf_count, tf * idf
|||
|
|||
def load_text_files():
    """Read every file in the 'txt/' folder and tokenize it.

    Returns:
        files: sorted list of file names without the '.txt' extension.
        corpus: list of word-lists (tokenized, punctuation stripped), one
            per file, in the same order as *files*.
        sentences: dict mapping each file name to its list of sentences.
    """
    files = []
    corpus = []
    sentences = {}
    folder = 'txt'  # renamed: the original shadowed the builtin `dir`

    for filename in sorted(os.listdir(folder)):
        # `with` closes each file promptly instead of leaking the handle.
        with open(os.path.join(folder, filename), "r") as handle:
            text = handle.read()
        corpus.append(tokenizer.tokenize(text))  # words in reading order
        name = filename.replace('.txt', '')
        sentences[name] = sent_tokenize(text)
        files.append(name)

    print('*txt files loaded*')
    return files, corpus, sentences
|||
|
|||
def create_index():
    """Build index.json mapping each manifesto to its words' TF-IDF values.

    Output structure:
        { "<manifesto>": { "words": { "<word>": <tfidf value>, ... } } }
    """
    files, corpus, sentences = load_text_files()
    index = {}

    for i, words in enumerate(corpus):
        manifesto = files[i]
        index[manifesto] = {'words': {}}
        # Score each distinct word once; the original recomputed the full
        # TF-IDF for every *occurrence*, only to overwrite the same key.
        for word in set(words):
            _, _, tfidf_value = tfidf(word, words, corpus)
            index[manifesto]['words'][word] = tfidf_value

    # `with` closes the file; the explicit close() the original added
    # inside the with-block was redundant.
    with open('index.json', 'w+') as out:
        out.write(json.dumps(index, indent=4, sort_keys=True))
    print('*index created*')
|||
|
|||
def load_index():
    """Load and return the search index from index.json."""
    # `with` closes the handle; the original left the file open.
    with open('index.json') as f:
        return json.load(f)
|||
|
|||
def request_results(query):
    """Return (results, index) for a one-word *query*.

    results maps a rank (0 = best match) to a dict with the manifesto
    'name', its 'tfidf' value for the query, and up to three 'sentences'
    containing the word, each with the match wrapped in a <strong> tag
    whose font-size is scaled by the TF-IDF value.
    """
    query = query.strip()
    files, corpus, sentences = load_text_files()
    # `with` closes the index file; the original leaked the handle.
    with open('index.json') as f:
        index = json.load(f)
    results = {}

    # Collect [tfidf value, manifesto] pairs for manifestos using the word.
    result_matches = []
    for manifesto, data in index.items():
        for word, value in data['words'].items():
            if query == word:
                result_matches.append([value, manifesto])

    # Rank by descending TF-IDF value.
    result_matches.sort(reverse=True)
    for rank, (value, name) in enumerate(result_matches):
        results[rank] = {'tfidf': value, 'name': name}

    # Attach up to three sentences per manifesto that contain the word.
    for rank, manifesto in results.items():
        sents = sentences[manifesto['name']]
        value = manifesto['tfidf'] * 10000  # scale TF-IDF into a px font size
        result_sentences = []
        for s in sents:
            # Cap at 3 *sentences* per manifesto.  The original counted word
            # occurrences (so a sentence repeating the word ate the budget)
            # and compared strings with `is not`, which relies on CPython
            # interning and raises a SyntaxWarning on modern Pythons.
            if len(result_sentences) >= 3:
                break
            if query in tokenizer.tokenize(s):
                highlighted = s.replace(
                    query,
                    '<strong style="font-size:{}px;">{}</strong>'.format(value, query))
                result_sentences.append(Markup(highlighted))
        results[rank]['sentences'] = result_sentences

    print('*results returned*')
    return results, index
Loading…
Reference in new issue