first commit
This commit is contained in:
commit
d56ddde273
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
txt/*
|
||||
__pycache__/
|
33
README.md
Normal file
33
README.md
Normal file
@ -0,0 +1,33 @@
|
||||
# Grrrrrrrrrrls - search machine (prototype)
|
||||
|
||||
A small Flask exercise that combines the TF-IDF algorithm, written in Python, with a web interface.
|
||||
|
||||
Grrrrrrrrrrls is a project in progress for the [Computer Grrrls](https://hmkv.de/programm/programmpunkte/2018/Ausstellungen/2018_GRLS.php) exhibition at the HMKV & La Gaîté Lyrique.
|
||||
|
||||
# Install
|
||||
|
||||
$ pip3 install flask
|
||||
|
||||
$ pip3 install nltk
|
||||
|
||||
# Txt documents
|
||||
|
||||
The search machine uses the index.json file to produce its results. The function 'create_index' can be called to generate this file. It uses a set of plain text files to index each word together with its corresponding TF-IDF value. The plain text files are not included in this repo, as I don't think I can publish them like that.
|
||||
|
||||
If you want to work with another set of documents, make a 'txt/' folder, add a few txt files to it, and remove the index.json file (or rename it if you want to keep it). A new index.json is generated the next time you run start.py.
|
||||
|
||||
# Start
|
||||
|
||||
Start the flask/python local server ...
|
||||
|
||||
$ python3 start.py
|
||||
|
||||
Browse to your localhost on port 5000 ...
|
||||
|
||||
> 127.0.0.1:5000
|
||||
|
||||
## Notes
|
||||
|
||||
This Grrrrrrrrrrls search machine cannot handle too much at once: it can only work with one word.
|
||||
|
||||
This is a prototype :)
|
15309
index.json
Normal file
15309
index.json
Normal file
File diff suppressed because it is too large
Load Diff
40
start.py
Normal file
40
start.py
Normal file
@ -0,0 +1,40 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import os
|
||||
import flask
|
||||
from flask import request
|
||||
import tfidf
|
||||
|
||||
def get_index():
    """Return the index as loaded by ``tfidf.load_index()``."""
    loaded = tfidf.load_index()
    return loaded
|
||||
|
||||
def get_results(query):
    """Run ``tfidf.request_results`` for ``query``.

    Returns the ``(results, index)`` pair produced by that call.
    """
    matches, manifesto_index = tfidf.request_results(query)
    return matches, manifesto_index
|
||||
|
||||
# Create the application.
|
||||
APP = flask.Flask(__name__)
|
||||
|
||||
@APP.route('/', methods=['GET', 'POST'])
def index():
    """Serve the page at '/'.

    Without a non-empty ``q`` query-string value the landing page
    (``index.html``) is rendered. With one, the results for that value are
    rendered through ``results.html``. Both templates receive the keys of the
    index as ``files``.
    """
    query = request.args.get('q', '')

    # Guard clause: nothing to search for, so show the landing page.
    if not query:
        files = list(get_index())
        return flask.render_template('index.html', files=files)

    results, manifesto_index = get_results(query)
    files = list(manifesto_index)
    return flask.render_template('results.html', query=query, results=results, files=files)
|
||||
|
||||
if __name__ == '__main__':
    # Build the index first when the current working directory does not
    # contain an index.json entry yet.
    index_present = 'index.json' in os.listdir('.')
    if not index_present:
        tfidf.create_index()

    APP.debug = True
    APP.run()
|
81
static/css/stylesheet.css
Normal file
81
static/css/stylesheet.css
Normal file
@ -0,0 +1,81 @@
|
||||
body{
|
||||
background-color: rgba(220,220,220,0.6);
|
||||
margin:20px;
|
||||
font-family: sans-serif;
|
||||
font-size: 14px;
|
||||
}
|
||||
h1, h2, h3{
|
||||
font-size: 100%;
|
||||
margin:30px 0 0 0;
|
||||
}
|
||||
h2{
|
||||
font-size: 12px;
|
||||
font-weight: normal;
|
||||
border-bottom:1px solid;
|
||||
}
|
||||
#logo, #search{
|
||||
display: inline-block;
|
||||
}
|
||||
#logo{
|
||||
margin:15px 0;
|
||||
}
|
||||
#search{
|
||||
position: relative;
|
||||
width: 300px;
|
||||
margin:0 0 0 20px;
|
||||
top:-3px;
|
||||
}
|
||||
#search input#query{
|
||||
width: 100%;
|
||||
height: 42px;
|
||||
padding:0px 10px;
|
||||
border:1px solid rgba(190,190,190,1);
|
||||
vertical-align: baseline;
|
||||
}
|
||||
#search #submit{
|
||||
position: absolute;
|
||||
width: 26px;
|
||||
height: 26px;
|
||||
right: -12px;
|
||||
top:9px;
|
||||
border:0;
|
||||
border-radius: 100%;
|
||||
background-color:transparent;
|
||||
text-align: center;
|
||||
}
|
||||
#search #submit:hover{
|
||||
cursor: pointer;
|
||||
}
|
||||
#results, #intro{
|
||||
width:calc(100% - 371px);
|
||||
margin:10px 0 0 0;
|
||||
}
|
||||
.result{
|
||||
margin:10px 0 0 0;
|
||||
}
|
||||
.sentence{
|
||||
margin:10px 0 0 0;
|
||||
}
|
||||
.sentence strong{
|
||||
color:#800000;
|
||||
}
|
||||
#txt-list{
|
||||
position: absolute;
|
||||
width:200px;
|
||||
right: 0px;
|
||||
top:-7px;
|
||||
margin:20px;
|
||||
font-size: 12px;
|
||||
color:#800000;
|
||||
}
|
||||
#txt-list ul{
|
||||
margin:0;
|
||||
padding:0;
|
||||
}
|
||||
#txt-list ul li{
|
||||
margin:0;
|
||||
padding:0;
|
||||
text-indent: -38px;
|
||||
list-style: none;
|
||||
word-break: break-all;
|
||||
}
|
134
static/images/Grrrrrrrrrrls.svg
Normal file
134
static/images/Grrrrrrrrrrls.svg
Normal file
@ -0,0 +1,134 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!-- Created with Inkscape (http://www.inkscape.org/) -->
|
||||
|
||||
<svg
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:cc="http://creativecommons.org/ns#"
|
||||
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
||||
xmlns:svg="http://www.w3.org/2000/svg"
|
||||
xmlns="http://www.w3.org/2000/svg"
|
||||
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
|
||||
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
|
||||
width="123"
|
||||
height="20"
|
||||
viewBox="0 0 32.543769 5.291667"
|
||||
version="1.1"
|
||||
id="svg6930"
|
||||
inkscape:version="0.92.1 r15371"
|
||||
sodipodi:docname="Grrrrrrrrrrls.svg">
|
||||
<defs
|
||||
id="defs6924" />
|
||||
<sodipodi:namedview
|
||||
id="base"
|
||||
pagecolor="#ffffff"
|
||||
bordercolor="#666666"
|
||||
borderopacity="1.0"
|
||||
inkscape:pageopacity="0.0"
|
||||
inkscape:pageshadow="2"
|
||||
inkscape:zoom="1.4"
|
||||
inkscape:cx="119.50258"
|
||||
inkscape:cy="137.20382"
|
||||
inkscape:document-units="mm"
|
||||
inkscape:current-layer="flowRoot6291"
|
||||
showgrid="false"
|
||||
units="px"
|
||||
inkscape:window-width="1280"
|
||||
inkscape:window-height="701"
|
||||
inkscape:window-x="0"
|
||||
inkscape:window-y="1052"
|
||||
inkscape:window-maximized="1" />
|
||||
<metadata
|
||||
id="metadata6927">
|
||||
<rdf:RDF>
|
||||
<cc:Work
|
||||
rdf:about="">
|
||||
<dc:format>image/svg+xml</dc:format>
|
||||
<dc:type
|
||||
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
||||
<dc:title></dc:title>
|
||||
</cc:Work>
|
||||
</rdf:RDF>
|
||||
</metadata>
|
||||
<g
|
||||
inkscape:label="Layer 1"
|
||||
inkscape:groupmode="layer"
|
||||
id="layer1"
|
||||
transform="translate(0,-291.70832)">
|
||||
<g
|
||||
aria-label="Grrrrrrrrrrls"
|
||||
transform="matrix(0.21160622,0,0,0.21160622,-5.1292084,281.28732)"
|
||||
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;text-align:end;letter-spacing:0px;word-spacing:0px;text-anchor:end;fill:#000000;fill-opacity:1;stroke:none;stroke-width:4.65293169;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
|
||||
id="flowRoot6291">
|
||||
<g
|
||||
id="g105"
|
||||
transform="matrix(0.72343932,0,0,0.72343932,2.5711799,25.676847)"
|
||||
style="stroke-width:6.43168211">
|
||||
<path
|
||||
id="path65"
|
||||
style="fill:#800000;stroke-width:6.43168211"
|
||||
d="M 57.352636,47.991785 H 46.244337 v 4.755265 h 6.314992 c -0.152169,1.559727 -0.53259,2.51078 -1.369517,3.499876 -1.369516,1.673853 -3.461833,2.662948 -5.668276,2.662948 -4.603096,0 -7.912761,-4.032465 -7.912761,-9.738783 0,-5.972613 2.929243,-9.586615 7.760593,-9.586615 1.97819,0 3.652043,0.570632 4.907433,1.673854 0.798885,0.684758 1.217348,1.331474 1.711896,2.777074 h 5.363939 c -0.684758,-5.706318 -5.325897,-9.320319 -12.02131,-9.320319 -7.988846,0 -13.428869,5.896528 -13.428869,14.53209 0,8.407309 5.478065,14.532091 12.972363,14.532091 3.728128,0 6.238908,-1.331475 8.369267,-4.450929 l 0.684758,3.652044 h 3.423791 z"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
id="path67"
|
||||
style="fill:#ff0000;stroke-width:6.43168211"
|
||||
d="m 62.273147,42.361551 v 20.542746 h 5.325897 V 51.986208 c 0,-3.119454 1.559727,-4.679181 4.679181,-4.679181 0.570632,0 0.951053,0.03804 1.673854,0.152168 v -5.401981 c -0.304337,-0.03804 -0.494548,-0.03804 -0.646717,-0.03804 -2.434695,0 -4.565054,1.597769 -5.706318,4.374844 v -4.032465 z"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
id="path69"
|
||||
style="fill:#808000;stroke-width:6.43168211"
|
||||
d="m 77.059049,42.361551 v 20.542746 h 5.325897 V 51.986208 c 0,-3.119454 1.559727,-4.679181 4.679181,-4.679181 0.570632,0 0.951053,0.03804 1.673854,0.152168 v -5.401981 c -0.304337,-0.03804 -0.494548,-0.03804 -0.646716,-0.03804 -2.434696,0 -4.565055,1.597769 -5.706319,4.374844 v -4.032465 z"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
id="path71"
|
||||
style="fill:#ffff00;stroke-width:6.43168211"
|
||||
d="M 91.844953,42.361551 V 62.904297 H 97.17085 V 51.986208 c 0,-3.119454 1.559727,-4.679181 4.67918,-4.679181 0.57063,0 0.95105,0.03804 1.67385,0.152168 v -5.401981 c -0.30433,-0.03804 -0.49454,-0.03804 -0.64671,-0.03804 -2.4347,0 -4.565056,1.597769 -5.70632,4.374844 v -4.032465 z"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
id="path73"
|
||||
style="fill:#008000;stroke-width:6.43168211"
|
||||
d="m 106.63085,42.361551 v 20.542746 h 5.3259 V 51.986208 c 0,-3.119454 1.55973,-4.679181 4.67918,-4.679181 0.57063,0 0.95105,0.03804 1.67385,0.152168 v -5.401981 c -0.30433,-0.03804 -0.49454,-0.03804 -0.64671,-0.03804 -2.4347,0 -4.56506,1.597769 -5.70632,4.374844 v -4.032465 z"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
id="path75"
|
||||
style="fill:#00ff00;stroke-width:6.43168211"
|
||||
d="m 121.41676,42.361551 v 20.542746 h 5.32589 V 51.986208 c 0,-3.119454 1.55973,-4.679181 4.67919,-4.679181 0.57063,0 0.95105,0.03804 1.67385,0.152168 v -5.401981 c -0.30434,-0.03804 -0.49455,-0.03804 -0.64672,-0.03804 -2.43469,0 -4.56505,1.597769 -5.70632,4.374844 v -4.032465 z"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
id="path77"
|
||||
style="fill:#008080;stroke-width:6.43168211"
|
||||
d="m 136.20266,42.361551 v 20.542746 h 5.3259 V 51.986208 c 0,-3.119454 1.55973,-4.679181 4.67918,-4.679181 0.57063,0 0.95105,0.03804 1.67385,0.152168 v -5.401981 c -0.30433,-0.03804 -0.49455,-0.03804 -0.64671,-0.03804 -2.4347,0 -4.56506,1.597769 -5.70632,4.374844 v -4.032465 z"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
id="path79"
|
||||
style="fill:#00ffff;stroke-width:6.43168211"
|
||||
d="m 150.98856,42.361551 v 20.542746 h 5.3259 V 51.986208 c 0,-3.119454 1.55973,-4.679181 4.67918,-4.679181 0.57063,0 0.95106,0.03804 1.67386,0.152168 v -5.401981 c -0.30434,-0.03804 -0.49455,-0.03804 -0.64672,-0.03804 -2.4347,0 -4.56505,1.597769 -5.70632,4.374844 v -4.032465 z"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
id="path81"
|
||||
style="fill:#000080;stroke-width:6.43168211"
|
||||
d="m 165.77446,42.361551 v 20.542746 h 5.3259 V 51.986208 c 0,-3.119454 1.55973,-4.679181 4.67918,-4.679181 0.57063,0 0.95105,0.03804 1.67385,0.152168 v -5.401981 c -0.30433,-0.03804 -0.49454,-0.03804 -0.64671,-0.03804 -2.4347,0 -4.56506,1.597769 -5.70632,4.374844 v -4.032465 z"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
id="path83"
|
||||
style="fill:#0000ff;stroke-width:6.43168211"
|
||||
d="m 180.56037,42.361551 v 20.542746 h 5.32589 V 51.986208 c 0,-3.119454 1.55973,-4.679181 4.67918,-4.679181 0.57064,0 0.95106,0.03804 1.67386,0.152168 v -5.401981 c -0.30434,-0.03804 -0.49455,-0.03804 -0.64672,-0.03804 -2.4347,0 -4.56505,1.597769 -5.70632,4.374844 v -4.032465 z"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
id="path85"
|
||||
style="fill:#800080;stroke-width:6.43168211"
|
||||
d="m 195.34627,42.361551 v 20.542746 h 5.3259 V 51.986208 c 0,-3.119454 1.55972,-4.679181 4.67918,-4.679181 0.57063,0 0.95105,0.03804 1.67385,0.152168 v -5.401981 c -0.30434,-0.03804 -0.49455,-0.03804 -0.64672,-0.03804 -2.43469,0 -4.56505,1.597769 -5.70631,4.374844 v -4.032465 z"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
id="path87"
|
||||
style="fill:#ff00ff;stroke-width:6.43168211"
|
||||
d="m 215.61024,35.17159 h -5.3259 v 27.732707 h 5.3259 z"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
id="path89"
|
||||
style="fill:#2b0000;stroke-width:6.43168211"
|
||||
d="m 237.49754,48.98088 c -0.0761,-4.336802 -3.42379,-6.961708 -8.9399,-6.961708 -5.21177,0 -8.44535,2.624906 -8.44535,6.847582 0,1.369516 0.41847,2.548822 1.14127,3.347707 0.7228,0.7228 1.36951,1.065179 3.3477,1.711895 l 6.35304,1.97819 c 1.33147,0.418464 1.78798,0.836927 1.78798,1.673854 0,1.25539 -1.48365,2.016232 -3.95638,2.016232 -1.36952,0 -2.47274,-0.266295 -3.1575,-0.7228 -0.57063,-0.418463 -0.79888,-0.836927 -1.02714,-1.940148 h -5.21177 c 0.15217,4.48897 3.46183,6.847582 9.70074,6.847582 2.85316,0 5.02156,-0.608674 6.54325,-1.826022 1.52168,-1.217348 2.43469,-3.119454 2.43469,-5.135687 0,-2.662948 -1.33147,-4.374844 -4.03246,-5.173728 l -6.73346,-1.940148 c -1.48364,-0.456506 -1.86406,-0.760843 -1.86406,-1.59777 0,-1.141263 1.21735,-1.902106 3.08141,-1.902106 2.54882,0 3.80421,0.913011 3.84226,2.777075 z"
|
||||
inkscape:connector-curvature="0" />
|
||||
</g>
|
||||
</g>
|
||||
</g>
|
||||
</svg>
|
After Width: | Height: | Size: 8.4 KiB |
30
templates/base.html
Normal file
30
templates/base.html
Normal file
@ -0,0 +1,30 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang='en'>
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<title>Grrrrrrrrrrls {% block title %}{% endblock %}</title>
|
||||
<link type="text/css" rel="stylesheet" href="{{ url_for('static', filename='css/stylesheet.css')}}" />
|
||||
</head>
|
||||
<body>
|
||||
<div id="logo">
|
||||
<a href="{{ url_for('index')}}"><img src="{{ url_for('static', filename='images/Grrrrrrrrrrls.svg')}}"></a>
|
||||
</div>
|
||||
<div id="search">
|
||||
<form action="" method="GET">
|
||||
<input id="query" name="q" value="{{query}}"/>
|
||||
<input id="submit" type="submit" value="➜"/>
|
||||
</form>
|
||||
</div>
|
||||
<div id="txt-list">
|
||||
<p>Searching through <em>and calculating words of</em> the following txt documents:</p>
|
||||
|
||||
<ul>
|
||||
{% for txt in files %}
|
||||
<li>{{txt}}</li>
|
||||
{% endfor %}
|
||||
</ul>
|
||||
</div>
|
||||
{% block results %}
|
||||
{% endblock %}
|
||||
</body>
|
||||
</html>
|
8
templates/index.html
Normal file
8
templates/index.html
Normal file
@ -0,0 +1,8 @@
|
||||
{% extends "base.html" %}
|
||||
|
||||
{% block results %}
|
||||
<div id="intro">
|
||||
<p>This Grrrrrrrrrrls search machine cannot handle too much at once: it can only work with <strong>one word</strong>.</p>
|
||||
</div>
|
||||
|
||||
{% endblock %}
|
22
templates/results.html
Normal file
22
templates/results.html
Normal file
@ -0,0 +1,22 @@
|
||||
{% extends "base.html" %}
|
||||
{% block title %}{{query}}{% endblock %}
|
||||
{% block results %}
|
||||
<h1>The results for the query "{{query}}" are:</h1>
|
||||
<div id="results">
|
||||
{% if results == {} %}
|
||||
<div>That word is not used in any of the manifestos.</div>
|
||||
{% else %}
|
||||
{% for _, manifesto in results.items() %}
|
||||
|
||||
<div class="result">
|
||||
<h2>{{manifesto.name}}</h2>
|
||||
<div class="sentences">
|
||||
{% for sentence in manifesto.sentences %}
|
||||
<div class="sentence">{{sentence}}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
{% endfor %}
|
||||
{% endif %}
|
||||
</div>
|
||||
{% endblock %}
|
145
tfidf.py
Normal file
145
tfidf.py
Normal file
@ -0,0 +1,145 @@
|
||||
import os, json
|
||||
from math import log, exp
|
||||
from flask import Markup
|
||||
|
||||
from nltk import sent_tokenize
|
||||
from nltk.tokenize import RegexpTokenizer
|
||||
tokenizer = RegexpTokenizer(r'\w+') # initialize tokenizer
|
||||
|
||||
import pprint
|
||||
pp = pprint.PrettyPrinter(indent=4)
|
||||
|
||||
def tfidf(query, manifesto, corpus):
    """Compute the TF-IDF value of ``query`` for one document of a corpus.

    Args:
        query: The word to score.
        manifesto: Sequence of the words of the document being scored.
        corpus: Sequence of documents, each a collection of words.

    Returns:
        A ``(tf_count, idf_count, tfidf_value)`` tuple:

        * ``tf_count`` -- number of times ``query`` occurs in ``manifesto``;
        * ``idf_count`` -- number of documents in ``corpus`` containing
          ``query``;
        * ``tfidf_value`` -- ``(tf_count / len(manifesto)) *
          log(len(corpus) / idf_count)``.

        The term-frequency factor is taken as ``0.0`` for an empty
        ``manifesto`` and the inverse-document-frequency factor as ``0.0``
        when no document contains ``query``, so these cases score ``0.0``
        instead of raising ``ZeroDivisionError``.
    """
    # Term Frequency: share of the document's words that equal the query.
    tf_count = sum(1 for word in manifesto if word == query)
    total_words = len(manifesto)
    tf = tf_count / total_words if total_words else 0.0

    # Inverse Document Frequency: log of (documents / documents with query).
    idf_count = sum(1 for words in corpus if query in words)
    idf = log(len(corpus) / idf_count) if idf_count else 0.0

    tfidf_value = tf * idf
    return tf_count, idf_count, tfidf_value
|
||||
|
||||
def load_text_files():
    """Read and tokenize every file found in the ``txt`` directory.

    Files are processed in sorted file-name order.

    Returns:
        A ``(files, corpus, sentences)`` tuple:

        * ``files`` -- list of document names (the file name with ``.txt``
          removed);
        * ``corpus`` -- list, parallel to ``files``, holding each document's
          words in reading order, without punctuation;
        * ``sentences`` -- dict mapping each document name to the list of
          sentences returned by ``sent_tokenize``.
    """
    files = []
    corpus = []
    sentences = {}
    txt_dir = 'txt'  # not called `dir`, which would shadow the builtin

    for filename in sorted(os.listdir(txt_dir)):
        # The context manager closes the handle as soon as the text is read.
        with open(os.path.join(txt_dir, filename), "r") as handle:
            text = handle.read()

        # All words of one manifesto, in reading order, without punctuation.
        corpus.append(tokenizer.tokenize(text))

        manifesto = filename.replace('.txt', '')
        sentences[manifesto] = sent_tokenize(text)
        files.append(manifesto)

    print('*txt files loaded*')
    return files, corpus, sentences
|
||||
|
||||
def create_index():
    """Compute the TF-IDF value of every word and write ``index.json``.

    The documents come from ``load_text_files()``. The file written to the
    current working directory has this shape:

        {
            "Fem manifesto": {
                "words": {
                    "aap": 39.2,
                    "beer": 20.456,
                    "citroen": 3.21
                }
            }
        }

    Every document gets a ``"words"`` entry, also when it contains no words,
    so readers of the index can rely on the key being present.
    """
    files, corpus, _ = load_text_files()
    index = {}

    for manifesto, words in zip(files, corpus):
        scores = {}
        # Each occurrence of a word yields the same value, so scoring the
        # distinct words once gives the same index with far fewer passes.
        for word in set(words):
            _, _, tfidf_value = tfidf(word, words, corpus)
            scores[word] = tfidf_value
        index[manifesto] = {'words': scores}

    # sort_keys keeps the written file stable regardless of set ordering.
    with open('index.json', 'w') as out:
        out.write(json.dumps(index, indent=4, sort_keys=True))
    print('*index created*')
|
||||
|
||||
def load_index():
    """Return the parsed contents of ``index.json``.

    The file is read from the current working directory; the handle is
    closed before returning.
    """
    with open('index.json') as handle:
        return json.load(handle)
|
||||
|
||||
def request_results(query):
    """Find the manifestos that use ``query`` and pick example sentences.

    Args:
        query: The word to search for. Surrounding whitespace is stripped;
            the word is then compared for exact equality with the indexed
            words.

    Returns:
        A ``(results, index)`` tuple. ``index`` is the full index returned by
        ``load_index()``. ``results`` maps a rank (0 = highest TF-IDF value)
        to one entry per matching manifesto:

            {
                0: {
                    'tfidf': 0.00041,
                    'name': 'Fem_manifesto',
                    'sentences': [Markup(...), Markup(...), Markup(...)]
                }
            }

        ``sentences`` holds at most three sentences that contain the query
        as a token. Each occurrence of the query text is wrapped in a
        ``<strong>`` tag whose font size is the TF-IDF value times 10000.
        ``results`` is empty when no manifesto uses the word.
    """
    max_sentences = 3  # sentences shown per manifesto

    query = query.strip()
    _, _, sentences = load_text_files()
    index = load_index()

    # Manifestos that use the query word, highest TF-IDF value first.
    # A dict lookup replaces a scan over every indexed word.
    result_matches = [
        (entry['words'][query], manifesto)
        for manifesto, entry in index.items()
        if query in entry.get('words', {})
    ]
    result_matches.sort(reverse=True)

    results = {}
    for rank, (value, name) in enumerate(result_matches):
        # Markup.format escapes its arguments before inserting them.
        highlight = Markup('<strong style="font-size:{}px;">{}</strong>').format(
            value * 10000, query)

        result_sentences = []
        for sentence in sentences[name]:
            if query not in tokenizer.tokenize(sentence):
                continue
            # Escape the document text piece by piece and rejoin it with the
            # highlight, so characters such as '<' or '&' in a txt file are
            # displayed literally instead of being interpreted as HTML.
            pieces = (Markup.escape(part) for part in sentence.split(query))
            result_sentences.append(highlight.join(pieces))
            # The cap counts sentences, not word occurrences.
            if len(result_sentences) == max_sentences:
                break

        results[rank] = {
            'tfidf': value,
            'name': name,
            'sentences': result_sentences,
        }

    print('*results returned*')
    return results, index
|
Loading…
Reference in New Issue
Block a user