// Source: go-sh-manymanuals/exp/bleve.go (99 lines, 1.8 KiB, Go).
// Snapshot dated 2023-05-11 16:08:29 +02:00.

package main
import (
"fmt"
"log"
"os"
"path/filepath"
"strings"
"github.com/blevesearch/bleve/v2"
pdf "github.com/johbar/go-poppler"
)
// readPDF opens the PDF file at name and returns the text of all of its
// pages, concatenated in page order with nothing inserted between pages.
//
// If the document cannot be opened, the error from pdf.Open is returned
// unchanged together with an empty string.
func readPDF(name string) (string, error) {
	doc, err := pdf.Open(name)
	if err != nil {
		return "", err
	}
	defer doc.Close()

	// Accumulate into a strings.Builder: appending with += would re-copy
	// all text gathered so far for every additional page.
	var txt strings.Builder
	pages := doc.GetNPages()
	for i := 0; i < pages; i++ {
		// NOTE(review): the page handle returned by GetPage is never
		// released here; confirm against the go-poppler docs whether
		// pages need an explicit Close.
		txt.WriteString(doc.GetPage(i).Text())
	}
	return txt.String(), nil
}
// datasheet describes one PDF found while scanning the datasheet directory.
//
// filename is the file's base name, filepath is the path at which the file
// was found during the directory walk, and contents is the text extracted
// from the PDF.
type datasheet struct {
	filename, filepath, contents string
}
// main indexes the text of every PDF under ../datasheets into the on-disk
// bleve index "test.bleve", runs one hard-coded match query against it and
// prints the search results. Any failure terminates the program via
// log.Fatal.
func main() {
	if err := run(); err != nil {
		log.Fatal(err)
	}
}

// run performs the work of main but reports failures as errors instead of
// exiting, so that the deferred index close always runs (log.Fatal would
// skip deferred calls).
func run() (err error) {
	index, err := openOrCreateIndex("test.bleve")
	if err != nil {
		return err
	}
	defer func() {
		// Surface a close failure only if nothing else went wrong first.
		if cerr := index.Close(); cerr != nil && err == nil {
			err = fmt.Errorf("closing index: %w", cerr)
		}
	}()

	// gather all datasheets - filename, filepath, contents
	sheets, err := gatherDatasheets("../datasheets")
	if err != nil {
		return err
	}

	// Index the extracted text of each PDF under its file name. The text
	// was already extracted during the walk, so it is reused here rather
	// than parsing every PDF a second time.
	// NOTE(review): the document ID is the base file name, so two PDFs with
	// the same name in different directories share one ID — confirm whether
	// the full path should be used instead.
	for _, ds := range sheets {
		if err := index.Index(ds.filename, ds.contents); err != nil {
			return fmt.Errorf("indexing %q: %w", ds.filepath, err)
		}
	}

	// query for something! change the string to test other possibilities
	query := bleve.NewMatchQuery("Enhanced-Page-Mode")
	search := bleve.NewSearchRequest(query)
	searchResults, err := index.Search(search)
	if err != nil {
		return fmt.Errorf("searching index: %w", err)
	}

	// print out the results
	fmt.Println(searchResults)
	return nil
}

// openOrCreateIndex creates a new bleve index at path with the default index
// mapping. If an index already exists at that path it is opened instead.
// Any other creation error is returned rather than being masked by the
// fallback to Open.
func openOrCreateIndex(path string) (bleve.Index, error) {
	index, err := bleve.New(path, bleve.NewIndexMapping())
	if err == nil {
		return index, nil
	}
	if err != bleve.ErrorIndexPathExists {
		return nil, fmt.Errorf("creating index %q: %w", path, err)
	}

	index, err = bleve.Open(path)
	if err != nil {
		return nil, fmt.Errorf("opening index %q: %w", path, err)
	}
	return index, nil
}

// gatherDatasheets walks the directory tree rooted at root and returns one
// datasheet per PDF file found, with its text already extracted. Directories
// are never treated as PDFs, and the ".pdf" extension is matched
// case-insensitively. The walk stops at the first error.
func gatherDatasheets(root string) ([]datasheet, error) {
	var sheets []datasheet
	err := filepath.Walk(root, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		name := info.Name()
		if info.IsDir() || !strings.EqualFold(filepath.Ext(name), ".pdf") {
			return nil
		}
		contents, err := readPDF(path)
		if err != nil {
			return fmt.Errorf("reading %q: %w", path, err)
		}
		sheets = append(sheets, datasheet{
			filename: name,
			filepath: path,
			contents: contents,
		})
		return nil
	})
	if err != nil {
		return nil, err
	}
	return sheets, nil
}