reading text from single pdf, step 1

This commit is contained in:
crunk 2022-05-01 21:08:33 +02:00
parent 833e7e88a5
commit 75a126df4b
4 changed files with 37 additions and 0 deletions

BIN
exp/files/74ahc138.pdf Normal file

Binary file not shown.

34
exp/readtxt/readtxt.go Normal file
View File

@ -0,0 +1,34 @@
package main
import (
"bytes"
"fmt"
"github.com/ledongthuc/pdf"
)
// main extracts the plain text of the bundled sample PDF and prints it to
// standard output. Any extraction error aborts the program with a panic.
func main() {
	// Turn on the pdf package's debug output for this experiment.
	pdf.DebugOn = true

	// Read local pdf file.
	text, err := readPdf("../files/74ahc138.pdf")
	if err != nil {
		panic(err)
	}

	fmt.Println(text)
}
// readPdf opens the PDF file at path and returns its plain-text content.
//
// It returns an empty string and a non-nil error if the file cannot be
// opened, if plain-text extraction fails, or if the extracted text cannot be
// read in full.
func readPdf(path string) (string, error) {
	f, r, err := pdf.Open(path)
	if err != nil {
		return "", fmt.Errorf("opening pdf %q: %w", path, err)
	}
	// Register the close only after a successful open, so it is never
	// invoked on a handle that Open failed to produce.
	defer f.Close()

	b, err := r.GetPlainText()
	if err != nil {
		return "", fmt.Errorf("extracting text from %q: %w", path, err)
	}

	var buf bytes.Buffer
	// A failed read must surface as an error rather than returning
	// silently truncated text.
	if _, err := buf.ReadFrom(b); err != nil {
		return "", fmt.Errorf("reading text from %q: %w", path, err)
	}
	return buf.String(), nil
}

1
go.mod
View File

@ -3,6 +3,7 @@ module varia.zone/go-sh-manymanuals
go 1.18 go 1.18
require ( require (
github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 // indirect
github.com/otiai10/gosseract/v2 v2.3.1 // indirect github.com/otiai10/gosseract/v2 v2.3.1 // indirect
gopkg.in/gographics/imagick.v2 v2.6.0 // indirect gopkg.in/gographics/imagick.v2 v2.6.0 // indirect
) )

2
go.sum
View File

@ -1,3 +1,5 @@
github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 h1:6Yzfa6GP0rIo/kULo2bwGEkFvCePZ3qHDDTC3/J9Swo=
github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs=
github.com/otiai10/curr v0.0.0-20150429015615-9b4961190c95/go.mod h1:9qAhocn7zKJG+0mI8eUu6xqkFDYS2kb2saOteoSB3cE= github.com/otiai10/curr v0.0.0-20150429015615-9b4961190c95/go.mod h1:9qAhocn7zKJG+0mI8eUu6xqkFDYS2kb2saOteoSB3cE=
github.com/otiai10/curr v1.0.0/go.mod h1:LskTG5wDwr8Rs+nNQ+1LlxRjAtTZZjtJW4rMXl6j4vs= github.com/otiai10/curr v1.0.0/go.mod h1:LskTG5wDwr8Rs+nNQ+1LlxRjAtTZZjtJW4rMXl6j4vs=
github.com/otiai10/gosseract/v2 v2.3.1 h1:BFy9Rru7dzqEYX7/tJuEvjVPkkJck0f+b5fYzzr6/RM= github.com/otiai10/gosseract/v2 v2.3.1 h1:BFy9Rru7dzqEYX7/tJuEvjVPkkJck0f+b5fYzzr6/RM=