Browse Source

reading text from single pdf, step 1

main
crunk 2 years ago
parent
commit
75a126df4b
  1. BIN
      exp/files/74ahc138.pdf
  2. 34
      exp/readtxt/readtxt.go
  3. 1
      go.mod
  4. 2
      go.sum

BIN
exp/files/74ahc138.pdf

Binary file not shown.

34
exp/readtxt/readtxt.go

@ -0,0 +1,34 @@
package main
import (
"bytes"
"fmt"
"github.com/ledongthuc/pdf"
)
// main extracts the plain text of the bundled 74ahc138.pdf datasheet and
// prints it to standard output. Any extraction failure aborts the program
// with a panic.
func main() {
	// Switch on the pdf package's DebugOn flag before any file is opened.
	pdf.DebugOn = true

	// The path is relative to the working directory the program is run from.
	text, err := readPdf("../files/74ahc138.pdf")
	if err != nil {
		panic(err)
	}

	fmt.Println(text)
}
// readPdf opens the PDF file at path and returns its content as plain text.
//
// It returns an empty string and a non-nil error when the file cannot be
// opened, when plain-text extraction fails, or when the extracted text cannot
// be read in full. Returned errors wrap the underlying cause, so errors.Is
// and errors.As keep working for callers.
func readPdf(path string) (string, error) {
	f, r, err := pdf.Open(path)
	if err != nil {
		// NOTE(review): Open may hand back a file handle together with an
		// error (presumably when the file opens but is not a valid PDF);
		// close it in that case so the descriptor is not leaked.
		if f != nil {
			f.Close()
		}
		return "", fmt.Errorf("opening pdf %q: %w", path, err)
	}
	// Register the close only after Open has succeeded, so the deferred call
	// never runs on a nil handle.
	defer f.Close()

	b, err := r.GetPlainText()
	if err != nil {
		return "", fmt.Errorf("extracting plain text from %q: %w", path, err)
	}

	// Drain the text reader completely; a read failure is reported instead of
	// silently returning truncated text.
	var buf bytes.Buffer
	if _, err := buf.ReadFrom(b); err != nil {
		return "", fmt.Errorf("reading plain text of %q: %w", path, err)
	}
	return buf.String(), nil
}

1
go.mod

@ -3,6 +3,7 @@ module varia.zone/go-sh-manymanuals
go 1.18
require (
github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 // indirect
github.com/otiai10/gosseract/v2 v2.3.1 // indirect
gopkg.in/gographics/imagick.v2 v2.6.0 // indirect
)

2
go.sum

@ -1,3 +1,5 @@
github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 h1:6Yzfa6GP0rIo/kULo2bwGEkFvCePZ3qHDDTC3/J9Swo=
github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs=
github.com/otiai10/curr v0.0.0-20150429015615-9b4961190c95/go.mod h1:9qAhocn7zKJG+0mI8eUu6xqkFDYS2kb2saOteoSB3cE=
github.com/otiai10/curr v1.0.0/go.mod h1:LskTG5wDwr8Rs+nNQ+1LlxRjAtTZZjtJW4rMXl6j4vs=
github.com/otiai10/gosseract/v2 v2.3.1 h1:BFy9Rru7dzqEYX7/tJuEvjVPkkJck0f+b5fYzzr6/RM=

Loading…
Cancel
Save