From 4f35017d9d02f61a2bed2978e090b31e2d8a6758 Mon Sep 17 00:00:00 2001
From: crunk
Date: Mon, 25 Apr 2022 23:31:59 +0200
Subject: [PATCH] first go commit

---
 exp/README.md            |  8 ++++
 exp/gimmetxt/gimmetxt.go | 80 ++++++++++++++++++++++++++++++++++++++++
 go.mod                   |  5 +++
 go.sum                   | 11 ++++++
 4 files changed, 104 insertions(+)
 create mode 100644 exp/gimmetxt/gimmetxt.go
 create mode 100644 go.sum

diff --git a/exp/README.md b/exp/README.md
index 6d4960a..5d2e339 100644
--- a/exp/README.md
+++ b/exp/README.md
@@ -5,3 +5,11 @@
 ## `ls.go`
 
 Lists files in [`files`](./files/). Run with `go run ls.go`.
+
+## `gimmetxt.go`
+
+WIP of gosseract PDF OCR. It's far from perfect.
+* gosseract doesn't OCR PDFs, only images
+* tried using gographics imagick.v2 with imagemagick6
+* this only does one page of the PDF, it's very slow and it doesn't even do the OCR yet.
+* imagemagick6 has a security policy when it comes to pdf. it's not install and go.
diff --git a/exp/gimmetxt/gimmetxt.go b/exp/gimmetxt/gimmetxt.go
new file mode 100644
index 0000000..d1b9001
--- /dev/null
+++ b/exp/gimmetxt/gimmetxt.go
@@ -0,0 +1,80 @@
+// Command gimmetxt renders the first page of a PDF to a JPEG using
+// ImageMagick. The gosseract OCR step is still commented out.
+package main
+
+import (
+	"fmt"
+	"log"
+
+	"gopkg.in/gographics/imagick.v2/imagick"
+)
+
+// main converts the sample PDF to out.jpeg and exits via log.Fatal on error.
+func main() {
+	if err := ConvertPdfToJpg("../files/vanwiehuurik.pdf", "out.jpeg"); err != nil {
+		log.Fatal(err)
+	}
+
+	//client := gosseract.NewClient()
+	//defer client.Close()
+
+	//client.SetImage(file.Name())
+	//text, _ := client.Text()
+	//fmt.Println(text)
+}
+
+// ConvertPdfToJpg renders the first page of the PDF at pdfName and writes it
+// as a JPEG to imageName.
+//
+// The ImageMagick environment is initialized and terminated on every call.
+// Any error reported by ImageMagick is returned, wrapped with the step that
+// failed.
+func ConvertPdfToJpg(pdfName string, imageName string) error {
+	// Setup
+	imagick.Initialize()
+	defer imagick.Terminate()
+
+	mw := imagick.NewMagickWand()
+	defer mw.Destroy()
+
+	// Must be *before* ReadImage
+	// Make sure our image is high quality
+	if err := mw.SetResolution(300, 300); err != nil {
+		return fmt.Errorf("setting resolution: %w", err)
+	}
+
+	// Load the PDF into imagick
+	if err := mw.ReadImage(pdfName); err != nil {
+		return fmt.Errorf("reading %q: %w", pdfName, err)
+	}
+
+	// Select only the first page of the PDF. This must happen *before* the
+	// per-image calls below: after reading a multi-page file the wand is not
+	// necessarily positioned on page one, so they could otherwise modify a
+	// different page than the one that gets written.
+	if !mw.SetIteratorIndex(0) {
+		return fmt.Errorf("selecting first page of %q: no image at index 0", pdfName)
+	}
+
+	// Must be *after* ReadImage
+	// Flatten image and remove alpha channel, to prevent alpha turning black in jpg
+	if err := mw.SetImageAlphaChannel(imagick.ALPHA_CHANNEL_FLATTEN); err != nil {
+		return fmt.Errorf("flattening alpha channel: %w", err)
+	}
+
+	// Set any compression (100 = max quality)
+	if err := mw.SetCompressionQuality(95); err != nil {
+		return fmt.Errorf("setting compression quality: %w", err)
+	}
+
+	// Convert into JPG
+	if err := mw.SetFormat("jpg"); err != nil {
+		return fmt.Errorf("setting output format: %w", err)
+	}
+
+	// Save File
+	if err := mw.WriteImage(imageName); err != nil {
+		return fmt.Errorf("writing %q: %w", imageName, err)
+	}
+	return nil
+}
diff --git a/go.mod b/go.mod
index e3f567e..38a324e 100644
--- a/go.mod
+++ b/go.mod
@@ -1,3 +1,8 @@
 module varia.zone/go-sh-manymanuals
 
 go 1.18
+
+require (
+	github.com/otiai10/gosseract/v2 v2.3.1 // indirect
+	gopkg.in/gographics/imagick.v2 v2.6.0 // indirect
+)
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..62fafba
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,11 @@
+github.com/otiai10/curr v0.0.0-20150429015615-9b4961190c95/go.mod h1:9qAhocn7zKJG+0mI8eUu6xqkFDYS2kb2saOteoSB3cE=
+github.com/otiai10/curr v1.0.0/go.mod h1:LskTG5wDwr8Rs+nNQ+1LlxRjAtTZZjtJW4rMXl6j4vs=
+github.com/otiai10/gosseract/v2 v2.3.1 h1:BFy9Rru7dzqEYX7/tJuEvjVPkkJck0f+b5fYzzr6/RM=
+github.com/otiai10/gosseract/v2 v2.3.1/go.mod h1:2ZOGgdTIXQzCS5f+N1HkcXRgDX6K3ZoYe3Yvo++cpp4=
+github.com/otiai10/mint v1.3.0/go.mod h1:F5AjcsTsWUqX+Na9fpHb52P8pcRX2CI6A3ctIT91xUo=
+github.com/otiai10/mint v1.3.2 h1:VYWnrP5fXmz1MXvjuUvcBrXSjGE6xjON+axB/UrpO3E=
+github.com/otiai10/mint v1.3.2/go.mod h1:/yxELlJQ0ufhjUwhshSj+wFjZ78CnZ48/1wtmBH1OTc=
+gopkg.in/gographics/imagick.v2 v2.6.0 h1:ewRsUQk3QkjGumERlndbFn/kTYRjyMaPY5gxwpuAhik=
+gopkg.in/gographics/imagick.v2 v2.6.0/go.mod h1:/QVPLV/iKdNttRKthmDkeeGg+vdHurVEPc8zkU0XgBk=
+gopkg.in/gographics/imagick.v3 v3.4.0 h1:kSnbsXOWofo81VJEn/Hw8w3qqoOrfTyWwjAQzSdtPlg=
+gopkg.in/gographics/imagick.v3 v3.4.0/go.mod h1:+Q9nyA2xRZXrDyTtJ/eko+8V/5E7bWYs08ndkZp8UmA=