Still need to fix the mirror margins

5 years ago · ae1a91eef7
8 changed files with 47 additions and 71 deletions
--- a/.DS_Store
+++ b/.DS_Store
--- a/burstpdf.py
+++ b/burstpdf.py
@ -5,7 +5,7 @@ from PIL import Image
 import time

 #DECLARE CONSTANTS
-PDF_PATH = (input("What pdf do you want to use? (include extention as example.pdf): "))
+PDF_PATH = ("scans/out.pdf")
 DPI = 200
 FIRST_PAGE = None
 LAST_PAGE = None
--- a/crop.py
+++ b/crop.py
@ -1,67 +0,0 @@
-import cv2
-import time
-import logging
-
-d = 1
-
-while True:
-    try:
-        threshold = 25
-        time.sleep(1)
-
-        input = ('input%d.jpg'%d)
-        page = ('page%d.jpg'%d)
-
-        print("Value of d is:",d,"\n","Page name:",input)
-        img = cv2.imread(input, 0) # load grayscale version
-
-        # the indeces where the useful region starts and ends
-        hStrart = 0
-        hEnd = img.shape[0]
-        vStart = 0
-        vEnd = img.shape[1]
-
-        # get row and column maxes for each row and column
-        hMax = img.max(1)
-        vMax = img.max(0)
-
-        hDone_flag = False
-        vDone_flag = False
-
-        # go through the list of max and begin where the pixel value is greater
-        # than the threshold
-        for i in range(hMax.size):
-            if not hDone_flag:
-                if hMax[i] > threshold:
-                    hStart = i
-                    hDone_flag = True
-
-            if hDone_flag:
-                if hMax[i] < threshold:
-                    hEnd = i
-                    break
-
-        for i in range(vMax.size):
-            if not vDone_flag:
-                if vMax[i] > threshold:
-                    vStart = i
-                    vDone_flag = True
-
-            if vDone_flag:
-                if vMax[i] < threshold:
-                    vEnd = i
-                    break
-
-        # load the color image and choose only the useful area from it
-        img2 = (cv2.imread(input))[hStart:hEnd, vStart:vEnd,:]
-
-        # write the cropped image
-        cv2.imwrite(page, img2)
-
-        d+=1
-        print("Value of d is:", d)
-
-    except:
-        logging.exception("message")
-        print("All pages must be ready!")
-        break
--- a/mask_crop.py
+++ b/mask_crop.py
@ -0,0 +1,34 @@
+import cv2
+import logging
+
+d = 1
+
+while True:
+    try:
+        output = ('cropped/page%d.jpg'%d)
+
+        # Load image, convert to grayscale, and binarize with Otsu's threshold
+        image = cv2.imread('rotated/input%d.jpg'%d)
+        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+        thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU + cv2.THRESH_BINARY)[1]
+
+        # Find external contours and sort them by area, largest first
+        cnts = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+        cnts = cnts[0] if len(cnts) == 2 else cnts[1]
+        cnts = sorted(cnts, key=cv2.contourArea, reverse=True)
+
+        # Crop to the bounding box of the largest contour only
+        for c in cnts:
+            x,y,w,h = cv2.boundingRect(c)
+            ROI = image[y:y+h, x:x+w]
+            break
+
+        cv2.imwrite(output,ROI)
+        cv2.waitKey()
+
+        d+=1
+
+    except:
+        logging.exception("message")
+        print("All pages must be ready!")
+        break
--- a/merge_files.sh
+++ b/merge_files.sh
@ -4,4 +4,4 @@

 cd ocred
 pwd
-pdftk *.pdf cat output final.pdf
+pdfunite *.pdf out.pdf
--- a/merge_scans.sh
+++ b/merge_scans.sh
@ -0,0 +1,7 @@
+#!/bin/bash
+# The commented-out line below would cd to this script's own directory
+# cd "$(dirname "$0")"
+
+cd scans
+pwd
+convert *.jpg out.pdf
--- a/tesseract_ocr.py
+++ b/tesseract_ocr.py
@ -7,7 +7,7 @@ i = 1

 while True:
    try:
-        img = Image.open("split/page%i.jpg"%i)
+        img = Image.open("cropped/page%i.jpg"%i)
        print(img)
        pdf = pytesseract.image_to_pdf_or_hocr(img, lang="eng", extension='pdf')
        time.sleep(1)
--- a/workshop_stream.sh
+++ b/workshop_stream.sh
@ -1,8 +1,10 @@
 mkdir split
 mkdir rotated
 mkdir ocred
+mkdir cropped
+./merge_scans.sh
 python3 burstpdf.py
 python3 rotation.py
-python3 crop.py
+python3 mask_crop.py
 python3 tesseract_ocr.py
 ./merge_files.sh