Browse Source

Still need to fix the mirror margins

master
Pedro Sá Couto 5 years ago
parent
commit
ae1a91eef7
  1. BIN
      .DS_Store
  2. 2
      burstpdf.py
  3. 67
      crop.py
  4. 34
      mask_crop.py
  5. 2
      merge_files.sh
  6. 7
      merge_scans.sh
  7. 2
      tesseract_ocr.py
  8. 4
      workshop_stream.sh

BIN
.DS_Store

Binary file not shown.

2
burstpdf.py

@ -5,7 +5,7 @@ from PIL import Image
import time
#DECLARE CONSTANTS
PDF_PATH = (input("What pdf do you want to use? (include extention as example.pdf): "))
PDF_PATH = ("scans/out.pdf")
DPI = 200
FIRST_PAGE = None
LAST_PAGE = None

67
crop.py

@ -1,67 +0,0 @@
import cv2
import time
import logging
# Crop the dark border off a numbered series of scans.
#
# Reads input1.jpg, input2.jpg, ... from the current directory and writes the
# cropped result of each to page1.jpg, page2.jpg, ...  Processing stops at the
# first index whose input file cannot be read.

# Pixel value (on the 0-255 grayscale image) separating border from content.
THRESHOLD = 25


def _content_span(maxes, threshold):
    """Return (start, end) of the first run of bright entries in *maxes*.

    *start* is the first index whose value is greater than *threshold*
    (0 if there is none); *end* is the first index after *start* whose value
    is lower than *threshold* (``len(maxes)`` if there is none).
    """
    start = 0
    end = len(maxes)
    found = False
    for index, value in enumerate(maxes):
        if not found and value > threshold:
            start = index
            found = True
        if found and value < threshold:
            end = index
            break
    return start, end


d = 1
while True:
    time.sleep(1)
    source_name = 'input%d.jpg' % d
    page_name = 'page%d.jpg' % d
    print("Value of d is:", d, "\n", "Page name:", source_name)

    # Grayscale copy is only used to locate the useful region.
    gray = cv2.imread(source_name, 0)
    if gray is None:
        # cv2.imread signals a missing/unreadable file by returning None;
        # that is the normal end of the input series, not an error.
        print("All pages must be ready!")
        break

    try:
        # Per-row and per-column maxima: a row/column belongs to the border
        # when even its brightest pixel stays below the threshold.
        row_start, row_end = _content_span(gray.max(1), THRESHOLD)
        col_start, col_end = _content_span(gray.max(0), THRESHOLD)

        # Load the colour image and keep only the useful area from it.
        cropped = cv2.imread(source_name)[row_start:row_end, col_start:col_end, :]
        cv2.imwrite(page_name, cropped)
    except Exception:
        # Unexpected failure: report it and stop, as the script always did.
        logging.exception("message")
        print("All pages must be ready!")
        break

    d += 1
    print("Value of d is:", d)

34
mask_crop.py

@ -0,0 +1,34 @@
import cv2
import logging
# Crop each rotated scan to the bounding box of its largest bright region.
#
# Reads rotated/input1.jpg, rotated/input2.jpg, ... and writes the cropped
# result of each to cropped/page1.jpg, cropped/page2.jpg, ...  Processing
# stops at the first index whose input file cannot be read.
d = 1
while True:
    source = 'rotated/input%d.jpg' % d
    output = 'cropped/page%d.jpg' % d

    image = cv2.imread(source)
    if image is None:
        # cv2.imread signals a missing/unreadable file by returning None;
        # that is the normal end of the input series, not an error.
        print("All pages must be ready!")
        break

    try:
        # Binarise with Otsu's automatically chosen threshold.
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU + cv2.THRESH_BINARY)[1]

        # findContours returns 2 values on some OpenCV versions and 3 on
        # others; the contour list is element 0 or 1 respectively.
        found = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        contours = found[0] if len(found) == 2 else found[1]

        if len(contours) > 0:
            # Only the largest contour is needed, so take the max directly
            # instead of sorting the whole list.
            largest = max(contours, key=cv2.contourArea)
            x, y, w, h = cv2.boundingRect(largest)
            roi = image[y:y + h, x:x + w]
        else:
            # Nothing to crop to: keep the page as-is rather than reusing the
            # previous page's region (or failing on the first page).
            roi = image

        # No cv2.waitKey() here: no window is ever shown, so there is no key
        # event to wait for.
        cv2.imwrite(output, roi)
    except Exception:
        # Unexpected failure: report it and stop, as the script always did.
        logging.exception("message")
        print("All pages must be ready!")
        break

    d += 1

2
merge_files.sh

@ -4,4 +4,4 @@
cd ocred
pwd
pdftk *.pdf cat output final.pdf
pdfunite *.pdf out.pdf

7
merge_scans.sh

@ -0,0 +1,7 @@
#!/bin/bash
# Merge every .jpg in scans/ into a single PDF, scans/out.pdf.
#
# The scans/ path is resolved against the current working directory.
# NOTE(review): the commented-out line below presumably switches to the
# script's own directory first ("here") - confirm before enabling.
# cd "$(dirname "$0")"

# Stop if scans/ is missing; otherwise convert would run in the wrong folder.
cd scans || exit 1
pwd
# NOTE(review): the glob expands in lexicographic order, so e.g. 10.jpg comes
# before 2.jpg - verify the scan file names give the intended page order.
convert *.jpg out.pdf

2
tesseract_ocr.py

@ -7,7 +7,7 @@ i = 1
while True:
try:
img = Image.open("split/page%i.jpg"%i)
img = Image.open("cropped/page%i.jpg"%i)
print(img)
pdf = pytesseract.image_to_pdf_or_hocr(img, lang="eng", extension='pdf')
time.sleep(1)

4
workshop_stream.sh

@ -1,8 +1,10 @@
mkdir split
mkdir rotated
mkdir ocred
mkdir cropped
./merge_scans.sh
python3 burstpdf.py
python3 rotation.py
python3 crop.py
python3 mask_crop.py
python3 tesseract_ocr.py
./merge_files.sh

Loading…
Cancel
Save