Browse Source

Needs to be tested

master
Pedro Sá Couto 5 years ago
commit
b078f1c75b
  1. 43
      burstpdf.py
  2. 3
      chmod.sh
  3. 67
      crop.py
  4. 7
      merge_files.sh
  5. 36
      rotation.py
  6. 22
      tesseract_ocr.py
  7. 8
      workshop_stream.sh

43
burstpdf.py

@ -0,0 +1,43 @@
# Based on the code at https://iq.opengenus.org/pdf_to_image_in_python/
import pdf2image
from PIL import Image
import time
#DECLARE CONSTANTS
PDF_PATH = (input("What pdf do you want to use? (include extention as example.pdf): "))
DPI = 200
FIRST_PAGE = None
LAST_PAGE = None
FORMAT = 'jpg'
THREAD_COUNT = 1
USERPWD = None
USE_CROPBOX = False
STRICT = False
def pdftopil():
#This method reads a pdf and converts it into a sequence of images
#PDF_PATH sets the path to the PDF file
#dpi parameter assists in adjusting the resolution of the image
#first_page parameter allows you to set a first page to be processed by pdftoppm
#last_page parameter allows you to set a last page to be processed by pdftoppm
#fmt parameter allows to set the format of pdftoppm conversion (PpmImageFile, TIFF)
#thread_count parameter allows you to set how many thread will be used for conversion.
#userpw parameter allows you to set a password to unlock the converted PDF
#use_cropbox parameter allows you to use the crop box instead of the media box when converting
#strict parameter allows you to catch pdftoppm syntax error with a custom type PDFSyntaxError
start_time = time.time()
pil_images = pdf2image.convert_from_path(PDF_PATH, dpi=DPI, first_page=FIRST_PAGE, last_page=LAST_PAGE, fmt=FORMAT, thread_count=THREAD_COUNT, userpw=USERPWD, use_cropbox=USE_CROPBOX, strict=STRICT)
print ("Time taken : " + str(time.time() - start_time))
return pil_images
def save_images(pil_images):
d = 1
for image in pil_images:
image.save(("split/input%d"%d) + ".jpg")
d += 1
if __name__ == "__main__":
pil_images = pdftopil()
save_images(pil_images)

3
chmod.sh

@ -0,0 +1,3 @@
#!/bin/bash
sudo chmod 777 *

67
crop.py

@ -0,0 +1,67 @@
import cv2
import time
import logging
d = 1
while True:
try:
threshold = 25
time.sleep(1)
input = ('input%d.jpg'%d)
page = ('page%d.jpg'%d)
print("Value of d is:",d,"\n","Page name:",input)
img = cv2.imread(input, 0) # load grayscale version
# the indeces where the useful region starts and ends
hStrart = 0
hEnd = img.shape[0]
vStart = 0
vEnd = img.shape[1]
# get row and column maxes for each row and column
hMax = img.max(1)
vMax = img.max(0)
hDone_flag = False
vDone_flag = False
# go through the list of max and begin where the pixel value is greater
# than the threshold
for i in range(hMax.size):
if not hDone_flag:
if hMax[i] > threshold:
hStart = i
hDone_flag = True
if hDone_flag:
if hMax[i] < threshold:
hEnd = i
break
for i in range(vMax.size):
if not vDone_flag:
if vMax[i] > threshold:
vStart = i
vDone_flag = True
if vDone_flag:
if vMax[i] < threshold:
vEnd = i
break
# load the color image and choose only the useful area from it
img2 = (cv2.imread(input))[hStart:hEnd, vStart:vEnd,:]
# write the cropped image
cv2.imwrite(page, img2)
d+=1
print("Value of d is:", d)
except:
logging.exception("message")
print("All pages must be ready!")
break

7
merge_files.sh

@ -0,0 +1,7 @@
#!/bin/bash
#line 3 means here
# cd "$(dirname "$0")"
cd ocred
pwd
pdftk *.pdf cat output final.pdf

36
rotation.py

@ -0,0 +1,36 @@
from PIL import Image
import time
i = 1
while True:
page = Image.open("split/input%i.jpg"%i)
if i % 2 == 0:
#check where the for loop is
print("trying even")
#rotate image by 90 degrees
angle = 90
out = page.rotate(angle, expand=True)
out.save('rotated/input%i.jpg'%i)
print('This is an even page number')
time.sleep(2)
print("variable i: ", i)
else:
#check where the for loop is
print("trying odd")
#rotate image by 90 degrees
angle = 270
out = page.rotate(angle, expand=True)
out.save('rotated/input%i.jpg'%i)
print('This is an even page number')
time.sleep(1)
print("variable i: ", i)
i+=1

22
tesseract_ocr.py

@ -0,0 +1,22 @@
# import libraries
from PIL import Image
import pytesseract
import time
i = 1
while True:
try:
img = Image.open("split/page%i.jpg"%i)
print(img)
pdf = pytesseract.image_to_pdf_or_hocr(img, lang="eng", extension='pdf')
time.sleep(1)
file = open(("ocred/page%i.pdf"%i), "w+b")
file.write(bytearray(pdf))
file.close()
i+=1
print(i)
except:
print("All pages must be ready!")
break

8
workshop_stream.sh

@ -0,0 +1,8 @@
mkdir split
mkdir rotated
mkdir ocred
python3 burstpdf.py
python3 rotation.py
python3 crop.py
python3 tesseract_ocr.py
./merge_files.sh
Loading…
Cancel
Save