Pedro Sá Couto
5 years ago
commit
b078f1c75b
7 changed files with 186 additions and 0 deletions
@ -0,0 +1,43 @@ |
|||
# Based on the code at https://iq.opengenus.org/pdf_to_image_in_python/
|||
|
|||
import pdf2image |
|||
from PIL import Image |
|||
import time |
|||
|
|||
# DECLARE CONSTANTS
# Every value below is forwarded to pdf2image.convert_from_path() by pdftopil().

# Path of the PDF to convert; the user is prompted for it as soon as this
# module is executed.
PDF_PATH = (input("What pdf do you want to use? (include extention as example.pdf): "))

# Resolution of the rendered images (passed as `dpi`).
DPI = 200

# Page range (passed as `first_page` / `last_page`).
# NOTE(review): None presumably means "no limit" - confirm in the pdf2image docs.
FIRST_PAGE = None
LAST_PAGE = None

# Output image format (passed as `fmt`).
FORMAT = 'jpg'

# Number of threads used for the conversion (passed as `thread_count`).
THREAD_COUNT = 1

# Password for the PDF (passed as `userpw`); None supplies no password.
USERPWD = None

# Passed as `use_cropbox`: use the crop box instead of the media box.
USE_CROPBOX = False

# Passed as `strict`: report pdftoppm syntax errors as PDFSyntaxError.
STRICT = False
|||
|
|||
def pdftopil():
    """Read the PDF at PDF_PATH and convert it into a sequence of images.

    The conversion is driven entirely by the module-level constants, which
    are forwarded to ``pdf2image.convert_from_path``:

    * ``dpi`` adjusts the resolution of the images
    * ``first_page`` / ``last_page`` set the page range processed by pdftoppm
    * ``fmt`` sets the format of the pdftoppm conversion
    * ``thread_count`` sets how many threads are used for the conversion
    * ``userpw`` sets the password used to unlock the PDF
    * ``use_cropbox`` uses the crop box instead of the media box
    * ``strict`` turns pdftoppm syntax errors into PDFSyntaxError

    The time spent converting is printed before the images are returned.
    """
    conversion_options = dict(
        dpi=DPI,
        first_page=FIRST_PAGE,
        last_page=LAST_PAGE,
        fmt=FORMAT,
        thread_count=THREAD_COUNT,
        userpw=USERPWD,
        use_cropbox=USE_CROPBOX,
        strict=STRICT,
    )

    started_at = time.time()
    images = pdf2image.convert_from_path(PDF_PATH, **conversion_options)
    elapsed = time.time() - started_at

    print("Time taken : " + str(elapsed))
    return images
|||
|
|||
def save_images(pil_images, output_dir="split"):
    """Save each image as ``<output_dir>/input<N>.jpg``, numbering from 1.

    Args:
        pil_images: iterable of objects exposing ``save(path)``, such as the
            PIL images returned by ``pdftopil``.
        output_dir: directory the files are written to. It must already
            exist. Defaults to ``"split"``.
    """
    # enumerate keeps the page number in step with the image without a
    # hand-maintained counter.
    for page_number, image in enumerate(pil_images, start=1):
        image.save("%s/input%d.jpg" % (output_dir, page_number))
|||
|
|||
if __name__ == "__main__":
    # Script entry point: convert the PDF to images, then write them to disk.
    save_images(pdftopil())
@ -0,0 +1,3 @@ |
|||
#!/bin/bash

# Gives read, write and execute permission to every user (mode 777) on all
# entries matched by * in the current directory, running chmod as root.
# NOTE(review): presumably this is only meant to make the pipeline scripts
# executable; 777 through sudo is much broader than that - confirm the intent.
sudo chmod 777 *
@ -0,0 +1,67 @@ |
|||
import cv2 |
|||
import time |
|||
import logging |
|||
|
|||
def find_bounds(maxima, threshold):
    """Return ``(start, end)`` of the first bright run in *maxima*.

    ``start`` is the index of the first value greater than *threshold* and
    ``end`` is the index of the first later value smaller than *threshold*.
    If no value exceeds the threshold the whole range ``(0, len(maxima))``
    is returned; if the bright run reaches the last entry, ``end`` is
    ``len(maxima)``.

    Args:
        maxima: one-dimensional sequence of per-row or per-column maxima.
        threshold: pixel value separating background from content.
    """
    start, end = 0, len(maxima)
    found_start = False
    for index, value in enumerate(maxima):
        if not found_start:
            if value > threshold:
                start = index
                found_start = True
        elif value < threshold:
            end = index
            break
    return start, end


def crop_page(page_number, threshold=25):
    """Crop ``input<N>.jpg`` to its useful region and write ``page<N>.jpg``.

    Args:
        page_number: number N used to build both file names.
        threshold: pixel value above which a row or column counts as content.

    Returns:
        False when the input image could not be read (cv2.imread returned
        None), True once the cropped page has been written.
    """
    source_name = 'input%d.jpg' % page_number
    target_name = 'page%d.jpg' % page_number
    print("Value of d is:", page_number, "\n", "Page name:", source_name)

    gray = cv2.imread(source_name, 0)  # load grayscale version
    if gray is None:
        return False

    # Per-row and per-column maxima give the indices where the useful
    # region starts and ends.
    row_start, row_end = find_bounds(gray.max(1), threshold)
    col_start, col_end = find_bounds(gray.max(0), threshold)

    # Load the colour image and keep only the useful area from it.
    colour = cv2.imread(source_name)
    cv2.imwrite(target_name, colour[row_start:row_end, col_start:col_end, :])
    return True


def main():
    """Crop input1.jpg, input2.jpg, ... until an input image cannot be read."""
    page_number = 1
    while True:
        time.sleep(1)
        try:
            if not crop_page(page_number):
                print("All pages must be ready!")
                break
        except Exception:
            # An unexpected failure is logged with its traceback before
            # stopping, so it is not mistaken for a normal end of input.
            logging.exception("Could not crop page %d", page_number)
            print("All pages must be ready!")
            break

        page_number += 1
        print("Value of d is:", page_number)


if __name__ == "__main__":
    main()
@ -0,0 +1,7 @@ |
|||
#!/bin/bash
# Merge the per-page PDFs in ocred/ (page1.pdf, page2.pdf, ...) into
# ocred/final.pdf.
#
# ocred/ is resolved against the caller's working directory. To resolve it
# against this script's location instead, enable the next line:
# cd "$(dirname "$0")"

# Stop if the directory is missing rather than merging PDFs from elsewhere.
cd ocred || exit 1
pwd

# Collect the pages by counting upwards. A plain *.pdf glob sorts page10.pdf
# before page2.pdf and would also feed a final.pdf from an earlier run back
# in as input.
pages=()
n=1
while [ -f "page$n.pdf" ]; do
    pages+=("page$n.pdf")
    n=$((n + 1))
done

if [ "${#pages[@]}" -eq 0 ]; then
    echo "No page PDFs found in $(pwd)" >&2
    exit 1
fi

pdftk "${pages[@]}" cat output final.pdf
@ -0,0 +1,36 @@ |
|||
from PIL import Image |
|||
import time |
|||
|
|||
def rotation_angle(page_number):
    """Return the rotation in degrees for a page: 90 if even, 270 if odd."""
    return 90 if page_number % 2 == 0 else 270


def main():
    """Rotate ``split/input<N>.jpg`` into ``rotated/input<N>.jpg``.

    Pages are processed for N = 1, 2, ... and the loop ends cleanly at the
    first page whose image file does not exist.
    """
    page_number = 1
    while True:
        try:
            page = Image.open("split/input%i.jpg" % page_number)
        except FileNotFoundError:
            # Running out of numbered pages is the normal way to finish.
            print("All pages must be ready!")
            break

        is_even = page_number % 2 == 0
        # Progress output showing which branch handles this page.
        print("trying even" if is_even else "trying odd")

        # The with block closes the source image once the copy is saved.
        with page:
            rotated = page.rotate(rotation_angle(page_number), expand=True)
            rotated.save('rotated/input%i.jpg' % page_number)

        if is_even:
            print('This is an even page number')
            time.sleep(2)
        else:
            print('This is an odd page number')
            time.sleep(1)
        print("variable i: ", page_number)

        page_number += 1


if __name__ == "__main__":
    main()
@ -0,0 +1,22 @@ |
|||
# import libraries |
|||
from PIL import Image |
|||
import pytesseract |
|||
import time |
|||
|
|||
def main():
    """OCR ``split/page<N>.jpg`` into ``ocred/page<N>.pdf``.

    Pages are processed for N = 1, 2, ... and the loop ends at the first
    page whose image file does not exist. Any other failure (for example
    an OCR error or a missing output directory) is allowed to propagate so
    it is not reported as a normal finish.
    """
    page_number = 1
    while True:
        try:
            img = Image.open("split/page%i.jpg" % page_number)
        except FileNotFoundError:
            # Running out of numbered pages is the normal way to finish.
            print("All pages must be ready!")
            break

        # The with block closes the image once OCR is done.
        with img:
            print(img)
            pdf = pytesseract.image_to_pdf_or_hocr(img, lang="eng", extension='pdf')
        time.sleep(1)

        # The with block closes the output file even if the write fails.
        with open("ocred/page%i.pdf" % page_number, "wb") as out_file:
            out_file.write(pdf)

        page_number += 1
        print(page_number)


if __name__ == "__main__":
    main()
@ -0,0 +1,8 @@ |
|||
#!/bin/bash
# Create the working directories, then run each pipeline step in order.

# -p keeps a rerun from failing on directories that already exist.
mkdir -p split rotated ocred

python3 burstpdf.py
python3 rotation.py
python3 crop.py
python3 tesseract_ocr.py
./merge_files.sh
Loading…
Reference in new issue