Pedro Sá Couto
5 years ago
commit
b078f1c75b
7 changed files with 186 additions and 0 deletions
@ -0,0 +1,43 @@ |
|||||
|
# Based on the code at https://iq.opengenus.org/pdf_to_image_in_python/
||||
|
|
||||
|
import pdf2image |
||||
|
from PIL import Image |
||||
|
import time |
||||
|
|
||||
|
# Conversion settings. Each one is forwarded by pdftopil() to
# pdf2image.convert_from_path().

# Path of the PDF to convert; the user is prompted for it when this module runs.
PDF_PATH = input("What pdf do you want to use? (include extention as example.pdf): ")

DPI = 200             # resolution of the rendered images
FIRST_PAGE = None     # first page to be processed by pdftoppm
LAST_PAGE = None      # last page to be processed by pdftoppm
FORMAT = "jpg"        # output format of the pdftoppm conversion
THREAD_COUNT = 1      # number of threads used for the conversion
USERPWD = None        # password used to unlock the PDF
USE_CROPBOX = False   # use the crop box instead of the media box
STRICT = False        # report pdftoppm syntax errors as PDFSyntaxError
||||
|
|
||||
|
def pdftopil():
    """Read the PDF at PDF_PATH and convert it into a sequence of images.

    All settings come from the module-level constants and are forwarded to
    ``pdf2image.convert_from_path``:

    * ``DPI`` -- adjusts the resolution of the images.
    * ``FIRST_PAGE`` / ``LAST_PAGE`` -- first and last page to be processed
      by pdftoppm.
    * ``FORMAT`` -- format of the pdftoppm conversion.
    * ``THREAD_COUNT`` -- how many threads are used for the conversion.
    * ``USERPWD`` -- password to unlock the PDF.
    * ``USE_CROPBOX`` -- use the crop box instead of the media box.
    * ``STRICT`` -- catch pdftoppm syntax errors as ``PDFSyntaxError``.

    The elapsed wall-clock time of the conversion is printed.

    Returns:
        Whatever ``pdf2image.convert_from_path`` returns for the PDF.
    """
    options = {
        "dpi": DPI,
        "first_page": FIRST_PAGE,
        "last_page": LAST_PAGE,
        "fmt": FORMAT,
        "thread_count": THREAD_COUNT,
        "userpw": USERPWD,
        "use_cropbox": USE_CROPBOX,
        "strict": STRICT,
    }

    started = time.time()
    pages = pdf2image.convert_from_path(PDF_PATH, **options)
    elapsed = time.time() - started

    print("Time taken : " + str(elapsed))
    return pages
||||
|
|
||||
|
def save_images(pil_images):
    """Save every image as ``split/input<N>.jpg``.

    Images are numbered from 1 in iteration order. Each element only needs
    a ``save(path)`` method.
    """
    for number, picture in enumerate(pil_images, start=1):
        picture.save("split/input%d.jpg" % number)
||||
|
|
||||
|
if __name__ == "__main__":
    # Script entry point: convert the PDF, then write each page image to disk.
    save_images(pdftopil())
@ -0,0 +1,3 @@ |
|||||
|
#!/bin/bash

# Give every entry in the current directory mode 777 (read, write and execute
# for owner, group and others).
# NOTE(review): 777 makes everything world-writable; if the goal is only to
# make the scripts runnable, a narrower mode would do -- confirm before changing.
#
# "./*" instead of "*": a file whose name starts with "-" would otherwise be
# parsed by chmod as an option.
sudo chmod 777 ./*
@ -0,0 +1,67 @@ |
|||||
|
import cv2 |
||||
|
import time |
||||
|
import logging |
||||
|
|
||||
|
# Crop the dark border away from each page image.
#
# Pages are processed in order (1, 2, 3, ...) until an image cannot be read.
#
# The launcher script runs rotation.py, then this script, then
# tesseract_ocr.py. The rotation step writes 'rotated/input%i.jpg' and the OCR
# step reads 'split/page%i.jpg', so this step reads the former and writes the
# latter. (The paths used to be relative to the current directory, which
# matched neither neighbour.)
SOURCE_PATTERN = 'rotated/input%d.jpg'
TARGET_PATTERN = 'split/page%d.jpg'

# Pixel values above this count as page content; values below it as border.
THRESHOLD = 25


def _content_span(profile, threshold):
    """Return ``(start, end)`` of the first bright run in *profile*.

    *profile* is a 1-D sequence of per-row or per-column maxima.  ``start`` is
    the first index whose value is greater than *threshold*; ``end`` is the
    first later index whose value is less than *threshold*.  When no value
    exceeds the threshold the whole range ``(0, len(profile))`` is returned;
    when the run never ends, ``end`` stays ``len(profile)``.
    """
    start = 0
    end = len(profile)
    started = False
    for index, value in enumerate(profile):
        if not started:
            if value > threshold:
                start = index
                started = True
        elif value < threshold:
            end = index
            break
    return start, end


d = 1

while True:
    time.sleep(1)

    source_name = SOURCE_PATTERN % d
    page = TARGET_PATTERN % d

    print("Value of d is:", d, "\n", "Page name:", source_name)

    try:
        # Grayscale copy, used only to locate the useful region.
        img = cv2.imread(source_name, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread returns None when the file is missing or unreadable:
            # that is how the end of the page sequence is detected.
            print("All pages must be ready!")
            break

        # Row maxima give the vertical extent, column maxima the horizontal.
        hStart, hEnd = _content_span(img.max(1), THRESHOLD)
        vStart, vEnd = _content_span(img.max(0), THRESHOLD)

        # Load the colour image and keep only the useful area of it.
        img2 = cv2.imread(source_name)[hStart:hEnd, vStart:vEnd, :]

        # Write the cropped image.
        cv2.imwrite(page, img2)
    except Exception:
        # Any other failure still stops the run, with the traceback logged.
        logging.exception("message")
        print("All pages must be ready!")
        break

    d += 1
    print("Value of d is:", d)
@ -0,0 +1,7 @@ |
|||||
|
#!/bin/bash
# Merge the per-page PDFs in ./ocred into ocred/final.pdf.
#
# The script must be started from the directory that contains ocred/.
# To make it independent of the caller's working directory, uncomment:
# cd "$(dirname "$0")"

# Stop if the directory is missing rather than merging PDFs from the wrong place.
cd ocred || exit 1
pwd

# Collect page1.pdf, page2.pdf, ... in numeric order, stopping at the first gap.
# A plain "*.pdf" glob sorts page10.pdf before page2.pdf, and on a second run
# it would also feed the previous final.pdf back in as an input.
pages=()
n=1
while [ -f "page${n}.pdf" ]; do
    pages+=("page${n}.pdf")
    n=$((n + 1))
done

if [ "${#pages[@]}" -eq 0 ]; then
    echo "No page PDFs found in $(pwd)" >&2
    exit 1
fi

pdftk "${pages[@]}" cat output final.pdf
@ -0,0 +1,36 @@ |
|||||
|
from PIL import Image |
||||
|
import time |
||||
|
|
||||
|
# Rotate every split page image and store the result in rotated/.
#
# Pages are read as split/input1.jpg, split/input2.jpg, ... until one is
# missing.  Even-numbered pages are rotated by 90 degrees and odd-numbered
# pages by 270 degrees; both use expand=True so the whole rotated image is kept.
i = 1

while True:
    try:
        page = Image.open("split/input%i.jpg" % i)
    except FileNotFoundError:
        # Ran past the last page: finish cleanly instead of dying with a
        # traceback.
        print("All pages must be ready!")
        break

    is_even = i % 2 == 0

    # Progress trace: which branch is being taken.
    print("trying even" if is_even else "trying odd")

    angle = 90 if is_even else 270
    # rotate() returns a new image, so the source file can be closed right away.
    with page:
        out = page.rotate(angle, expand=True)
    out.save('rotated/input%i.jpg' % i)

    # The odd branch used to print the "even" message as well.
    if is_even:
        print('This is an even page number')
    else:
        print('This is an odd page number')

    # Pauses kept as they were: 2 s after even pages, 1 s after odd pages.
    time.sleep(2 if is_even else 1)
    print("variable i: ", i)

    i += 1
@ -0,0 +1,22 @@ |
|||||
|
# import libraries |
||||
|
from PIL import Image |
||||
|
import pytesseract |
||||
|
import time |
||||
|
|
||||
|
# OCR every cropped page image into a one-page PDF.
#
# Pages are read as split/page1.jpg, split/page2.jpg, ... until one is
# missing; each result is written to ocred/page<N>.pdf.
i = 1

while True:
    # Only a missing input file means "no more pages".  Any other failure
    # (Tesseract problems, an unwritable output directory, ...) now surfaces
    # as an error instead of being reported as a normal end of run.
    try:
        img = Image.open("split/page%i.jpg" % i)
    except FileNotFoundError:
        print("All pages must be ready!")
        break

    with img:
        print(img)
        pdf = pytesseract.image_to_pdf_or_hocr(img, lang="eng", extension='pdf')
    time.sleep(1)

    # The context manager closes the file even if the write fails.
    with open("ocred/page%i.pdf" % i, "wb") as pdf_file:
        pdf_file.write(pdf)

    i += 1
    print(i)
@ -0,0 +1,8 @@ |
|||||
|
# Run the whole pipeline from the project directory.

# Working directories used by the steps below.  -p keeps a re-run from
# failing when they already exist.
mkdir -p split rotated ocred

# Steps run strictly in this order.
python3 burstpdf.py
python3 rotation.py
python3 crop.py
python3 tesseract_ocr.py
./merge_files.sh
Loading…
Reference in new issue