Pedro Sá Couto
5 years ago
13 changed files with 38 additions and 63 deletions
Binary file not shown.
@ -1,43 +0,0 @@ |
|||
#Based on the code at https://iq.opengenus.org/pdf_to_image_in_python/ |
|||
|
|||
import pdf2image |
|||
from PIL import Image |
|||
import time |
|||
|
|||
# Conversion settings; each one is forwarded to pdf2image.convert_from_path().
PDF_PATH = "scans/out.pdf"  # PDF file to read
DPI = 72  # resolution of the rendered images
FIRST_PAGE = None  # first page to process
LAST_PAGE = None  # last page to process
FORMAT = "jpg"  # output format requested from the converter
THREAD_COUNT = 1  # number of threads used for the conversion
USERPWD = None  # password used to unlock the PDF
USE_CROPBOX = False  # use the crop box instead of the media box
STRICT = False  # turn converter syntax errors into PDFSyntaxError
|||
|
|||
def pdftopil():
    """Read the PDF at PDF_PATH and convert it into a sequence of images.

    Every setting comes from the module-level constants and is passed to
    ``pdf2image.convert_from_path``:

    * ``dpi`` -- adjusts the resolution of the images
    * ``first_page`` / ``last_page`` -- first and last page to be processed
    * ``fmt`` -- format of the conversion
    * ``thread_count`` -- how many threads are used for the conversion
    * ``userpw`` -- password to unlock the PDF
    * ``use_cropbox`` -- use the crop box instead of the media box
    * ``strict`` -- surface converter syntax errors as PDFSyntaxError

    The elapsed time of the conversion is printed before the images are
    returned.
    """
    conversion_options = {
        "dpi": DPI,
        "first_page": FIRST_PAGE,
        "last_page": LAST_PAGE,
        "fmt": FORMAT,
        "thread_count": THREAD_COUNT,
        "userpw": USERPWD,
        "use_cropbox": USE_CROPBOX,
        "strict": STRICT,
    }

    began = time.time()
    pages = pdf2image.convert_from_path(PDF_PATH, **conversion_options)
    elapsed = time.time() - began

    print("Time taken : " + str(elapsed))
    return pages
|||
|
|||
def save_images(pil_images, output_dir="split"):
    """Save each image as ``<output_dir>/input<N>.jpg``, numbering from 0.

    Args:
        pil_images: Iterable of objects exposing ``save(path)``, such as the
            images returned by ``pdftopil``.
        output_dir: Directory that receives the files. Defaults to
            ``"split"``, the location that used to be hard-coded. It is
            created when it does not exist yet, so saving no longer depends
            on a separate ``mkdir`` having run first.
    """
    import os  # local import keeps this function self-contained

    os.makedirs(output_dir, exist_ok=True)
    for index, image in enumerate(pil_images):
        image.save(os.path.join(output_dir, "input%d.jpg" % index))
|||
|
|||
if __name__ == "__main__":
    # Script entry point: convert the PDF into page images, then write them out.
    save_images(pdftopil())
@ -0,0 +1,7 @@ |
|||
#!/bin/bash
# Run every JPEG in scans/ through `convert -density 72` and write the result
# to split/input<N>.jpg, numbering from 0 in glob order.

i=0
# Glob directly instead of parsing `ls` output, and quote every expansion, so
# file names containing spaces or glob characters are handled intact.
for img in scans/*.jpg; do
    # Without matches the pattern stays literal; skip it rather than handing
    # a non-existent path to convert.
    [ -e "$img" ] || continue
    convert "$img" -density 72 "split/input$i.jpg"
    i=$((i+1))
done
@ -0,0 +1,6 @@ |
|||
#!/bin/bash
# Reset the working directories and move the produced PDFs to ~/Desktop.

# -f: a directory that was never created is not an error.
rm -rf scans split rotated bounding_box ocred
mkdir -p scans

# Give out.pdf a timestamped name, then recreate an empty out.pdf.
# NOTE(review): the empty out.pdf is copied to the Desktop as well by the
# steps below -- confirm that this is intended.
mv out.pdf "$(date +%F-%H:%M).pdf" && touch out.pdf
sleep 2

# Delete the local PDFs only after the copy succeeded; previously a failed
# copy (e.g. missing ~/Desktop) was still followed by the rm.
cp *.pdf ~/Desktop && rm *.pdf
@ -1,7 +0,0 @@ |
|||
#!/bin/bash
# Combine every JPEG in scans/ into a single PDF, scans/out.pdf.
#
# Paths are relative to the caller's working directory. To resolve them
# relative to this script's own location instead, enable the next line:
# cd "$(dirname "$0")"

# Stop if scans/ is missing; otherwise convert would run against whatever
# *.jpg files happen to be in the current directory.
cd scans || exit 1
pwd
convert *.jpg out.pdf
@ -0,0 +1,7 @@ |
|||
#!/bin/bash
# Rename every JPEG in scans/ to scans/input<N>.jpg, numbering from 0 in
# glob order.
# NOTE(review): a file that already carries a later inputN.jpg name can be
# overwritten by an earlier rename -- confirm scans/ never mixes both kinds.

i=0
# Glob directly instead of parsing `ls` output, and quote every expansion, so
# file names containing spaces or glob characters are handled intact.
for img in scans/*.jpg; do
    # Without matches the pattern stays literal; nothing to rename then.
    [ -e "$img" ] || continue
    target="scans/input$i.jpg"
    # A file that already has its final name needs no move (mv would only
    # report that source and destination are the same file).
    if [ "$img" != "$target" ]; then
        mv -- "$img" "$target"
    fi
    i=$((i+1))
done
Binary file not shown.
@ -1,13 +1,13 @@ |
|||
# Pipeline driver: runs the scan-processing steps one after another, creating
# each step's output directory just before the step that follows it.
# NOTE(review): this span looks like a diff hunk whose +/- markers were lost,
# so removed and added steps are probably interleaved (e.g. both
# merge_scans.sh and rename_scans.sh, and remove.sh twice) -- confirm the
# intended step list against the real script before running it.
./merge_scans.sh |
|||
./rename_scans.sh |
|||
mkdir -p split |
|||
python3 burstpdf.py |
|||
./change_res.sh |
|||
./remove.sh |
|||
mkdir -p rotated |
|||
python3 rotation.py |
|||
mkdir -p bounding_box |
|||
python3 bounding_box.py |
|||
mkdir -p cropped |
|||
# The mirror-crop step is disabled in the two commented-out lines below.
# mkdir -p cropped |
|||
# python3 mirror_crop.py |
|||
mkdir -p ocred |
|||
python3 tesseract_ocr.py |
|||
./remove.sh |
|||
./merge_files.sh |
|||
|
Loading…
Reference in new issue