Pedro Sá Couto
5 years ago
13 changed files with 38 additions and 63 deletions
Binary file not shown.
@ -1,43 +0,0 @@ |
|||||
#Based on the code at https://iq.opengenus.org/pdf_to_image_in_python/ |
|
||||
|
|
||||
import pdf2image |
|
||||
from PIL import Image |
|
||||
import time |
|
||||
|
|
||||
# DECLARE CONSTANTS
# Each value below is forwarded unchanged to pdf2image.convert_from_path()
# by pdftopil().
PDF_PATH = "scans/out.pdf"  # path to the PDF file to convert
DPI = 72  # resolution of the produced images
FIRST_PAGE = None  # first page to be processed by pdftoppm
LAST_PAGE = None  # last page to be processed by pdftoppm
FORMAT = 'jpg'  # format of the pdftoppm conversion
THREAD_COUNT = 1  # how many threads are used for the conversion
USERPWD = None  # password used to unlock the PDF
USE_CROPBOX = False  # use the crop box instead of the media box
STRICT = False  # surface pdftoppm syntax errors as PDFSyntaxError
|
||||
|
|
||||
def pdftopil():
    """Read the PDF at PDF_PATH and convert it into a sequence of images.

    All settings come from the module-level constants and are passed
    straight to ``pdf2image.convert_from_path``:

    - PDF_PATH: path to the PDF file.
    - DPI (dpi): resolution of the produced images.
    - FIRST_PAGE / LAST_PAGE (first_page / last_page): first and last page
      to be processed by pdftoppm.
    - FORMAT (fmt): format of the pdftoppm conversion.
    - THREAD_COUNT (thread_count): how many threads are used for conversion.
    - USERPWD (userpw): password to unlock the PDF.
    - USE_CROPBOX (use_cropbox): use the crop box instead of the media box.
    - STRICT (strict): catch pdftoppm syntax errors as PDFSyntaxError.

    Prints the elapsed conversion time and returns whatever
    ``convert_from_path`` returned.
    """
    # perf_counter() is monotonic, so the reported duration cannot be skewed
    # by a system clock adjustment happening during the conversion.
    start_time = time.perf_counter()
    pil_images = pdf2image.convert_from_path(
        PDF_PATH,
        dpi=DPI,
        first_page=FIRST_PAGE,
        last_page=LAST_PAGE,
        fmt=FORMAT,
        thread_count=THREAD_COUNT,
        userpw=USERPWD,
        use_cropbox=USE_CROPBOX,
        strict=STRICT,
    )
    print("Time taken : " + str(time.perf_counter() - start_time))
    return pil_images
|
||||
|
|
||||
def save_images(pil_images, output_dir="split"):
    """Save every image as ``<output_dir>/input<N>.jpg``, numbering from 0.

    pil_images: iterable of objects exposing ``save(path)``.
    output_dir: directory prefix for the written files; defaults to
        "split", the location the original code hard-coded. The directory
        is not created here, so it must already exist.
    """
    # enumerate() replaces the hand-maintained counter.
    for index, image in enumerate(pil_images):
        image.save("%s/input%d.jpg" % (output_dir, index))
|
||||
|
|
||||
if __name__ == "__main__":
    # Convert the PDF into images, then write each one to disk.
    pil_images = pdftopil()
    save_images(pil_images)
|
@ -0,0 +1,7 @@ |
|||||
|
#!/bin/bash
# Run every JPEG in scans/ through `convert -density 72` and write the
# results to split/input<N>.jpg, numbering from 0.

i=0
# Iterate over the glob itself rather than parsing `ls` output, so file names
# containing spaces or glob characters are neither split nor re-expanded.
for img in scans/*.jpg; do
  # With no matching files the pattern stays literal; skip it instead of
  # handing a non-existent path to convert.
  [ -e "$img" ] || continue
  convert "$img" -density 72 "split/input$i.jpg"
  i=$((i+1))
done
@ -0,0 +1,6 @@ |
|||||
|
#!/bin/bash
# Remove the working directories, recreate an empty scans/, rename out.pdf to
# a timestamped file, then copy every PDF in the current directory to
# ~/Desktop and delete the local copies.

rm -R scans split rotated bounding_box ocred
mkdir -p scans
# Quote the substitution so the generated file name is always a single word.
mv out.pdf "$(date +%F-%H:%M).pdf" && touch out.pdf
sleep 2
# Only delete the local PDFs once the copy has succeeded; running rm
# unconditionally would destroy them whenever cp fails.
cp *.pdf ~/Desktop && rm *.pdf
@ -1,7 +0,0 @@ |
|||||
#!/bin/bash
# Combine every JPEG in scans/ into scans/out.pdf with `convert`.
#line 3 means here
# cd "$(dirname "$0")"

# Abort if scans/ cannot be entered; otherwise convert would run against the
# *.jpg files of whatever directory the script was started from.
cd scans || exit 1
pwd
convert *.jpg out.pdf
|
@ -0,0 +1,7 @@ |
|||||
|
#!/bin/bash
# Rename every JPEG in scans/ to scans/input<N>.jpg, numbering from 0.
# NOTE(review): a file already named input<N>.jpg can still be overwritten by
# an earlier rename, exactly as before; confirm scans/ never mixes both
# naming schemes.

i=0
# Iterate over the glob itself rather than parsing `ls` output, so file names
# containing spaces or glob characters are neither split nor re-expanded.
for img in scans/*.jpg; do
  # With no matching files the pattern stays literal; skip it.
  [ -e "$img" ] || continue
  target="scans/input$i.jpg"
  # mv refuses to move a file onto itself, so skip names that already match.
  [ "$img" = "$target" ] || mv "$img" "$target"
  i=$((i+1))
done
Binary file not shown.
@ -1,13 +1,13 @@ |
|||||
./merge_scans.sh |
./rename_scans.sh |
||||
mkdir -p split |
mkdir -p split |
||||
python3 burstpdf.py |
./change_res.sh |
||||
|
./remove.sh |
||||
mkdir -p rotated |
mkdir -p rotated |
||||
python3 rotation.py |
python3 rotation.py |
||||
mkdir -p bounding_box |
mkdir -p bounding_box |
||||
python3 bounding_box.py |
python3 bounding_box.py |
||||
mkdir -p cropped |
# mkdir -p cropped |
||||
# python3 mirror_crop.py |
# python3 mirror_crop.py |
||||
mkdir -p ocred |
mkdir -p ocred |
||||
python3 tesseract_ocr.py |
python3 tesseract_ocr.py |
||||
./remove.sh |
|
||||
./merge_files.sh |
./merge_files.sh |
||||
|
Loading…
Reference in new issue