From 27c24672c9b18144b1f6b5a3f1e0b544c4151a96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pedro=20S=C3=A1=20Couto?= Date: Sat, 1 Feb 2020 15:32:18 +0100 Subject: [PATCH] working --- .DS_Store | Bin 10244 -> 10244 bytes burstpdf.py | 2 +- mirror_crop.py | 4 ++-- remove.sh | 5 +++++ scans/.DS_Store | Bin 6148 -> 6148 bytes tesseract_ocr.py | 2 +- workshop_stream.sh | 13 +++++++------ 7 files changed, 16 insertions(+), 10 deletions(-) create mode 100755 remove.sh diff --git a/.DS_Store b/.DS_Store index d760ca61c8effdbb41484a58ecbfdcdea8c0e52f..a1888eb8b48f447874e158c419e57ec4664061aa 100644 GIT binary patch delta 467 zcmZn(XbG6$&nUeyU^hRb^kg0Z6DGzTlbr+%S)Cae7#2*f5YS>{_yuIzOkN@Ig^j_6 zfr07&)r#E=h!*+5n>Lo!1mLmm*OG86&%;Xv^spiCM=4vuC}RxdDl(p!JV~TML%h1$Qb)nupjJnr+R(t% zLPx>S(7>R!mXkwV)zH>6A-A%sx~8^n#$+2&dB&NWV??u=AOY4n`K-7d8)F2}JnPMZ V5)$m2*%f}VOjZ<;r@o!17y%1^aI^pb delta 302 zcmZn(XbG6$&nU4mU^hRb#AF@;6D9_Y$xZ@>tg9Fp7&IqW2xze}iUXM%lUE3QVPk9s zGY<$!POcZ^pL|O20~_OKkkIA>LK2K@Oo?D=0TH>$e}wti7~Fu|36npF7_c#&2g?MA zicPi=n@cPfacoZP1det~N0<)=@CBFsRj0s5UgRG|^EoGcc~L<>U}oHMI3i z$gQlZuBok?HCakbo^keOE3s@QHs;k}V^&Mpv9U;kn48~ANU%?AxVM>I;TMZ_Mt*s4 VW=d*OVi5-iCkJP|0EM=g0|2(pO~3#E diff --git a/burstpdf.py b/burstpdf.py index b7a22a9..32e0e9c 100755 --- a/burstpdf.py +++ b/burstpdf.py @@ -33,7 +33,7 @@ def pdftopil(): return pil_images def save_images(pil_images): - d = 1 + d = 0 for image in pil_images: image.save(("split/input%d"%d) + ".jpg") d += 1 diff --git a/mirror_crop.py b/mirror_crop.py index 6400a77..db56cc1 100644 --- a/mirror_crop.py +++ b/mirror_crop.py @@ -14,7 +14,7 @@ while True: print("cropping even") # left, up, right, bottom - border = (0, 0, 68, 0) + border = (0, 0, 65, 0) finalpage = ImageOps.crop(page, border) finalpage.save('cropped/page%i.jpg'%i) @@ -23,7 +23,7 @@ while True: print("cropping odd") # left, up, right, bottom - border = (68, 0, 0, 0) + border = (65, 0, 0, 0) finalpage = ImageOps.crop(page, border) finalpage.save('cropped/page%i.jpg'%i) diff --git a/remove.sh b/remove.sh new file mode 100755 index 0000000..1fb9b97 --- /dev/null +++ b/remove.sh @@ -0,0 +1,5 @@ +cd split +pwd +rm page0.jpg +rm -ltr | tail -1 +rm .DS_Store diff --git a/scans/.DS_Store b/scans/.DS_Store index 3ffcfeb523ffde24338b9368748e4d7e4ede62c0..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 100644 GIT binary patch delta 70 zcmZoMXfc=|#>AjHu~2NHo+1YW5HK<@2yA9#Vq@DZz_f~SGdl-A2T%b}swxpt2MIxIniFbmkKDDSFKV+Relc^=>X>yhkPM6&IQM4H`+tp3W-`eiyJRaFNCy5H1G-o}t(MqS z+^q-O<6T=&o>3^YYg8c6M?V2*@Ep0aK_4&FVOIm2Bcrh2+=2NcV1#6q3><-hZ`WLI AMgRZ+ diff --git a/tesseract_ocr.py b/tesseract_ocr.py index 2e91780..1e3f641 100755 --- a/tesseract_ocr.py +++ b/tesseract_ocr.py @@ -7,7 +7,7 @@ i = 1 while True: try: - img = Image.open("cropped/page%i.jpg"%i) + img = Image.open("bounding_box/input%i.jpg"%i) print(img) pdf = pytesseract.image_to_pdf_or_hocr(img, lang="eng", extension='pdf') time.sleep(1) diff --git a/workshop_stream.sh b/workshop_stream.sh index 557800c..caa6ccc 100755 --- a/workshop_stream.sh +++ b/workshop_stream.sh @@ -1,12 +1,13 @@ -mkdir split -mkdir rotated -mkdir ocred -mkdir bounding_box -mkdir cropped ./merge_scans.sh +mkdir split python3 burstpdf.py +mkdir rotated python3 rotation.py +mkdir bounding_box python3 bounding_box.py -python3 mirror_crop.py +mkdir cropped +# python3 mirror_crop.py +mkdir ocred python3 tesseract_ocr.py +./remove.sh ./merge_files.sh