Needs to be tested

5 years ago · b078f1c75b
7 changed files with 186 additions and 0 deletions
--- a/burstpdf.py
+++ b/burstpdf.py
@ -0,0 +1,43 @@
+# Based on the code at https://iq.opengenus.org/pdf_to_image_in_python/
+
+import pdf2image
+from PIL import Image
+import time
+
+# Conversion settings. Each one is forwarded unchanged by pdftopil() to the
+# pdf2image.convert_from_path() keyword named in its trailing comment.
+# NOTE: the PDF path is requested interactively as soon as this module loads.
+PDF_PATH = input("What pdf do you want to use? (include extention as example.pdf): ")
+DPI = 200            # dpi
+FIRST_PAGE = None    # first_page
+LAST_PAGE = None     # last_page
+FORMAT = 'jpg'       # fmt
+THREAD_COUNT = 1     # thread_count
+USERPWD = None       # userpw
+USE_CROPBOX = False  # use_cropbox
+STRICT = False       # strict
+
+def pdftopil():
+    """Convert the PDF at PDF_PATH into a sequence of images.
+
+    Every module-level setting is handed to pdf2image.convert_from_path():
+
+    - dpi: resolution of the rendered images
+    - first_page / last_page: first and last page processed by pdftoppm
+    - fmt: output format of the pdftoppm conversion
+    - thread_count: number of threads used for the conversion
+    - userpw: password used to unlock the PDF
+    - use_cropbox: use the crop box instead of the media box
+    - strict: surface pdftoppm syntax errors as PDFSyntaxError
+
+    The elapsed conversion time is printed before the images are returned.
+    """
+    started_at = time.time()
+    conversion_options = {
+        "dpi": DPI,
+        "first_page": FIRST_PAGE,
+        "last_page": LAST_PAGE,
+        "fmt": FORMAT,
+        "thread_count": THREAD_COUNT,
+        "userpw": USERPWD,
+        "use_cropbox": USE_CROPBOX,
+        "strict": STRICT,
+    }
+    converted_pages = pdf2image.convert_from_path(PDF_PATH, **conversion_options)
+    elapsed = time.time() - started_at
+    print("Time taken : " + str(elapsed))
+    return converted_pages
+
+def save_images(pil_images):
+    """Save each image as split/input<N>.jpg, numbering from 1 in iteration order."""
+    for page_number, page_image in enumerate(pil_images, start=1):
+        page_image.save("split/input%d.jpg" % page_number)
+
+if __name__ == "__main__":
+    # Convert the chosen PDF, then write one JPEG per page into split/.
+    save_images(pdftopil())
--- a/chmod.sh
+++ b/chmod.sh
@ -0,0 +1,3 @@
+#!/bin/bash
+
+# Gives read, write and execute permission to every user on everything the
+# `*` glob matches in the current directory, running chmod through sudo.
+# NOTE(review): mode 777 makes every matched file world-writable. Presumably
+# the goal is only to make the .sh scripts executable; if so, something like
+# `chmod u+x ./*.sh` (no sudo) would be enough -- confirm before changing.
+sudo chmod 777 *
--- a/crop.py
+++ b/crop.py
@ -0,0 +1,67 @@
+import cv2
+import time
+import logging
+
+# Crops the dark border off every page image, one page per iteration.
+#
+# Pages are read from rotated/input<N>.jpg (what rotation.py writes) and the
+# cropped result is written to split/page<N>.jpg (what tesseract_ocr.py reads),
+# so the steps run by workshop_stream.sh actually hand files to each other.
+# The loop stops at the first page number whose image cannot be loaded.
+
+# Grey level above which a row or column is considered to hold page content.
+THRESHOLD = 25
+
+
+def _content_bounds(max_values, threshold):
+    """Return (start, end) of the first run of values above ``threshold``.
+
+    ``start`` is the first index whose value is greater than ``threshold``;
+    ``end`` is the first later index whose value is less than ``threshold``.
+    If no value exceeds the threshold the full range (0, len) is returned,
+    and if the run never ends ``end`` stays at len(max_values).
+    """
+    start = 0
+    end = len(max_values)
+    found_start = False
+    for index, value in enumerate(max_values):
+        if not found_start:
+            if value > threshold:
+                start = index
+                found_start = True
+        elif value < threshold:
+            end = index
+            break
+    return start, end
+
+
+page_number = 1
+
+while True:
+    time.sleep(1)
+
+    source_path = 'rotated/input%d.jpg' % page_number
+    target_path = 'split/page%d.jpg' % page_number
+
+    print("Value of d is:", page_number, "\n", "Page name:", source_path)
+
+    # cv2.imread does not raise for a missing or unreadable file, it returns
+    # None; that is the signal that every page has been processed.
+    gray = cv2.imread(source_path, 0)  # load grayscale version
+    if gray is None:
+        print("All pages must be ready!")
+        break
+
+    try:
+        # Brightest pixel of every row and of every column.
+        row_max = gray.max(1)
+        col_max = gray.max(0)
+
+        # The indices where the useful region starts and ends.
+        row_start, row_end = _content_bounds(row_max, THRESHOLD)
+        col_start, col_end = _content_bounds(col_max, THRESHOLD)
+
+        # Load the colour image and keep only the useful area of it.
+        cropped = cv2.imread(source_path)[row_start:row_end, col_start:col_end, :]
+
+        # Write the cropped image.
+        cv2.imwrite(target_path, cropped)
+    except cv2.error:
+        logging.exception("Could not crop %s", source_path)
+        break
+
+    page_number += 1
+    print("Value of d is:", page_number)
--- a/merge_files.sh
+++ b/merge_files.sh
@ -0,0 +1,7 @@
+#!/bin/bash
+# Merges the per-page PDFs in ocred/ into ocred/final.pdf, in page order.
+#
+# Uncomment the next line to resolve ocred/ relative to this script instead
+# of the caller's working directory:
+# cd "$(dirname "$0")"
+
+# Stop here if the directory is missing rather than merging whatever PDFs
+# happen to be in the current directory.
+cd ocred || exit 1
+pwd
+
+# Collect page1.pdf, page2.pdf, ... by number. A plain *.pdf glob sorts
+# lexically (page10.pdf before page2.pdf) and would also feed a final.pdf
+# left over from an earlier run back into the merge.
+pages=()
+n=1
+while [ -f "page$n.pdf" ]; do
+    pages+=("page$n.pdf")
+    n=$((n + 1))
+done
+
+if [ "${#pages[@]}" -eq 0 ]; then
+    echo "No page PDFs found in $(pwd)" >&2
+    exit 1
+fi
+
+pdftk "${pages[@]}" cat output final.pdf
--- a/rotation.py
+++ b/rotation.py
@ -0,0 +1,36 @@
+from PIL import Image
+import time
+
+# Rotates every page image split/input<N>.jpg and saves it under the same
+# name in rotated/. Even-numbered pages are rotated by 90 degrees and
+# odd-numbered pages by 270 degrees. The loop ends at the first page number
+# that has no image file.
+page_number = 1
+
+while True:
+    try:
+        page = Image.open("split/input%i.jpg" % page_number)
+    except FileNotFoundError:
+        # Running past the last page is the normal way out of the loop.
+        print("All pages must be ready!")
+        break
+
+    if page_number % 2 == 0:
+        parity, angle, pause = "even", 90, 2
+    else:
+        parity, angle, pause = "odd", 270, 1
+
+    # Progress trace: shows which branch is handling this page.
+    print("trying " + parity)
+
+    # expand=True grows the canvas so the rotated page is not clipped; the
+    # with-block closes the source file once the rotated copy is saved.
+    with page:
+        page.rotate(angle, expand=True).save('rotated/input%i.jpg' % page_number)
+    print('This is an ' + parity + ' page number')
+
+    time.sleep(pause)
+    print("variable i: ", page_number)
+
+    page_number += 1
--- a/tesseract_ocr.py
+++ b/tesseract_ocr.py
@ -0,0 +1,22 @@
+# import libraries
+from PIL import Image
+import pytesseract
+import time
+
+# Runs OCR on split/page<N>.jpg for N = 1, 2, ... and writes the resulting
+# PDF to ocred/page<N>.pdf. The loop ends at the first page number that has
+# no image file. Any other failure (for example an OCR error) is left to
+# propagate instead of being reported as a normal end of input.
+page_number = 1
+
+while True:
+    try:
+        img = Image.open("split/page%i.jpg" % page_number)
+    except FileNotFoundError:
+        # Running past the last page is the normal way out of the loop.
+        print("All pages must be ready!")
+        break
+
+    # The with-block closes the image file once OCR has finished with it.
+    with img:
+        print(img)
+        pdf = pytesseract.image_to_pdf_or_hocr(img, lang="eng", extension='pdf')
+    time.sleep(1)
+
+    # The context manager closes the output file even if the write fails.
+    with open("ocred/page%i.pdf" % page_number, "wb") as pdf_file:
+        pdf_file.write(pdf)
+
+    page_number += 1
+    print(page_number)
--- a/workshop_stream.sh
+++ b/workshop_stream.sh
@ -0,0 +1,8 @@
+#!/bin/bash
+# Runs the whole pipeline in order: split the PDF into page images, rotate
+# them, crop them, OCR each page, then merge the per-page PDFs.
+
+# -p lets the script be rerun when the working directories already exist.
+mkdir -p split rotated ocred
+python3 burstpdf.py
+python3 rotation.py
+python3 crop.py
+python3 tesseract_ocr.py
+./merge_files.sh