needs test

5 years ago · 87b4535b83
10 changed files with 279 additions and 0 deletions
--- a/.DS_Store
+++ b/.DS_Store
--- a/bounding_box.py
+++ b/bounding_box.py
@ -0,0 +1,34 @@
 import cv2
 import logging
 # Crop every split page to the bounding box of its largest contour.
 # Pages are read from split/input<N>.jpg and written to cropped/page<N>.jpg,
 # counting N upwards from 1 until the next input file is missing.
 d = 1
 while True:
     source = 'split/input%d.jpg' % d
     output = 'cropped/page%d.jpg' % d
     # cv2.imread reports a missing or unreadable file by returning None
     # instead of raising, so the end of the page sequence is tested explicitly
     # rather than by catching whatever error the next call happens to raise.
     image = cv2.imread(source)
     if image is None:
         print("All pages must be ready!")
         break
     # Convert to grayscale and binarise with Otsu's automatic threshold.
     gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
     thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU + cv2.THRESH_BINARY)[1]
     # findContours returns 2 values in some OpenCV versions and 3 in others;
     # the contour list is the first of two or the second of three.
     cnts = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
     cnts = cnts[0] if len(cnts) == 2 else cnts[1]
     if len(cnts) > 0:
         # Only the largest contour matters, so a full sort is unnecessary.
         largest = max(cnts, key=cv2.contourArea)
         x, y, w, h = cv2.boundingRect(largest)
         roi = image[y:y + h, x:x + w]
     else:
         # Without this fallback a contour-less page would reuse the crop
         # of the previous page (or fail on the very first page).
         logging.warning("No contour found in %s; saving the page uncropped", source)
         roi = image
     # imwrite returns False when the file could not be written.
     if not cv2.imwrite(output, roi):
         logging.error("Could not write %s", output)
     d += 1
--- a/burstpdf.py
+++ b/burstpdf.py
@ -0,0 +1,43 @@
 # Based on the code at https://iq.opengenus.org/pdf_to_image_in_python/
 import pdf2image
 from PIL import Image
 import time
 # Conversion settings; pdftopil() forwards each of them to
 # pdf2image.convert_from_path.
 PDF_PATH = "scans/out.pdf"  # path to the PDF file that is converted
 DPI = 72  # resolution of the generated images
 FIRST_PAGE = None  # first page to process (None: not set)
 LAST_PAGE = None  # last page to process (None: not set)
 FORMAT = "jpg"  # output format of the conversion
 THREAD_COUNT = 1  # number of threads used for the conversion
 USERPWD = None  # password to unlock the PDF (None: not set)
 USE_CROPBOX = False  # use the crop box instead of the media box
 STRICT = False  # report pdftoppm syntax errors as PDFSyntaxError
 def pdftopil():
     """Read the PDF at PDF_PATH and convert it into a sequence of images.

     All options come from the module-level constants and are passed to
     ``pdf2image.convert_from_path``:

     - dpi: adjusts the resolution of the images
     - first_page / last_page: first and last page processed by pdftoppm
     - fmt: format of the pdftoppm conversion
     - thread_count: how many threads are used for the conversion
     - userpw: password to unlock the PDF
     - use_cropbox: use the crop box instead of the media box
     - strict: catch pdftoppm syntax errors as PDFSyntaxError

     Prints the time the conversion took and returns the converted images.
     """
     started = time.time()
     options = dict(
         dpi=DPI,
         first_page=FIRST_PAGE,
         last_page=LAST_PAGE,
         fmt=FORMAT,
         thread_count=THREAD_COUNT,
         userpw=USERPWD,
         use_cropbox=USE_CROPBOX,
         strict=STRICT,
     )
     pages = pdf2image.convert_from_path(PDF_PATH, **options)
     elapsed = time.time() - started
     print("Time taken : " + str(elapsed))
     return pages
 def save_images(pil_images):
     """Save each image as split/input<N>.jpg, numbering from 1 in order."""
     for page_number, page in enumerate(pil_images, start=1):
         target = "split/input%d" % page_number + ".jpg"
         page.save(target)
 # Script entry point: convert the PDF, then write one JPEG per page.
 if __name__ == "__main__":
     save_images(pdftopil())
--- a/chmod.sh
+++ b/chmod.sh
@ -0,0 +1,3 @@
 #!/bin/bash
 # Make the workflow's shell scripts executable.
 # Only the execute bit is added, and only on the scripts: mode 777 on every
 # file and directory would also make them writable by any user.
 chmod +x ./*.sh
--- a/merge_files.sh
+++ b/merge_files.sh
@ -0,0 +1,7 @@
 #!/bin/bash
 # Join the OCRed pages ocred/page1.pdf, ocred/page2.pdf, ... into out.pdf
 # in the parent directory of ocred.
 # NOTE(review): the line below is disabled in the original; presumably it was
 # meant to allow running the script from any directory - confirm before enabling.
 # cd "$(dirname "$0")"
 # Stop if the directory is missing instead of merging PDFs from the wrong place.
 cd ocred || exit 1
 pwd
 # A plain *.pdf glob expands in lexicographic order (page1, page10, page11,
 # page2, ...), which scrambles any book longer than nine pages. Collect the
 # pages by counting upwards instead, so they are merged in numeric order.
 pages=()
 i=1
 while [ -f "page$i.pdf" ]; do
     pages+=("page$i.pdf")
     i=$((i + 1))
 done
 if [ "${#pages[@]}" -eq 0 ]; then
     echo "No page<N>.pdf files found in $(pwd)" >&2
     exit 1
 fi
 pdfunite "${pages[@]}" ../out.pdf
--- a/merge_scans.sh
+++ b/merge_scans.sh
@ -0,0 +1,7 @@
 #!/bin/bash
 # Append every JPEG in the scans directory to a single scans/out.pdf.
 # NOTE(review): the line below is disabled in the original; presumably it was
 # meant to allow running the script from any directory - confirm before enabling.
 # cd "$(dirname "$0")"
 # Stop if the directory is missing instead of converting JPEGs from the wrong place.
 cd scans || exit 1
 pwd
 # The ./ prefix keeps file names that start with "-" from being read as options.
 # Pages are added in the glob's lexicographic order, so scan file names are
 # assumed to sort in page order (e.g. zero-padded numbers).
 convert ./*.jpg out.pdf
--- a/readme.md
+++ b/readme.md
@ -0,0 +1,155 @@
 <h1 align="center">Flatbed_Scanner_Workflow</h1>
 ## Getting started
 This set of scripts was written for the Text Laundrette workshop. The workshop takes place in the Publication Station, WDkA building.<br> Rotterdam, 03-02-2020<br>It is a workflow to turn the pictures from a Flatbed Scanner into a final OCRed PDF.<br>
 <br>
 ## About the Workshop
 <em>DESCRIPTION</em>
 <p>We will use a home-made, DIY book scanner, and open-source software to scan, process, and add digital features to printed texts brought by the participants to the workshop. Ultimately, we will include them in the “bootleg library”, a shadow library accessible over a local network.</p>
 <p>Shadow libraries operate outside of legal copyright frameworks, in response to decreased open access to knowledge. This workshop aims to extend our research on libraries, their sociability, and methods by which we can add provenance to texts included in public or private, legal or extra-legal collections.</p>
 <p>Participants should bring: a printed text, which they’d like to digitize and share.</p>
 <br><br>
 ## Dependencies
 ### Brew (macOS) or apt-get (Linux)
 <p>You’ll need the command-line tools for Xcode installed.</p>
 ```bash
 xcode-select --install
 ```
 <p>Then install Homebrew.</p>
 ```bash
 ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
 ```
 <p>Run the following command once you’re done to ensure Homebrew is installed and working properly:</p>
 ```bash
 brew doctor
 ```
 ```bash
 sudo apt-get install python3 python3-pip imagemagick poppler-utils tesseract-ocr
 ```
 ```bash
 brew install python3 imagemagick poppler tesseract
 ```
 <br>
 ### PIP3
 ```bash
 sudo pip3 install pdf2image Pillow opencv-python pytesseract
 ```
 <br>
 ## How to use
 <p>Your scans must look like this for the scripts to work correctly.</p>
                               RIGHT PAGE
                         —————————————————————
                        |                     |
                        |——————————           |
                        |           |         |
                        |           |         |
                        |           |         |
                        |           |         |
                        |           |         |
                        |        01 |         |
                        |——————————           |
                        |                     |
                         —————————————————————
      LEFT PAGE                RIGHT PAGE
 —————————————————————   —————————————————————
 |                     | |                     |
 |           ——————————| |——————————           |
 |         |           | |           |         |
 |         |           | |           |         |
 |         |           | |           |         |
 |         |           | |           |         |
 |         |           | |           |         |
 |         | 02        | |        03 |         |
 |          —————————— | |——————————           |
 |                     | |                     |
 —————————————————————   —————————————————————
 <p>Add your pictures from the book scanner to the folder "/scans"</p>
 <p>Make all the files executable.</p>
 ```bash
 chmod +x merge_scans.sh workshop_stream.sh merge_files.sh
 ```
 <p>If you want to skip any of the scripts, comment out the corresponding line in <em>workshop_stream.sh</em>.</p>
 <p>Run ./workshop_stream.sh</p>
 <p>Wait :)</p>
 <br><br>
 ## Additional information
 The workflow runs the following scripts, in this order:
 ### Create 5 directories
 ```bash
 mkdir split
 mkdir rotated
 mkdir ocred
 mkdir bounding_box
 mkdir cropped
 ```
 ### Merge the files in the directory <em>scans</em>
 <p>All the scans will be appended to one pdf called out.pdf</p>
 ```bash
 ./merge_scans.sh
 ```
 ### Burst the pdf in <em>scans</em>
 <p>Burst this pdf, renaming all the files so they can be iterated later.</p>
 ```bash
 python3 burstpdf.py
 ```
 ### Rotate the pdfs
 <p>The book scanner takes pictures of the pages; this script iterates through the odd and even pages, rotating them to their original position.</p>
 ```bash
 python3 rotation.py
 ```
 ### Cropping the bounding boxes
 <p>The pages are now in their original position, but they have a bounding box. This script iterates through them and crops the highest contrast area found.</p>
 ```bash
 python3 bounding_box.py
 ```
 ### Cropping the mirror
 <p>The pages are now cropped, but the mirror is still visible in the middle.</p>
 ```bash
 python3 mirror_crop.py
 ```
 ### OCR
 <p>In this part we OCR the JPGs, turning them into PDFs.</p>
 ```bash
 python3 tesseract_ocr.py
 ```
 ### Merge all the files and create the pdf
 <p>The OCRed pages are now joined into their final PDF, your book is ready :)</p>
 ```bash
 ./merge_files.sh
 ```
 <br><br>
 ## License
 The package is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
--- a/scans/.DS_Store
+++ b/scans/.DS_Store
--- a/tesseract_ocr.py
+++ b/tesseract_ocr.py
@ -0,0 +1,22 @@
 # import libraries
 from PIL import Image
 import pytesseract
 import time
 # OCR every cropped page into a PDF.
 # Pages are read from cropped/page<N>.jpg and written to ocred/page<N>.pdf,
 # counting N upwards from 1 until the next input file is missing.
 i = 1
 while True:
     # Only a missing input file marks the end of the pages; any other failure
     # (tesseract not installed, unwritable output, ...) is left to propagate
     # instead of being reported as a normal finish.
     try:
         img = Image.open("cropped/page%i.jpg" % i)
     except FileNotFoundError:
         print("All pages must be ready!")
         break
     # The context manager closes the image file even if the OCR step fails.
     with img:
         print(img)
         pdf = pytesseract.image_to_pdf_or_hocr(img, lang="eng", extension='pdf')
     # NOTE(review): one-second pause kept from the original; its purpose is not
     # evident from this script - confirm whether it is still needed.
     time.sleep(1)
     with open("ocred/page%i.pdf" % i, "w+b") as file:
         file.write(pdf)
     i += 1
     # Progress output: the number of the page that is processed next.
     print(i)
--- a/workshop_stream.sh
+++ b/workshop_stream.sh
@ -0,0 +1,8 @@
 #!/bin/bash
 # Run the whole workflow: merge the scans, burst the PDF into pages, crop
 # each page, OCR it and join the OCRed pages into the final PDF.
 # Stop at the first failing step, since every stage consumes the files
 # written by the previous one.
 set -e
 # -p keeps a re-run from failing when the directories already exist.
 mkdir -p split ocred cropped
 ./merge_scans.sh
 python3 burstpdf.py
 python3 bounding_box.py
 python3 tesseract_ocr.py
 ./merge_files.sh