commit
4f016b718b
28 changed files with 2536 additions and 0 deletions
@ -0,0 +1,2 @@ |
|||
venv/ |
|||
static/pdf/ |
Binary file not shown.
Binary file not shown.
@ -0,0 +1,79 @@ |
|||
import os |
|||
import random |
|||
import shutil |
|||
import string |
|||
import subprocess |
|||
from pathlib import Path |
|||
from flask import Flask, flash, redirect, render_template, request, url_for |
|||
from hocrtransformpdf import * |
|||
from werkzeug.utils import secure_filename |
|||
from flask_basicauth import BasicAuth |
|||
import pdftotree |
|||
|
|||
# Directory (relative to the working directory) where uploaded PDFs are saved.
UPLOAD_FOLDER = 'static/uploads'
# Lower-case file extensions the upload form accepts.
ALLOWED_EXTENSIONS = {'pdf'}

app = Flask(__name__)

# flash() stores its messages in the session, and Flask refuses to touch the
# session without a secret key. Prefer a stable key from the environment; the
# random fallback keeps a single-process server working but is regenerated on
# every start.
app.secret_key = os.environ.get('FLASK_SECRET_KEY') or os.urandom(24)

# The committed values stay as defaults so existing deployments keep working,
# but real credentials can now be supplied through the environment.
app.config['BASIC_AUTH_USERNAME'] = os.environ.get('BASIC_AUTH_USERNAME', 'wordmord')
app.config['BASIC_AUTH_PASSWORD'] = os.environ.get('BASIC_AUTH_PASSWORD', 'tentacles')

basic_auth = BasicAuth(app)

app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
|||
|
|||
|
|||
@app.route('/', methods=['GET', 'POST'])
@basic_auth.required
def run_script():
    """Handle the upload form and convert an uploaded PDF.

    On a POST carrying a valid ``file`` part, the PDF is saved to the upload
    folder, parsed to hOCR with pdftotree and rendered to
    ``static/pdf/output-2.pdf``. Invalid uploads flash a message and redirect
    back to the form. Every other path renders ``results.html``.
    """
    import tempfile  # local import: only this view needs it

    if request.method == 'POST':
        # check if the post request has the file part
        if 'file' not in request.files:
            flash('No file part')
            return redirect(request.url)
        file = request.files['file']
        # if the user does not select a file, the browser still submits an
        # empty part without a filename
        if file.filename == '':
            flash('No selected file')
            return redirect(request.url)
        if file and allowed_file(file.filename):
            filename = secure_filename(file.filename)
            os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
            uploadfilepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
            file.save(uploadfilepath)

            # Without an html_path argument pdftotree returns the hOCR markup
            # itself, not a path to a file.
            hocr_result = pdftotree.parse(uploadfilepath)
            app.logger.info("test")
            if not hocr_result:
                # NOTE(review): presumably an empty/None result means the PDF
                # had no extractable text layout -- confirm against pdftotree.
                flash('Could not extract any text layout from the uploaded PDF')
                return redirect(request.url)

            # HocrTransform only accepts a file name, so the markup has to be
            # written to disk before it can be parsed.
            with tempfile.NamedTemporaryFile(
                'w', suffix='.hocr', encoding='utf-8', delete=False
            ) as hocr_tmp:
                hocr_tmp.write(hocr_result)
            try:
                # static/pdf/ is git-ignored, so it may not exist on a fresh
                # checkout.
                os.makedirs('static/pdf', exist_ok=True)
                hocr = HocrTransform(hocr_filename=hocr_tmp.name, dpi=300)
                hocr.to_pdf(
                    out_filename='static/pdf/output-2.pdf',
                    image_filename='static/images/blank.png',
                    show_bounding_boxes=False,
                    interword_spaces=False,
                )
            finally:
                os.unlink(hocr_tmp.name)

    hocrfile = 'static/hocr/gynaikoktonia.hocr'
    # The template receives every local by name; keep the variable names above
    # stable unless results.html is updated as well.
    return render_template('results.html', **locals())
|||
|
|||
|
|||
|
|||
def allowed_file(filename):
    """Return True when *filename* has an extension listed in ALLOWED_EXTENSIONS.

    The extension is whatever follows the last dot, compared case-insensitively.
    Names without any dot are rejected.
    """
    _, dot, extension = filename.rpartition('.')
    if not dot:
        return False
    return extension.lower() in ALLOWED_EXTENSIONS
|||
|
|||
|
|||
# Start the Flask server with its default settings when this file is run
# directly rather than imported.
if __name__ == "__main__":
    app.run()
|||
|
@ -0,0 +1,518 @@ |
|||
#!/usr/bin/env python3 |
|||
# |
|||
# Copyright (c) 2010, Jonathan Brinley |
|||
# Original version from: https://github.com/jbrinley/HocrConverter |
|||
# |
|||
# Copyright (c) 2013-14, Julien Pfefferkorn |
|||
# Modifications |
|||
# |
|||
# Copyright (c) 2015-16, James R. Barlow |
|||
# Set text to transparent |
|||
|
|||
# Copyright (c) 2022, WordMord & Alex Roidl |
|||
# Set text back to visible and change bounding boxes |
|||
# |
|||
# Permission is hereby granted, free of charge, to any person obtaining a |
|||
# copy of this software and associated documentation files (the |
|||
# "Software"), to deal in the Software without restriction, including |
|||
# without limitation the rights to use, copy, modify, merge, publish, |
|||
# distribute, sublicense, and/or sell copies of the Software, and to |
|||
# permit persons to whom the Software is furnished to do so, subject to |
|||
# the following conditions: |
|||
# |
|||
# The above copyright notice and this permission notice shall be included |
|||
# in all copies or substantial portions of the Software. |
|||
# |
|||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS |
|||
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
|||
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. |
|||
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY |
|||
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, |
|||
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE |
|||
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
|||
|
|||
import argparse |
|||
import os |
|||
import re |
|||
from itertools import chain |
|||
from math import atan, cos, sin |
|||
from pathlib import Path |
|||
from typing import Any, NamedTuple, Optional, Tuple, Union |
|||
from xml.etree import ElementTree |
|||
|
|||
from reportlab.lib.colors import black, cyan, magenta, red |
|||
from reportlab.lib.units import inch |
|||
from reportlab.pdfgen.canvas import Canvas |
|||
from reportlab.pdfbase import pdfmetrics |
|||
from reportlab.pdfbase.ttfonts import TTFont |
|||
from reportlab.pdfbase.pdfmetrics import registerFontFamily |
|||
|
|||
|
|||
|
|||
# According to Wikipedia these languages are supported in the ISO-8859-1 character
# set, meaning reportlab can generate them and they are compatible with hocr,
# assuming Tesseract has the necessary languages installed. Note that there may
# not be language packs for them.
# NOTE(review): some code/comment pairs look mismatched against ISO 639
# (e.g. 'mal', 'mga') -- confirm before relying on this table.
HOCR_OK_LANGS = frozenset(
    [
        # Languages fully covered by Latin-1:
        'afr',  # Afrikaans
        'alb',  # Albanian
        'ast',  # Leonese
        'baq',  # Basque
        'bre',  # Breton
        'cos',  # Corsican
        'eng',  # English
        'eus',  # Basque
        'fao',  # Faroese
        'gla',  # Scottish Gaelic
        'glg',  # Galician
        'glv',  # Manx
        'ice',  # Icelandic
        'ind',  # Indonesian
        'isl',  # Icelandic
        'ita',  # Italian
        'ltz',  # Luxembourgish
        'mal',  # Malay Rumi
        'mga',  # Irish
        'nor',  # Norwegian
        'oci',  # Occitan
        'por',  # Portuguese
        'roh',  # Romansh
        'sco',  # Scots
        'sma',  # Sami
        'spa',  # Spanish
        'sqi',  # Albanian
        'swa',  # Swahili
        'swe',  # Swedish
        'tgl',  # Tagalog
        'wln',  # Walloon
        # Languages supported by Latin-1 except for a few rare characters that OCR
        # is probably not trained to recognize anyway:
        'cat',  # Catalan
        'cym',  # Welsh
        'dan',  # Danish
        'deu',  # German
        'dut',  # Dutch
        'est',  # Estonian
        'fin',  # Finnish
        'fra',  # French
        'hun',  # Hungarian
        'kur',  # Kurdish
        'nld',  # Dutch
        'wel',  # Welsh
    ]
)
|||
|
|||
|
|||
# Short alias used in the type annotations below.
Element = ElementTree.Element
|||
|
|||
|
|||
class Rect(NamedTuple):  # pylint: disable=inherit-non-class
    """A rectangle for managing PDF coordinates.

    The fields follow the order of an hOCR ``bbox`` entry. Callers in this
    module compute width as ``x2 - x1`` and height as ``y2 - y1``. The same
    type is used for pixel values and for values converted to points.
    """

    x1: Any  # first x coordinate of the box
    y1: Any  # first y coordinate of the box
    x2: Any  # second x coordinate of the box
    y2: Any  # second y coordinate of the box
|||
|
|||
|
|||
class HocrTransformError(Exception):
    """Raised when an hOCR document cannot be converted."""
|||
|
|||
|
|||
class HocrTransform:
    """
    A class for converting documents from the hOCR format.
    For details of the hOCR format, see:
    http://kba.cloud/hocr-spec/

    An instance parses one hOCR file, takes the page size from the first
    ``ocr_page`` div, and renders the recognised words onto a single-page PDF
    with :meth:`to_pdf`.
    """

    # Matches "bbox x1 y1 x2 y2" inside an hOCR title attribute.
    box_pattern = re.compile(r'bbox((\s+\d+){4})')
    # Matches "baseline <slope> <intercept>" inside an hOCR title attribute.
    baseline_pattern = re.compile(
        r'''
        baseline \s+
        ([\-\+]?\d*\.?\d*) \s+  # +/- decimal float
        ([\-\+]?\d+)            # +/- int''',
        re.VERBOSE,
    )
    # str.maketrans() only accepts single-character keys, so the ligature code
    # points are written as escapes to survive any text normalisation.
    ligatures = str.maketrans(
        {
            '\ufb00': 'ff',
            '\ufb03': 'ffi',
            '\ufb04': 'ffl',
            '\ufb01': 'fi',
            '\ufb02': 'fl',
        }
    )

    def __init__(self, *, hocr_filename: Union[str, Path], dpi: float):
        """Parse *hocr_filename* and record the page size in points.

        Arguments:
            hocr_filename: Path of the hOCR file to read.
            dpi: Resolution used to convert hOCR pixel values to points.

        Raises:
            HocrTransformError: If the file contains no ``ocr_page`` div.
        """
        self.dpi = dpi
        self.hocr = ElementTree.parse(os.fspath(hocr_filename))

        # if the hOCR file has a namespace, ElementTree requires its use to
        # find elements
        matches = re.match(r'({.*})html', self.hocr.getroot().tag)
        self.xmlns = ''
        if matches:
            self.xmlns = matches.group(1)

        # get dimension in pt (not pixel!!!!) of the OCRed image
        self.width, self.height = None, None
        for div in self.hocr.findall(self._child_xpath('div', 'ocr_page')):
            coords = self.element_coordinates(div)
            pt_coords = self.pt_from_pixel(coords)
            self.width = pt_coords.x2 - pt_coords.x1
            self.height = pt_coords.y2 - pt_coords.y1
            # there shouldn't be more than one, and if there is, we don't want
            # it
            break
        if self.width is None or self.height is None:
            raise HocrTransformError("hocr file is missing page dimensions")

    def __str__(self):  # pragma: no cover
        """
        Return the textual content of the HTML body
        """
        if self.hocr is None:
            return ''
        body = self.hocr.find(self._child_xpath('body'))
        # Element truthiness reflects the number of children, so a body that
        # only holds text would be falsy; compare against None instead.
        if body is None:
            return ''
        return self._get_element_text(body)

    def _get_element_text(self, element: Element):
        """
        Return the textual content of the element and its children,
        followed by the element's own tail text
        """
        text = ''
        if element.text is not None:
            text += element.text
        for child in element:
            text += self._get_element_text(child)
        if element.tail is not None:
            text += element.tail
        return text

    @classmethod
    def element_coordinates(cls, element: Element) -> Rect:
        """
        Returns a tuple containing the coordinates of the bounding box around
        an element, or an all-zero Rect when the element has no bbox
        """
        out = Rect._make(0 for _ in range(4))
        if 'title' in element.attrib:
            matches = cls.box_pattern.search(element.attrib['title'])
            if matches:
                coords = matches.group(1).split()
                out = Rect._make(int(coords[n]) for n in range(4))
        return out

    @classmethod
    def baseline(cls, element: Element) -> Tuple[float, float]:
        """
        Returns a tuple containing the baseline slope and intercept.
        Falls back to (0.0, 0.0) when the element has no baseline entry or
        the entry cannot be parsed as numbers.
        """
        if 'title' in element.attrib:
            matches = cls.baseline_pattern.search(element.attrib['title'])
            if matches:
                try:
                    return float(matches.group(1)), float(matches.group(2))
                except ValueError:
                    # The slope group may capture '' or '.', which float()
                    # rejects; treat such entries as missing.
                    pass
        return (0.0, 0.0)

    def pt_from_pixel(self, pxl) -> Rect:
        """
        Returns the quantity in PDF units (pt) given quantity in pixels
        """
        return Rect._make((c / self.dpi * inch) for c in pxl)

    def _child_xpath(self, html_tag: str, html_class: Optional[str] = None) -> str:
        """Build a descendant XPath for *html_tag*, optionally filtered by class."""
        xpath = f".//{self.xmlns}{html_tag}"
        if html_class:
            xpath += f"[@class='{html_class}']"
        return xpath

    @classmethod
    def replace_unsupported_chars(cls, s: str) -> str:
        """
        Given an input string, returns the corresponding string that does not
        contain any ligature (to allow easy search in the PDF file)
        """
        return s.translate(cls.ligatures)

    def topdown_position(self, element):
        """Sort key: bottom edge of *element* in points, measured from the page top."""
        pxl_line_coords = self.element_coordinates(element)
        line_box = self.pt_from_pixel(pxl_line_coords)
        # Coordinates here are still in the hocr coordinate system, so 0 on the y axis
        # is the top of the page and increasing values of y will move towards the
        # bottom of the page.
        return line_box.y2

    @staticmethod
    def _register_fonts() -> None:
        """Register the bundled Greek fonts with reportlab, loading each file once."""
        registered = pdfmetrics.getRegisteredFontNames()
        if 'Greek' not in registered:
            pdfmetrics.registerFont(TTFont('Greek', 'static/fonts/greek.ttf'))
        if 'GreekB' not in registered:
            pdfmetrics.registerFont(TTFont('GreekB', 'static/fonts/greek-bold.ttf'))
        registerFontFamily('Greek', normal='Greek', bold='GreekB')

    def to_pdf(
        self,
        *,
        out_filename: Path,
        image_filename: Optional[Path] = None,
        show_bounding_boxes: bool = False,
        fontname: str = "Helvetica",
        invisible_text: bool = False,
        interword_spaces: bool = False,
    ) -> None:
        """
        Creates a PDF file with the hOCR text drawn over an optional image.
        Text is positioned according to the bounding box of the lines in
        the hOCR file.
        The image need not be identical to the image used to create the hOCR
        file.
        It can have a lower resolution, different color mode, etc.

        Arguments:
            out_filename: Path of PDF to write.
            image_filename: Image drawn first, scaled to fill the page. If
                omitted, only the text is drawn.
            show_bounding_boxes: Show bounding boxes around various text regions,
                for debugging.
            fontname: Font used to measure word widths. The text itself is
                always set in the registered 'Greek' font.
            invisible_text: If True, text is rendered invisible so that is
                selectable but never drawn. If False, text is visible.
            interword_spaces: If True, insert spaces between words rather than
                drawing each word without spaces. Generally this improves text
                extraction.
        """
        # create the PDF file
        # page size in points (1/72 in.)
        self._register_fonts()

        pdf = Canvas(
            os.fspath(out_filename),
            pagesize=(self.width, self.height),
            pageCompression=1,
        )

        # put the image on the page first, scaled to fill the page, so the
        # text is drawn on top of it
        if image_filename is not None:
            pdf.drawImage(
                os.fspath(image_filename), 0, 0, width=self.width, height=self.height
            )

        # draw bounding box for each paragraph (black outline, black fill)
        pdf.setStrokeColor(black)
        pdf.setFillColor(black)
        pdf.setLineWidth(1)
        for elem in self.hocr.iterfind(self._child_xpath('p', 'ocr_par')):
            elemtxt = self._get_element_text(elem).rstrip()
            if len(elemtxt) == 0:
                continue

            pxl_coords = self.element_coordinates(elem)
            pt = self.pt_from_pixel(pxl_coords)

            # draw the bbox border
            if show_bounding_boxes:  # pragma: no cover
                pdf.rect(
                    pt.x1, self.height - pt.y2, pt.x2 - pt.x1, pt.y2 - pt.y1, fill=1
                )

        found_lines = False
        for line in sorted(
            chain(
                self.hocr.iterfind(self._child_xpath('span', 'ocr_header')),
                self.hocr.iterfind(self._child_xpath('span', 'ocr_line')),
                self.hocr.iterfind(self._child_xpath('span', 'ocr_textfloat')),
            ),
            key=self.topdown_position,
        ):
            found_lines = True
            self._do_line(
                pdf,
                line,
                "ocrx_word",
                fontname,
                invisible_text,
                interword_spaces,
                show_bounding_boxes,
            )

        if not found_lines:
            # Tesseract did not report any lines (just words)
            root = self.hocr.find(self._child_xpath('div', 'ocr_page'))
            self._do_line(
                pdf,
                root,
                "ocrx_word",
                fontname,
                invisible_text,
                interword_spaces,
                show_bounding_boxes,
            )

        # finish up the page and save it
        pdf.showPage()
        pdf.save()

    @classmethod
    def polyval(cls, poly, x):  # pragma: no cover
        """Evaluate the line ``poly[0] * x + poly[1]``."""
        return x * poly[0] + poly[1]

    def _do_line(
        self,
        pdf: Canvas,
        line: Optional[Element],
        elemclass: str,
        fontname: str,
        invisible_text: bool,
        interword_spaces: bool,
        show_bounding_boxes: bool,
    ):
        """Draw every *elemclass* span found under *line* onto *pdf*.

        Arguments:
            pdf: Canvas to draw on.
            line: Line (or page) element whose words are drawn; ``None`` or an
                element without children draws nothing.
            elemclass: hOCR class of the word spans to draw.
            fontname: Font used to measure word widths.
            invisible_text: If True, use the invisible text render mode.
            interword_spaces: If True, append a space to every word.
            show_bounding_boxes: If True, draw the baseline and word boxes.
        """
        # Explicit checks instead of Element truthiness, which is deprecated
        # and only reflects the number of children.
        if line is None or len(line) == 0:
            return
        pxl_line_coords = self.element_coordinates(line)
        line_box = self.pt_from_pixel(pxl_line_coords)
        line_height = line_box.y2 - line_box.y1

        slope, pxl_intercept = self.baseline(line)
        if abs(slope) < 0.005:
            slope = 0.0
        cos_a = cos(atan(slope))

        text = pdf.beginText()
        intercept = pxl_intercept / self.dpi * inch

        # Don't allow the font to break out of the bounding box. Division by
        # cos_a accounts for extra clearance between the glyph's vertical axis
        # on a sloped baseline and the edge of the bounding box. The 1.2
        # factor enlarges the result.
        fontsize = (line_height - abs(intercept)) / cos_a * 1.2
        # NOTE(review): text is always set in 'Greek' while widths below are
        # measured with *fontname* -- confirm this mismatch is intended.
        text.setFont('Greek', fontsize)
        if invisible_text:
            text.setTextRenderMode(3)  # Invisible (indicates OCR text)

        # Intercept is normally negative, so this places it above the bottom
        # of the line box
        baseline_y2 = self.height - (line_box.y2 + intercept)

        if show_bounding_boxes:  # pragma: no cover
            # draw the baseline in magenta
            pdf.setDash()
            pdf.setStrokeColor(magenta)
            pdf.setLineWidth(0.5)
            # negate slope because it is defined as a rise/run in pixel
            # coordinates and page coordinates have the y axis flipped
            pdf.line(
                line_box.x1,
                baseline_y2,
                line_box.x2,
                self.polyval((-slope, baseline_y2), line_box.x2 - line_box.x1),
            )
            # dashed red for bounding box of word/line
            pdf.setDash(6, 3)
            pdf.setStrokeColor(red)

        # The line is placed without rotation: only the origin is set, so a
        # sloped baseline affects the font size but not the text angle.
        text.setTextOrigin(line_box.x1, baseline_y2)
        pdf.setFillColor(black)  # text in black

        elements = line.findall(self._child_xpath('span', elemclass))
        for elem in elements:
            elemtxt = self._get_element_text(elem).strip()
            elemtxt = self.replace_unsupported_chars(elemtxt)
            if elemtxt == '':
                continue

            pxl_coords = self.element_coordinates(elem)
            box = self.pt_from_pixel(pxl_coords)
            if interword_spaces:
                # if `--interword-spaces` is true, append a space
                # to the end of each text element to allow simpler PDF viewers
                # such as PDF.js to better recognize words in search and copy
                # and paste. Do not remove space from last word in line, even
                # though it would look better, because it will interfere with
                # naive text extraction. \n does not work either.
                elemtxt += ' '
                box = Rect._make(
                    (
                        box.x1,
                        line_box.y1,
                        box.x2 + pdf.stringWidth(' ', fontname, line_height),
                        line_box.y2,
                    )
                )
            box_width = box.x2 - box.x1
            font_width = pdf.stringWidth(elemtxt, fontname, fontsize)

            # draw the bbox border
            if show_bounding_boxes:  # pragma: no cover
                pdf.rect(
                    box.x1, self.height - line_box.y2, box_width, line_height, fill=0
                )

            # Adjust relative position of cursor
            # This is equivalent to:
            #   text.setTextOrigin(pt.x1, self.height - line_box.y2)
            # but the former generates a full text reposition matrix (Tm) in the
            # content stream while this issues a "offset" (Td) command.
            # .moveCursor() is relative to start of the text line, where the
            # "text line" means whatever reportlab defines it as. Do not
            # use .getCursor(), since moveCursor() rather unintuitively plans
            # its moves relative to .getStartOfLine().
            cursor = text.getStartOfLine()
            dx = box.x1 - cursor[0]
            dy = baseline_y2 - cursor[1]
            text.moveCursor(dx, dy)

            # If reportlab tells us this word is 0 units wide, our best seems
            # to be to suppress this text. Horizontal scaling to the word box
            # is intentionally not applied.
            if font_width > 0:
                text.textOut(elemtxt)
        pdf.drawText(text)
|||
|
|||
|
|||
if __name__ == "__main__":
    # Command-line entry point: read the options, parse the hOCR file and
    # write the PDF.
    cli = argparse.ArgumentParser(description='Convert hocr file to PDF')
    # (flags, keyword options) in the order they appear in --help
    argument_specs = (
        (
            ('-b', '--boundingboxes'),
            dict(
                action="store_true",
                default=False,
                help='Show bounding boxes borders',
            ),
        ),
        (
            ('-r', '--resolution'),
            dict(
                type=int,
                default=300,
                help='Resolution of the image that was OCRed',
            ),
        ),
        (
            ('-i', '--image'),
            dict(
                default=None,
                help='Path to the image to be placed above the text',
            ),
        ),
        (
            ('--interword-spaces',),
            dict(
                action='store_true',
                default=False,
                help='Add spaces between words',
            ),
        ),
        (('hocrfile',), dict(help='Path to the hocr file to be parsed')),
        (('outputfile',), dict(help='Path to the PDF file to be generated')),
    )
    for flags, options in argument_specs:
        cli.add_argument(*flags, **options)
    args = cli.parse_args()

    hocr = HocrTransform(hocr_filename=args.hocrfile, dpi=args.resolution)
    hocr.to_pdf(
        out_filename=args.outputfile,
        image_filename=args.image,
        show_bounding_boxes=args.boundingboxes,
        interword_spaces=args.interword_spaces,
    )
|||
|
@ -0,0 +1,511 @@ |
|||
#!/usr/bin/env python3 |
|||
# |
|||
# Copyright (c) 2010, Jonathan Brinley |
|||
# Original version from: https://github.com/jbrinley/HocrConverter |
|||
# |
|||
# Copyright (c) 2013-14, Julien Pfefferkorn |
|||
# Modifications |
|||
# |
|||
# Copyright (c) 2015-16, James R. Barlow |
|||
# Set text to transparent |
|||
# |
|||
# Permission is hereby granted, free of charge, to any person obtaining a |
|||
# copy of this software and associated documentation files (the |
|||
# "Software"), to deal in the Software without restriction, including |
|||
# without limitation the rights to use, copy, modify, merge, publish, |
|||
# distribute, sublicense, and/or sell copies of the Software, and to |
|||
# permit persons to whom the Software is furnished to do so, subject to |
|||
# the following conditions: |
|||
# |
|||
# The above copyright notice and this permission notice shall be included |
|||
# in all copies or substantial portions of the Software. |
|||
# |
|||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS |
|||
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
|||
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. |
|||
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY |
|||
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, |
|||
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE |
|||
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
|||
|
|||
import argparse |
|||
import os |
|||
import re |
|||
from itertools import chain |
|||
from math import atan, cos, sin |
|||
from pathlib import Path |
|||
from typing import Any, NamedTuple, Optional, Tuple, Union |
|||
from xml.etree import ElementTree |
|||
|
|||
from reportlab.lib.colors import black, cyan, magenta, red |
|||
from reportlab.lib.units import inch |
|||
from reportlab.pdfgen.canvas import Canvas |
|||
from reportlab.pdfbase import pdfmetrics |
|||
from reportlab.pdfbase.ttfonts import TTFont |
|||
from reportlab.pdfbase.pdfmetrics import registerFontFamily |
|||
|
|||
|
|||
|
|||
# According to Wikipedia these languages are supported in the ISO-8859-1 character
# set, meaning reportlab can generate them and they are compatible with hocr,
# assuming Tesseract has the necessary languages installed. Note that there may
# not be language packs for them.
# NOTE(review): some code/comment pairs look mismatched against ISO 639
# (e.g. 'mal', 'mga') -- confirm before relying on this table.
HOCR_OK_LANGS = frozenset(
    [
        # Languages fully covered by Latin-1:
        'afr',  # Afrikaans
        'alb',  # Albanian
        'ast',  # Leonese
        'baq',  # Basque
        'bre',  # Breton
        'cos',  # Corsican
        'eng',  # English
        'eus',  # Basque
        'fao',  # Faroese
        'gla',  # Scottish Gaelic
        'glg',  # Galician
        'glv',  # Manx
        'ice',  # Icelandic
        'ind',  # Indonesian
        'isl',  # Icelandic
        'ita',  # Italian
        'ltz',  # Luxembourgish
        'mal',  # Malay Rumi
        'mga',  # Irish
        'nor',  # Norwegian
        'oci',  # Occitan
        'por',  # Portuguese
        'roh',  # Romansh
        'sco',  # Scots
        'sma',  # Sami
        'spa',  # Spanish
        'sqi',  # Albanian
        'swa',  # Swahili
        'swe',  # Swedish
        'tgl',  # Tagalog
        'wln',  # Walloon
        # Languages supported by Latin-1 except for a few rare characters that OCR
        # is probably not trained to recognize anyway:
        'cat',  # Catalan
        'cym',  # Welsh
        'dan',  # Danish
        'deu',  # German
        'dut',  # Dutch
        'est',  # Estonian
        'fin',  # Finnish
        'fra',  # French
        'hun',  # Hungarian
        'kur',  # Kurdish
        'nld',  # Dutch
        'wel',  # Welsh
    ]
)
|||
|
|||
|
|||
# Short alias used in the type annotations below.
Element = ElementTree.Element
|||
|
|||
|
|||
class Rect(NamedTuple):  # pylint: disable=inherit-non-class
    """A rectangle for managing PDF coordinates.

    The fields follow the order of an hOCR ``bbox`` entry. Callers in this
    module compute width as ``x2 - x1`` and height as ``y2 - y1``. The same
    type is used for pixel values and for values converted to points.
    """

    x1: Any  # first x coordinate of the box
    y1: Any  # first y coordinate of the box
    x2: Any  # second x coordinate of the box
    y2: Any  # second y coordinate of the box
|||
|
|||
|
|||
class HocrTransformError(Exception):
    """Raised when an hOCR document cannot be converted."""
|||
|
|||
|
|||
class HocrTransform: |
|||
|
|||
""" |
|||
A class for converting documents from the hOCR format. |
|||
For details of the hOCR format, see: |
|||
http://kba.cloud/hocr-spec/ |
|||
""" |
|||
|
|||
# Matches "bbox x1 y1 x2 y2" inside an hOCR title attribute.
box_pattern = re.compile(r'bbox((\s+\d+){4})')
# Matches "baseline <slope> <intercept>" inside an hOCR title attribute.
baseline_pattern = re.compile(
    r'''
    baseline \s+
    ([\-\+]?\d*\.?\d*) \s+  # +/- decimal float
    ([\-\+]?\d+)            # +/- int''',
    re.VERBOSE,
)
# str.maketrans() only accepts single-character keys, so the ligature code
# points are written as escapes to survive any text normalisation.
ligatures = str.maketrans(
    {
        '\ufb00': 'ff',
        '\ufb03': 'ffi',
        '\ufb04': 'ffl',
        '\ufb01': 'fi',
        '\ufb02': 'fl',
    }
)
|||
|
|||
def __init__(self, *, hocr_filename: Union[str, Path], dpi: float):
    """Parse *hocr_filename* and record the page size in points.

    Arguments:
        hocr_filename: Path of the hOCR file to read.
        dpi: Resolution used to convert hOCR pixel values to points.

    Raises:
        HocrTransformError: If the file contains no ``ocr_page`` div.
    """
    self.dpi = dpi
    self.hocr = ElementTree.parse(os.fspath(hocr_filename))

    # A namespaced root tag looks like "{uri}html"; ElementTree needs that
    # "{uri}" prefix on every tag name it searches for.
    namespace = re.match(r'({.*})html', self.hocr.getroot().tag)
    self.xmlns = namespace.group(1) if namespace else ''

    # Page dimensions are stored in pt, not pixels.
    self.width, self.height = None, None
    pages = self.hocr.findall(self._child_xpath('div', 'ocr_page'))
    if pages:
        # Only the first page div is used; any further ones are ignored.
        page_box = self.pt_from_pixel(self.element_coordinates(pages[0]))
        self.width = page_box.x2 - page_box.x1
        self.height = page_box.y2 - page_box.y1
    if self.width is None or self.height is None:
        raise HocrTransformError("hocr file is missing page dimensions")
|||
|
|||
def __str__(self): # pragma: no cover |
|||
""" |
|||
Return the textual content of the HTML body |
|||
""" |
|||
if self.hocr is None: |
|||
return '' |
|||
body = self.hocr.find(self._child_xpath('body')) |
|||
if body: |
|||
return self._get_element_text(body) |
|||
else: |
|||
return '' |
|||
|
|||
def _get_element_text(self, element: Element): |
|||
""" |
|||
Return the textual content of the element and its children |
|||
""" |
|||
text = '' |
|||
if element.text is not None: |
|||
text += element.text |
|||
for child in element: |
|||
text += self._get_element_text(child) |
|||
if element.tail is not None: |
|||
text += element.tail |
|||
return text |
|||
|
|||
@classmethod
def element_coordinates(cls, element: Element) -> Rect:
    """
    Return the bounding box given in the element's ``title`` attribute as a
    Rect of ints, or an all-zero Rect when no bbox entry is present
    """
    title = element.attrib.get('title')
    if title is not None:
        found = cls.box_pattern.search(title)
        if found:
            # The pattern captures exactly four whitespace-separated integers.
            x1, y1, x2, y2 = (int(value) for value in found.group(1).split())
            return Rect(x1, y1, x2, y2)
    return Rect(0, 0, 0, 0)
|||
|
|||
@classmethod |
|||
def baseline(cls, element: Element) -> Tuple[float, float]: |
|||
""" |
|||
Returns a tuple containing the baseline slope and intercept. |
|||
""" |
|||
if 'title' in element.attrib: |
|||
matches = cls.baseline_pattern.search(element.attrib['title']) |
|||
if matches: |
|||
return float(matches.group(1)), int(matches.group(2)) |
|||
return (0.0, 0.0) |
|||
|
|||
def pt_from_pixel(self, pxl) -> Rect:
    """
    Convert each pixel value in *pxl* to PDF units (pt) using ``self.dpi``
    and return the results as a Rect
    """
    points = [value / self.dpi * inch for value in pxl]
    return Rect._make(points)
|||
|
|||
def _child_xpath(self, html_tag: str, html_class: Optional[str] = None) -> str: |
|||
xpath = f".//{self.xmlns}{html_tag}" |
|||
if html_class: |
|||
xpath += f"[@class='{html_class}']" |
|||
return xpath |
|||
|
|||
@classmethod |
|||
def replace_unsupported_chars(cls, s: str) -> str: |
|||
""" |
|||
Given an input string, returns the corresponding string that: |
|||
* is available in the Helvetica facetype |
|||
* does not contain any ligature (to allow easy search in the PDF file) |
|||
""" |
|||
return s.translate(cls.ligatures) |
|||
|
|||
def topdown_position(self, element):
    """Sort key: bottom edge of *element* in points, measured from the page top."""
    # Coordinates are still in the hocr coordinate system here: y is 0 at the
    # top of the page and grows towards the bottom.
    return self.pt_from_pixel(self.element_coordinates(element)).y2
|||
|
|||
def to_pdf(
    self,
    *,
    out_filename: Path,
    image_filename: Optional[Path] = None,
    show_bounding_boxes: bool = False,
    fontname: str = "Helvetica",
    invisible_text: bool = False,
    interword_spaces: bool = False,
) -> None:
    """
    Write a single-page PDF holding the hOCR text and, optionally, an image
    drawn over it.

    Text is positioned according to the bounding boxes of the lines in the
    hOCR file. The image need not be the one the hOCR file was created
    from; it may have a lower resolution, a different color mode, etc.

    Arguments:
        out_filename: Path of PDF to write.
        image_filename: Image drawn after the text, scaled to fill the
            page. If omitted, only the text is drawn.
        show_bounding_boxes: Show bounding boxes around various text regions,
            for debugging.
        fontname: Name of font to use.
        invisible_text: If True, text is rendered invisible so that it is
            selectable but never drawn. If False, text is visible and may
            be seen if the image is skipped or deleted in Acrobat.
        interword_spaces: If True, insert spaces between words rather than
            drawing each word without spaces. Generally this improves text
            extraction.
    """
    font_files = (
        ('Greek', 'static/fonts/greek.ttf'),
        ('GreekB', 'static/fonts/greek-bold.ttf'),
    )
    for face, ttf_path in font_files:
        pdfmetrics.registerFont(TTFont(face, ttf_path))
    registerFontFamily('Greek', normal='Greek', bold='GreekB')

    # The page size is given in points (1/72 in.).
    page_width, page_height = self.width, self.height
    canvas = Canvas(
        os.fspath(out_filename),
        pagesize=(page_width, page_height),
        pageCompression=1,
    )

    # Paragraph boxes: cyan stroke and fill, zero line width.
    canvas.setStrokeColor(cyan)
    canvas.setFillColor(cyan)
    canvas.setLineWidth(0)
    for paragraph in self.hocr.iterfind(self._child_xpath('p', 'ocr_par')):
        if not self._get_element_text(paragraph).rstrip():
            continue
        par_box = self.pt_from_pixel(self.element_coordinates(paragraph))
        if show_bounding_boxes:  # pragma: no cover
            canvas.rect(
                par_box.x1,
                page_height - par_box.y2,
                par_box.x2 - par_box.x1,
                par_box.y2 - par_box.y1,
                fill=1,
            )

    # Arguments shared by every _do_line call, in positional order.
    line_options = (
        "ocrx_word",
        fontname,
        invisible_text,
        interword_spaces,
        show_bounding_boxes,
    )
    line_classes = ('ocr_header', 'ocr_line', 'ocr_textfloat')
    lines = sorted(
        chain.from_iterable(
            [self.hocr.iterfind(self._child_xpath('span', css)) for css in line_classes]
        ),
        key=self.topdown_position,
    )
    for line in lines:
        self._do_line(canvas, line, *line_options)

    if not lines:
        # Tesseract did not report any lines (just words), so treat the
        # whole page as one line.
        page = self.hocr.find(self._child_xpath('div', 'ocr_page'))
        self._do_line(canvas, page, *line_options)

    # put the image on the page, scaled to fill the page
    if image_filename is not None:
        canvas.drawImage(
            os.fspath(image_filename), 0, 0, width=page_width, height=page_height
        )

    # finish up the page and save it
    canvas.showPage()
    canvas.save()
|||
|
|||
@classmethod |
|||
def polyval(cls, poly, x): # pragma: no cover |
|||
return x * poly[0] + poly[1] |
|||
|
|||
def _do_line(
    self,
    pdf: Canvas,
    line: Optional[Element],
    elemclass: str,
    fontname: str,
    invisible_text: bool,
    interword_spaces: bool,
    show_bounding_boxes: bool,
):
    """
    Write the words of one hOCR line onto the canvas as a single text object.

    Arguments:
        pdf: Canvas that receives the text.
        line: Element containing the words. Nothing is drawn when it is
            None or has no child elements.
        elemclass: Value of the ``class`` attribute of the ``span`` elements
            that are treated as words.
        fontname: Font name used when measuring string widths. The text
            itself is always set in the registered 'Greek' font.
        invisible_text: If True, select text render mode 3 (invisible).
        interword_spaces: If True, append a space to every word.
        show_bounding_boxes: Accepted for interface compatibility; the debug
            drawing in this method is currently switched off.
    """
    # Truth-testing an Element means "has children" and is deprecated, so the
    # two cases that must be skipped are spelled out explicitly.
    if line is None or len(line) == 0:
        return
    pxl_line_coords = self.element_coordinates(line)
    line_box = self.pt_from_pixel(pxl_line_coords)
    line_height = line_box.y2 - line_box.y1

    slope, pxl_intercept = self.baseline(line)
    if abs(slope) < 0.005:
        # Treat nearly flat baselines as exactly horizontal.
        slope = 0.0
    angle = atan(slope)
    cos_a = cos(angle)

    text = pdf.beginText()
    intercept = pxl_intercept / self.dpi * inch

    # Don't allow the font to break out of the bounding box. Division by
    # cos_a accounts for extra clearance between the glyph's vertical axis
    # on a sloped baseline and the edge of the bounding box.
    fontsize = (line_height - abs(intercept)) / cos_a * 1.2
    text.setFont('Greek', fontsize)
    if invisible_text:
        text.setTextRenderMode(3)  # Invisible (indicates OCR text)

    # Intercept is normally negative, so this places it above the bottom
    # of the line box
    baseline_y2 = self.height - (line_box.y2 + intercept)

    if False:  # pragma: no cover
        # Debug aid (disabled): draw the baseline in magenta, dashed
        pdf.setDash()
        pdf.setStrokeColor(magenta)
        pdf.setLineWidth(0.5)
        # negate slope because it is defined as a rise/run in pixel
        # coordinates and page coordinates have the y axis flipped
        pdf.line(
            line_box.x1,
            baseline_y2,
            line_box.x2,
            self.polyval((-slope, baseline_y2), line_box.x2 - line_box.x1),
        )
        # dashed red outline for bounding box of word/line
        pdf.setDash(6, 3)
        pdf.setStrokeColor(red)

    # NOTE: no rotation is applied to the text object; the baseline slope
    # only influences the font size computed above.
    text.setTextOrigin(line_box.x1, baseline_y2)
    pdf.setFillColor(black)  # text in black

    elements = line.findall(self._child_xpath('span', elemclass))
    for elem in elements:
        elemtxt = self._get_element_text(elem).strip()
        elemtxt = self.replace_unsupported_chars(elemtxt)
        if elemtxt == '':
            continue

        pxl_coords = self.element_coordinates(elem)
        box = self.pt_from_pixel(pxl_coords)
        if interword_spaces:
            # if `--interword-spaces` is true, append a space
            # to the end of each text element to allow simpler PDF viewers
            # such as PDF.js to better recognize words in search and copy
            # and paste. Do not remove space from last word in line, even
            # though it would look better, because it will interfere with
            # naive text extraction. \n does not work either.
            elemtxt += ' '
            box = Rect._make(
                (
                    box.x1,
                    line_box.y1,
                    box.x2 + pdf.stringWidth(' ', fontname, line_height),
                    line_box.y2,
                )
            )
        box_width = box.x2 - box.x1
        # NOTE(review): the width is measured with `fontname`, while the text
        # is set in 'Greek'; it is only used for the zero-width test below.
        font_width = pdf.stringWidth(elemtxt, fontname, fontsize)

        # Debug aid (disabled): draw the bbox border
        if False:  # pragma: no cover
            pdf.rect(
                box.x1, self.height - line_box.y2, box_width, line_height, fill=0
            )

        # Adjust relative position of cursor.
        # This is equivalent to:
        #   text.setTextOrigin(box.x1, baseline_y2)
        # but that generates a full text reposition matrix (Tm) in the
        # content stream while this issues an "offset" (Td) command.
        # .moveCursor() is relative to start of the text line, where the
        # "text line" means whatever reportlab defines it as. Do not use
        # .getCursor(), since moveCursor() rather unintuitively plans
        # its moves relative to .getStartOfLine().
        cursor = text.getStartOfLine()
        dx = box.x1 - cursor[0]
        dy = baseline_y2 - cursor[1]
        text.moveCursor(dx, dy)

        # If reportlab tells us this word is 0 units wide, our best seems
        # to be to suppress this text. Words are emitted at their natural
        # width; no horizontal scaling to the word box is applied.
        if font_width > 0:
            text.textOut(elemtxt)
    pdf.drawText(text)
|||
|
|||
|
|||
if __name__ == "__main__":
    # Command-line entry point: convert one hOCR file into one PDF.
    cli = argparse.ArgumentParser(description='Convert hocr file to PDF')

    # (flags, keyword settings) for every argument, in registration order.
    argument_specs = (
        (
            ('-b', '--boundingboxes'),
            dict(
                action="store_true",
                default=False,
                help='Show bounding boxes borders',
            ),
        ),
        (
            ('-r', '--resolution'),
            dict(
                type=int,
                default=300,
                help='Resolution of the image that was OCRed',
            ),
        ),
        (
            ('-i', '--image'),
            dict(
                default=None,
                help='Path to the image to be placed above the text',
            ),
        ),
        (
            ('--interword-spaces',),
            dict(
                action='store_true',
                default=False,
                help='Add spaces between words',
            ),
        ),
        (('hocrfile',), dict(help='Path to the hocr file to be parsed')),
        (('outputfile',), dict(help='Path to the PDF file to be generated')),
    )
    for flags, settings in argument_specs:
        cli.add_argument(*flags, **settings)
    opts = cli.parse_args()

    converter = HocrTransform(hocr_filename=opts.hocrfile, dpi=opts.resolution)
    converter.to_pdf(
        out_filename=opts.outputfile,
        image_filename=opts.image,
        show_bounding_boxes=opts.boundingboxes,
        interword_spaces=opts.interword_spaces,
    )
|||
|
@ -0,0 +1,518 @@ |
|||
#!venv/bin python3 |
|||
# |
|||
# Copyright (c) 2010, Jonathan Brinley |
|||
# Original version from: https://github.com/jbrinley/HocrConverter |
|||
# |
|||
# Copyright (c) 2013-14, Julien Pfefferkorn |
|||
# Modifications |
|||
# |
|||
# Copyright (c) 2015-16, James R. Barlow |
|||
# Set text to transparent |
|||
|
|||
# Copyright (c) 2022, WordMord & Alex Roidl |
|||
# Set text back to visible and change bounding boxes |
|||
# |
|||
# Permission is hereby granted, free of charge, to any person obtaining a |
|||
# copy of this software and associated documentation files (the |
|||
# "Software"), to deal in the Software without restriction, including |
|||
# without limitation the rights to use, copy, modify, merge, publish, |
|||
# distribute, sublicense, and/or sell copies of the Software, and to |
|||
# permit persons to whom the Software is furnished to do so, subject to |
|||
# the following conditions: |
|||
# |
|||
# The above copyright notice and this permission notice shall be included |
|||
# in all copies or substantial portions of the Software. |
|||
# |
|||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS |
|||
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
|||
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. |
|||
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY |
|||
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, |
|||
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE |
|||
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
|||
|
|||
import argparse |
|||
import os |
|||
import re |
|||
from itertools import chain |
|||
from math import atan, cos, sin |
|||
from pathlib import Path |
|||
from typing import Any, NamedTuple, Optional, Tuple, Union |
|||
from xml.etree import ElementTree |
|||
|
|||
from reportlab.lib.colors import black, cyan, magenta, red |
|||
from reportlab.lib.units import inch |
|||
from reportlab.pdfgen.canvas import Canvas |
|||
from reportlab.pdfbase import pdfmetrics |
|||
from reportlab.pdfbase.ttfonts import TTFont |
|||
from reportlab.pdfbase.pdfmetrics import registerFontFamily |
|||
|
|||
|
|||
|
|||
# According to Wikipedia these languages are supported in the ISO-8859-1 character |
|||
# set, meaning reportlab can generate them and they are compatible with hocr, |
|||
# assuming Tesseract has the necessary languages installed. Note that there may |
|||
# not be language packs for them. |
|||
HOCR_OK_LANGS = frozenset(
    {
        # Languages fully covered by Latin-1:
        'afr',  # Afrikaans
        'alb',  # Albanian
        'ast',  # Leonese
        'baq',  # Basque
        'bre',  # Breton
        'cos',  # Corsican
        'eng',  # English
        'eus',  # Basque
        'fao',  # Faroese
        'gla',  # Scottish Gaelic
        'glg',  # Galician
        'glv',  # Manx
        'ice',  # Icelandic
        'ind',  # Indonesian
        'isl',  # Icelandic
        'ita',  # Italian
        'ltz',  # Luxembourgish
        'mal',  # Malay Rumi
        'mga',  # Irish
        'nor',  # Norwegian
        'oci',  # Occitan
        'por',  # Portuguese
        'roh',  # Romansh
        'sco',  # Scots
        'sma',  # Sami
        'spa',  # Spanish
        'sqi',  # Albanian
        'swa',  # Swahili
        'swe',  # Swedish
        'tgl',  # Tagalog
        'wln',  # Walloon
        # Languages supported by Latin-1 except for a few rare characters
        # that OCR is probably not trained to recognize anyway:
        'cat',  # Catalan
        'cym',  # Welsh
        'dan',  # Danish
        'deu',  # German
        'dut',  # Dutch
        'est',  # Estonian
        'fin',  # Finnish
        'fra',  # French
        'hun',  # Hungarian
        'kur',  # Kurdish
        'nld',  # Dutch
        'wel',  # Welsh
    }
)
|||
|
|||
|
|||
# Short alias for the ElementTree element class, used in the type annotations
# of this module.
Element = ElementTree.Element
|||
|
|||
|
|||
class Rect(NamedTuple):  # pylint: disable=inherit-non-class
    """A rectangle for managing PDF coordinates.

    Instances are built both from the four raw hOCR ``bbox`` values (pixels)
    and from the same values converted to PDF points, which is why the
    fields are left untyped.
    """

    x1: Any  # first bbox value; subtracted from x2 to obtain a width
    y1: Any  # second bbox value; subtracted from y2 to obtain a height
    x2: Any  # third bbox value
    y2: Any  # fourth bbox value; also the sort key for top-down line order
|||
|
|||
|
|||
class HocrTransformError(Exception):
    """Raised when an hOCR document cannot be converted, e.g. when the page
    dimensions are missing."""
|||
|
|||
|
|||
class HocrTransform: |
|||
|
|||
""" |
|||
A class for converting documents from the hOCR format. |
|||
For details of the hOCR format, see: |
|||
http://kba.cloud/hocr-spec/ |
|||
""" |
|||
|
|||
# Captures the four integers that follow ``bbox`` in an hOCR ``title``
# attribute; group 1 holds all four, whitespace separated.
box_pattern = re.compile(r'bbox((\s+\d+){4})')
# Captures the two values that follow ``baseline`` in an hOCR ``title``
# attribute: group 1 is the slope, group 2 the integer intercept.
baseline_pattern = re.compile(
    r'''
    baseline \s+
    ([\-\+]?\d*\.?\d*) \s+  # +/- decimal float
    ([\-\+]?\d+)            # +/- int''',
    re.VERBOSE,
)
# Translation table that expands single-character typographic ligatures to
# plain letters. The keys are written as escapes because str.maketrans()
# requires every string key to be exactly one character long; a literal
# ligature that gets normalised to 'ff', 'fi', ... makes it raise ValueError.
ligatures = str.maketrans(
    {
        '\ufb00': 'ff',  # LATIN SMALL LIGATURE FF
        '\ufb03': 'ffi',  # LATIN SMALL LIGATURE FFI
        '\ufb04': 'ffl',  # LATIN SMALL LIGATURE FFL
        '\ufb01': 'fi',  # LATIN SMALL LIGATURE FI
        '\ufb02': 'fl',  # LATIN SMALL LIGATURE FL
    }
)
|||
|
|||
def __init__(self, *, hocr_filename: Union[str, Path], dpi: float):
    """
    Parse an hOCR file and record the page size.

    Arguments:
        hocr_filename: Path of the hOCR document to parse.
        dpi: Resolution used to convert pixel coordinates to PDF points.

    Raises:
        HocrTransformError: If the document has no ``ocr_page`` div from
            which the page dimensions can be taken.
    """
    self.dpi = dpi
    self.hocr = ElementTree.parse(os.fspath(hocr_filename))

    # ElementTree needs the namespace prefix (when the document declares
    # one) in every element lookup, so remember it once.
    ns_match = re.match(r'({.*})html', self.hocr.getroot().tag)
    self.xmlns = ns_match.group(1) if ns_match else ''

    # Page dimensions are stored in points, not pixels. Only the first
    # ``ocr_page`` div is considered.
    self.width, self.height = None, None
    page = self.hocr.find(self._child_xpath('div', 'ocr_page'))
    if page is not None:
        page_box = self.pt_from_pixel(self.element_coordinates(page))
        self.width = page_box.x2 - page_box.x1
        self.height = page_box.y2 - page_box.y1
    if self.width is None or self.height is None:
        raise HocrTransformError("hocr file is missing page dimensions")
|||
|
|||
def __str__(self): # pragma: no cover |
|||
""" |
|||
Return the textual content of the HTML body |
|||
""" |
|||
if self.hocr is None: |
|||
return '' |
|||
body = self.hocr.find(self._child_xpath('body')) |
|||
if body: |
|||
return self._get_element_text(body) |
|||
else: |
|||
return '' |
|||
|
|||
def _get_element_text(self, element: Element): |
|||
""" |
|||
Return the textual content of the element and its children |
|||
""" |
|||
text = '' |
|||
if element.text is not None: |
|||
text += element.text |
|||
for child in element: |
|||
text += self._get_element_text(child) |
|||
if element.tail is not None: |
|||
text += element.tail |
|||
return text |
|||
|
|||
@classmethod
def element_coordinates(cls, element: Element) -> Rect:
    """
    Return the bounding box stored in the ``title`` attribute of ``element``.

    The four ``bbox`` integers are returned as a Rect. When the element has
    no ``title`` attribute, or the attribute holds no ``bbox`` entry, a Rect
    of four zeros is returned.
    """
    title = element.attrib.get('title')
    found = None if title is None else cls.box_pattern.search(title)
    if found is None:
        return Rect(0, 0, 0, 0)
    return Rect._make(int(value) for value in found.group(1).split())
|||
|
|||
@classmethod |
|||
def baseline(cls, element: Element) -> Tuple[float, float]: |
|||
""" |
|||
Returns a tuple containing the baseline slope and intercept. |
|||
""" |
|||
if 'title' in element.attrib: |
|||
matches = cls.baseline_pattern.search(element.attrib['title']) |
|||
if matches: |
|||
return float(matches.group(1)), int(matches.group(2)) |
|||
return (0.0, 0.0) |
|||
|
|||
def pt_from_pixel(self, pxl) -> Rect:
    """
    Convert a sequence of four pixel quantities to PDF units (pt), using
    the resolution stored in ``self.dpi``, and return them as a Rect.
    """
    points = [value / self.dpi * inch for value in pxl]
    return Rect._make(points)
|||
|
|||
def _child_xpath(self, html_tag: str, html_class: Optional[str] = None) -> str: |
|||
xpath = f".//{self.xmlns}{html_tag}" |
|||
if html_class: |
|||
xpath += f"[@class='{html_class}']" |
|||
return xpath |
|||
|
|||
@classmethod |
|||
def replace_unsupported_chars(cls, s: str) -> str: |
|||
""" |
|||
Given an input string, returns the corresponding string that: |
|||
* is available in the Helvetica facetype |
|||
* does not contain any ligature (to allow easy search in the PDF file) |
|||
""" |
|||
return s.translate(cls.ligatures) |
|||
|
|||
def topdown_position(self, element):
    """
    Sort key for ordering elements from the top of the page downwards:
    the bottom edge (``y2``) of the element's bounding box, in points.
    """
    # The value is still in the hOCR coordinate system: y is 0 at the top of
    # the page and grows towards the bottom.
    return self.pt_from_pixel(self.element_coordinates(element)).y2
|||
|
|||
def to_pdf(
    self,
    *,
    out_filename: Path,
    image_filename: Optional[Path] = None,
    show_bounding_boxes: bool = False,
    fontname: str = "Helvetica",
    invisible_text: bool = False,
    interword_spaces: bool = False,
) -> None:
    """
    Write the hOCR page to a single-page PDF file.

    The page has the dimensions recorded at construction time. When an image
    is supplied it is drawn first, scaled to fill the page, and the text is
    drawn afterwards. Text lines are positioned according to their bounding
    boxes in the hOCR file.

    Arguments:
        out_filename: Path of PDF to write.
        image_filename: Optional image drawn across the whole page before
            any text. It need not be the image that was OCRed.
        show_bounding_boxes: If True, draw a filled rectangle for every
            non-empty ``ocr_par`` paragraph; also forwarded to ``_do_line``.
        fontname: Font name forwarded to ``_do_line``.
        invisible_text: Forwarded to ``_do_line``.
        interword_spaces: Forwarded to ``_do_line``.
    """
    # The fonts are loaded from paths relative to the working directory.
    pdfmetrics.registerFont(TTFont('Greek', 'static/fonts/greek.ttf'))
    pdfmetrics.registerFont(TTFont('GreekB', 'static/fonts/greek-bold.ttf'))
    registerFontFamily('Greek', normal='Greek', bold='GreekB')

    # Page size is given in points (1/72 in.).
    canvas = Canvas(
        os.fspath(out_filename),
        pagesize=(self.width, self.height),
        pageCompression=1,
    )

    # The image goes onto the page before anything else.
    if image_filename is not None:
        canvas.drawImage(
            os.fspath(image_filename), 0, 0, width=self.width, height=self.height
        )

    # Black stroke and fill with a 1pt line for the paragraph rectangles.
    canvas.setStrokeColor(black)
    canvas.setFillColor(black)
    canvas.setLineWidth(1)
    for par in self.hocr.iterfind(self._child_xpath('p', 'ocr_par')):
        if not self._get_element_text(par).rstrip():
            continue
        par_box = self.pt_from_pixel(self.element_coordinates(par))
        if not show_bounding_boxes:
            continue
        canvas.rect(  # pragma: no cover
            par_box.x1,
            self.height - par_box.y2,
            par_box.x2 - par_box.x1,
            par_box.y2 - par_box.y1,
            fill=1,
        )

    # Collect every kind of line element and process them top to bottom.
    line_classes = ('ocr_header', 'ocr_line', 'ocr_textfloat')
    candidates = chain.from_iterable(
        self.hocr.iterfind(self._child_xpath('span', line_class))
        for line_class in line_classes
    )
    targets = sorted(candidates, key=self.topdown_position)
    if not targets:
        # No lines were reported (just words): treat the whole page div as
        # one line.
        targets = [self.hocr.find(self._child_xpath('div', 'ocr_page'))]

    for target in targets:
        self._do_line(
            canvas,
            target,
            "ocrx_word",
            fontname,
            invisible_text,
            interword_spaces,
            show_bounding_boxes,
        )

    # finish up the page and save it
    canvas.showPage()
    canvas.save()
|||
|
|||
@classmethod |
|||
def polyval(cls, poly, x): # pragma: no cover |
|||
return x * poly[0] + poly[1] |
|||
|
|||
def _do_line( |
|||
self, |
|||
pdf: Canvas, |
|||
line: Optional[Element], |
|||
elemclass: str, |
|||
fontname: str, |
|||
invisible_text: bool, |
|||
interword_spaces: bool, |
|||
show_bounding_boxes: bool, |
|||
): |
|||
if not line: |
|||
return |
|||
pxl_line_coords = self.element_coordinates(line) |
|||
line_box = self.pt_from_pixel(pxl_line_coords) |
|||
line_height = line_box.y2 - line_box.y1 |
|||
|
|||
slope, pxl_intercept = self.baseline(line) |
|||
if abs(slope) < 0.005: |
|||
slope = 0.0 |
|||
angle = atan(slope) |
|||
cos_a, sin_a = cos(angle), sin(angle) |
|||
|
|||
text = pdf.beginText() |
|||
intercept = pxl_intercept / self.dpi * inch |
|||
|
|||
# Don't allow the font to break out of the bounding box. Division by |
|||
# cos_a accounts for extra clearance between the glyph's vertical axis |
|||
# on a sloped baseline and the edge of the bounding box. |
|||
fontsize = (line_height - abs(intercept)) / cos_a * 1.2 |
|||
#fontsize = 10.5 |
|||
text.setFont('Greek', fontsize) |
|||
#if invisible_text: |
|||
# text.setTextRenderMode(3) # Invisible (indicates OCR text) |
|||
|
|||
# Intercept is normally negative, so this places it above the bottom |
|||
# of the line box |
|||
baseline_y2 = self.height - (line_box.y2 + intercept) |
|||
|
|||
if False: # pragma: no cover |
|||
# draw the baseline in magenta, dashed |
|||
pdf.setDash() |
|||
pdf.setStrokeColor(magenta) |
|||
pdf.setLineWidth(0.5) |
|||
# negate slope because it is defined as a rise/run in pixel |
|||
# coordinates and page coordinates have the y axis flipped |
|||
pdf.line( |
|||
line_box.x1, |
|||
baseline_y2, |
|||
line_box.x2, |
|||
self.polyval((-slope, baseline_y2), line_box.x2 - line_box.x1), |
|||
) |
|||
# light green for bounding box of word/line |
|||
pdf.setDash(6, 3) |
|||
pdf.setStrokeColor(red) |
|||
|
|||
#text.setTextTransform(cos_a, -sin_a, sin_a, cos_a, line_box.x1, baseline_y2) |
|||
text.setTextOrigin(line_box.x1, baseline_y2) |
|||
##pdf.translate(line_box.x1, baseline_y2) |
|||
pdf.setFillColor(black) # text in black |
|||
|
|||
elements = line.findall(self._child_xpath('span', elemclass)) |
|||
for elem in elements: |
|||
elemtxt = self._get_element_text(elem).strip() |
|||
elemtxt = self.replace_unsupported_chars(elemtxt) |
|||
if elemtxt == '': |
|||
continue |
|||
|
|||
pxl_coords = self.element_coordinates(elem) |
|||
box = self.pt_from_pixel(pxl_coords) |
|||
if False: |
|||
# if `--interword-spaces` is true, append a space |
|||
# to the end of each text element to allow simpler PDF viewers |
|||
# such as PDF.js to better recognize words in search and copy |
|||
# and paste. Do not remove space from last word in line, even |
|||
# though it would look better, because it will interfere with |
|||
# naive text extraction. \n does not work either. |
|||
elemtxt += ' ' |
|||
box = Rect._make( |
|||
( |
|||
box.x1, |
|||
line_box.y1, |
|||
box.x2 + pdf.stringWidth(' ', fontname, line_height), |
|||
line_box.y2, |
|||
) |
|||
) |
|||
box_width = box.x2 - box.x1 |
|||
font_width = pdf.stringWidth(elemtxt, fontname, fontsize) |
|||
|
|||
# draw the bbox border |
|||
if False: # pragma: no cover |
|||
pdf.rect( |
|||
box.x1, self.height - line_box.y2, box_width, line_height, fill=0 |
|||
) |
|||
|
|||
# Adjust relative position of cursor |
|||
# This is equivalent to: |
|||