Angeliki
3 years ago
5 changed files with 1163 additions and 0 deletions
@ -1,3 +1,4 @@ |
|||
pdf/* |
|||
images/* |
|||
audio/* |
|||
scripts/venv/* |
|||
|
@ -0,0 +1,500 @@ |
|||
#!/usr/bin/env python3 |
|||
# |
|||
# Copyright (c) 2010, Jonathan Brinley |
|||
# Original version from: https://github.com/jbrinley/HocrConverter |
|||
# |
|||
# Copyright (c) 2013-14, Julien Pfefferkorn |
|||
# Modifications |
|||
# |
|||
# Copyright (c) 2015-16, James R. Barlow |
|||
# Set text to transparent |
|||
# |
|||
# Permission is hereby granted, free of charge, to any person obtaining a |
|||
# copy of this software and associated documentation files (the |
|||
# "Software"), to deal in the Software without restriction, including |
|||
# without limitation the rights to use, copy, modify, merge, publish, |
|||
# distribute, sublicense, and/or sell copies of the Software, and to |
|||
# permit persons to whom the Software is furnished to do so, subject to |
|||
# the following conditions: |
|||
# |
|||
# The above copyright notice and this permission notice shall be included |
|||
# in all copies or substantial portions of the Software. |
|||
# |
|||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS |
|||
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
|||
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. |
|||
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY |
|||
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, |
|||
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE |
|||
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
|||
|
|||
import argparse |
|||
import os |
|||
import re |
|||
from itertools import chain |
|||
from math import atan, cos, sin |
|||
from pathlib import Path |
|||
from typing import Any, NamedTuple, Optional, Tuple, Union |
|||
from xml.etree import ElementTree |
|||
|
|||
from reportlab.lib.colors import black, cyan, magenta, red |
|||
from reportlab.lib.units import inch |
|||
from reportlab.pdfgen.canvas import Canvas |
|||
|
|||
# Tesseract language codes whose alphabets fit in ISO-8859-1 (Latin-1).
# reportlab's standard fonts can render these, so hOCR output for them can be
# turned into a PDF text layer. Being listed here does not guarantee that a
# Tesseract language pack exists for the code.
#
# NOTE(review): the descriptions for 'mal', 'mga' and 'sma' look doubtful
# (ISO 639-2 appears to assign them to Malayalam, Middle Irish and Southern
# Sami respectively) -- confirm before relying on these entries.
HOCR_OK_LANGS = frozenset(
    {
        # --- Fully covered by Latin-1 ---
        'afr',  # Afrikaans
        'alb',  # Albanian
        'ast',  # Leonese
        'baq',  # Basque
        'bre',  # Breton
        'cos',  # Corsican
        'eng',  # English
        'eus',  # Basque
        'fao',  # Faroese
        'gla',  # Scottish Gaelic
        'glg',  # Galician
        'glv',  # Manx
        'ice',  # Icelandic
        'ind',  # Indonesian
        'isl',  # Icelandic
        'ita',  # Italian
        'ltz',  # Luxembourgish
        'mal',  # Malay Rumi
        'mga',  # Irish
        'nor',  # Norwegian
        'oci',  # Occitan
        'por',  # Portuguese
        'roh',  # Romansh
        'sco',  # Scots
        'sma',  # Sami
        'spa',  # Spanish
        'sqi',  # Albanian
        'swa',  # Swahili
        'swe',  # Swedish
        'tgl',  # Tagalog
        'wln',  # Walloon
        # --- Covered by Latin-1 apart from a few rare characters that OCR is
        # --- unlikely to have been trained to recognise anyway ---
        'cat',  # Catalan
        'cym',  # Welsh
        'dan',  # Danish
        'deu',  # German
        'dut',  # Dutch
        'est',  # Estonian
        'fin',  # Finnish
        'fra',  # French
        'hun',  # Hungarian
        'kur',  # Kurdish
        'nld',  # Dutch
        'wel',  # Welsh
    }
)


# Short alias used in the type annotations below.
Element = ElementTree.Element
|||
|
|||
|
|||
class Rect(NamedTuple):  # pylint: disable=inherit-non-class
    """Axis-aligned box stored as two corner points: (x1, y1) and (x2, y2).

    The same tuple type carries hOCR pixel coordinates and their PDF-point
    equivalents, so the fields are deliberately untyped.
    """

    # First corner.
    x1: Any
    y1: Any
    # Opposite corner.
    x2: Any
    y2: Any
|||
|
|||
|
|||
class HocrTransformError(Exception):
    """Raised when an hOCR document cannot be converted."""
|||
|
|||
|
|||
class HocrTransform:
    """Convert a single-page hOCR document into a PDF with positioned text.

    For details of the hOCR format, see:
    http://kba.cloud/hocr-spec/
    """

    # Captures the four integer pixel coordinates of a ``bbox`` property.
    box_pattern = re.compile(r'bbox((\s+\d+){4})')
    # Captures the slope and the integer intercept of a ``baseline`` property.
    baseline_pattern = re.compile(
        r'''
        baseline \s+
        ([\-\+]?\d*\.?\d*) \s+  # +/- decimal float
        ([\-\+]?\d+)            # +/- int''',
        re.VERBOSE,
    )
    # Typographic ligatures mapped to their plain-letter spelling. The keys
    # must be single code points (str.maketrans rejects longer string keys),
    # so they are written as escapes to survive any text normalisation.
    ligatures = str.maketrans(
        {
            '\ufb00': 'ff',
            '\ufb03': 'ffi',
            '\ufb04': 'ffl',
            '\ufb01': 'fi',
            '\ufb02': 'fl',
        }
    )

    def __init__(self, *, hocr_filename: Union[str, Path], dpi: float):
        """Parse the hOCR file and read the page size from its ``ocr_page``.

        Arguments:
            hocr_filename: Path of the hOCR (XHTML) file to parse.
            dpi: Resolution of the OCRed image, used to convert pixels to
                PDF points.

        Raises:
            HocrTransformError: If the document has no ``ocr_page`` div.
        """
        self.dpi = dpi
        self.hocr = ElementTree.parse(os.fspath(hocr_filename))

        # If the hOCR file has a namespace, ElementTree requires its use to
        # find elements. This must be set before _child_xpath() is called.
        matches = re.match(r'({.*})html', self.hocr.getroot().tag)
        self.xmlns = matches.group(1) if matches else ''

        # Page dimensions in pt (not pixels). Only the first ocr_page is
        # used; any further pages are ignored.
        self.width, self.height = None, None
        page = self.hocr.find(self._child_xpath('div', 'ocr_page'))
        if page is not None:
            pt_coords = self.pt_from_pixel(self.element_coordinates(page))
            self.width = pt_coords.x2 - pt_coords.x1
            self.height = pt_coords.y2 - pt_coords.y1
        if self.width is None or self.height is None:
            raise HocrTransformError("hocr file is missing page dimensions")

    def __str__(self):  # pragma: no cover
        """Return the textual content of the HTML body."""
        if self.hocr is None:
            return ''
        body = self.hocr.find(self._child_xpath('body'))
        # Compare against None: an Element without child elements is falsy
        # even when it carries text.
        if body is not None:
            return self._get_element_text(body)
        return ''

    def _get_element_text(self, element: Element):
        """Return the text of the element and its descendants, plus its tail."""
        # itertext() walks text and child tails in document order; the tail of
        # the element itself is not part of that walk, so append it here.
        return ''.join(element.itertext()) + (element.tail or '')

    @classmethod
    def element_coordinates(cls, element: Element) -> Rect:
        """Return the element's ``bbox`` as a Rect of integer pixels.

        A Rect of four zeros is returned when the element has no ``title``
        attribute or the title has no ``bbox`` property.
        """
        matches = cls.box_pattern.search(element.attrib.get('title', ''))
        if matches is None:
            return Rect(0, 0, 0, 0)
        # The pattern guarantees exactly four integer tokens.
        return Rect(*(int(token) for token in matches.group(1).split()))

    @classmethod
    def baseline(cls, element: Element) -> Tuple[float, float]:
        """Return the baseline (slope, intercept) from the element's title.

        Returns (0.0, 0.0) when there is no ``baseline`` property or its
        slope is not a parseable number.
        """
        matches = cls.baseline_pattern.search(element.attrib.get('title', ''))
        if matches:
            try:
                return float(matches.group(1)), int(matches.group(2))
            except ValueError:
                # The slope group may match an empty or sign-only string.
                pass
        return (0.0, 0.0)

    def pt_from_pixel(self, pxl) -> Rect:
        """Convert four pixel quantities to PDF units (pt) using ``self.dpi``."""
        return Rect._make((c / self.dpi * inch) for c in pxl)

    def _child_xpath(self, html_tag: str, html_class: Optional[str] = None) -> str:
        """Build a descendant XPath for a tag, optionally filtered by class."""
        xpath = f".//{self.xmlns}{html_tag}"
        if html_class:
            xpath += f"[@class='{html_class}']"
        return xpath

    @classmethod
    def replace_unsupported_chars(cls, s: str) -> str:
        """Return ``s`` with ligature characters expanded to plain letters.

        Expanding ligatures keeps the text searchable in the PDF file.
        """
        return s.translate(cls.ligatures)

    def topdown_position(self, element):
        """Return the element's bottom edge in pt, for top-to-bottom sorting."""
        line_box = self.pt_from_pixel(self.element_coordinates(element))
        # Coordinates here are still in the hocr coordinate system, so 0 on
        # the y axis is the top of the page and increasing values of y move
        # towards the bottom of the page.
        return line_box.y2

    def to_pdf(
        self,
        *,
        out_filename: Path,
        image_filename: Optional[Path] = None,
        show_bounding_boxes: bool = False,
        fontname: str = "Helvetica",
        invisible_text: bool = False,
        interword_spaces: bool = False,
    ) -> None:
        """Create a PDF file with an image superimposed on top of the text.

        Text is positioned according to the bounding box of the lines in
        the hOCR file. The image need not be identical to the image used to
        create the hOCR file; it can have a lower resolution, a different
        color mode, etc.

        Arguments:
            out_filename: Path of PDF to write.
            image_filename: Image to use for this file. If omitted, the OCR
                text is shown.
            show_bounding_boxes: Show bounding boxes around various text
                regions, for debugging.
            fontname: Name of font to use.
            invisible_text: If True, text is rendered invisible so that it is
                selectable but never drawn. If False, text is visible and may
                be seen if the image is skipped or deleted in Acrobat.
            interword_spaces: If True, insert spaces between words rather
                than drawing each word without spaces. Generally this
                improves text extraction.
        """
        # Create the PDF file; page size is in points (1/72 in.).
        pdf = Canvas(
            os.fspath(out_filename),
            pagesize=(self.width, self.height),
            pageCompression=1,
        )

        # Paragraph bounding boxes are filled in cyan with a zero-width
        # outline.
        pdf.setStrokeColor(cyan)
        pdf.setFillColor(cyan)
        pdf.setLineWidth(0)
        if show_bounding_boxes:  # pragma: no cover
            for elem in self.hocr.iterfind(self._child_xpath('p', 'ocr_par')):
                if not self._get_element_text(elem).rstrip():
                    continue
                pt = self.pt_from_pixel(self.element_coordinates(elem))
                # PDF y grows upwards, hOCR y grows downwards: flip it.
                pdf.rect(
                    pt.x1, self.height - pt.y2, pt.x2 - pt.x1, pt.y2 - pt.y1, fill=1
                )

        # Every line-like element on the page, ordered top to bottom.
        lines = sorted(
            chain(
                self.hocr.iterfind(self._child_xpath('span', 'ocr_header')),
                self.hocr.iterfind(self._child_xpath('span', 'ocr_line')),
                self.hocr.iterfind(self._child_xpath('span', 'ocr_textfloat')),
            ),
            key=self.topdown_position,
        )
        if not lines:
            # Tesseract did not report any lines (just words): treat the
            # whole page as a single line.
            lines = [self.hocr.find(self._child_xpath('div', 'ocr_page'))]
        for line in lines:
            self._do_line(
                pdf,
                line,
                "ocrx_word",
                fontname,
                invisible_text,
                interword_spaces,
                show_bounding_boxes,
            )

        # Put the image on the page, scaled to fill the page.
        if image_filename is not None:
            pdf.drawImage(
                os.fspath(image_filename), 0, 0, width=self.width, height=self.height
            )

        # Finish up the page and save it.
        pdf.showPage()
        pdf.save()

    @classmethod
    def polyval(cls, poly, x):  # pragma: no cover
        """Evaluate the first-degree polynomial ``poly[0] * x + poly[1]``."""
        return x * poly[0] + poly[1]

    def _do_line(
        self,
        pdf: Canvas,
        line: Optional[Element],
        elemclass: str,
        fontname: str,
        invisible_text: bool,
        interword_spaces: bool,
        show_bounding_boxes: bool,
    ):
        """Draw the words of one hOCR line onto the canvas.

        Arguments:
            pdf: Canvas to draw on.
            line: Line element (or page element when no lines exist). Nothing
                is drawn when it is None or has no child elements.
            elemclass: Class of the ``span`` descendants that hold the words.
            fontname: Name of the font used to measure and draw the text.
            invisible_text: If True, use text render mode 3 (invisible).
            interword_spaces: If True, append a space to every word.
            show_bounding_boxes: If True, draw the baseline and word boxes.
        """
        # Same condition as the Element truth test (no children is falsy),
        # spelled out because truth-testing an Element is deprecated.
        if line is None or len(line) == 0:
            return
        line_box = self.pt_from_pixel(self.element_coordinates(line))
        line_height = line_box.y2 - line_box.y1

        slope, pxl_intercept = self.baseline(line)
        if abs(slope) < 0.005:
            # Treat a nearly level baseline as exactly level.
            slope = 0.0
        angle = atan(slope)
        cos_a, sin_a = cos(angle), sin(angle)

        text = pdf.beginText()
        intercept = pxl_intercept / self.dpi * inch

        # Don't allow the font to break out of the bounding box. Division by
        # cos_a accounts for extra clearance between the glyph's vertical axis
        # on a sloped baseline and the edge of the bounding box.
        fontsize = (line_height - abs(intercept)) / cos_a
        text.setFont(fontname, fontsize)
        if invisible_text:
            text.setTextRenderMode(3)  # Invisible (indicates OCR text)

        # Intercept is normally negative, so this places it above the bottom
        # of the line box.
        baseline_y2 = self.height - (line_box.y2 + intercept)

        if show_bounding_boxes:  # pragma: no cover
            # Draw the baseline in magenta as a solid half-point line.
            pdf.setDash()
            pdf.setStrokeColor(magenta)
            pdf.setLineWidth(0.5)
            # Negate slope because it is defined as a rise/run in pixel
            # coordinates and page coordinates have the y axis flipped.
            pdf.line(
                line_box.x1,
                baseline_y2,
                line_box.x2,
                self.polyval((-slope, baseline_y2), line_box.x2 - line_box.x1),
            )
            # Word boxes drawn below use a dashed red outline.
            pdf.setDash(6, 3)
            pdf.setStrokeColor(red)

        text.setTextTransform(cos_a, -sin_a, sin_a, cos_a, line_box.x1, baseline_y2)
        pdf.setFillColor(black)  # text in black

        for elem in line.findall(self._child_xpath('span', elemclass)):
            elemtxt = self.replace_unsupported_chars(
                self._get_element_text(elem).strip()
            )
            if elemtxt == '':
                continue

            box = self.pt_from_pixel(self.element_coordinates(elem))
            if interword_spaces:
                # Append a space to the end of each text element to allow
                # simpler PDF viewers such as PDF.js to better recognize words
                # in search and copy and paste. Do not remove the space from
                # the last word in the line, even though it would look better,
                # because it will interfere with naive text extraction. \n
                # does not work either.
                elemtxt += ' '
                box = Rect(
                    box.x1,
                    line_box.y1,
                    box.x2 + pdf.stringWidth(' ', fontname, line_height),
                    line_box.y2,
                )
            box_width = box.x2 - box.x1
            font_width = pdf.stringWidth(elemtxt, fontname, fontsize)

            # Draw the word's bounding box outline.
            if show_bounding_boxes:  # pragma: no cover
                pdf.rect(
                    box.x1, self.height - line_box.y2, box_width, line_height, fill=0
                )

            # Adjust the relative position of the cursor. This is equivalent
            # to text.setTextOrigin(box.x1, baseline_y2), but that generates a
            # full text reposition matrix (Tm) in the content stream while
            # this issues an "offset" (Td) command. moveCursor() is relative
            # to the start of the text line, so use getStartOfLine() rather
            # than getCursor(). For skewed lines the text transform set up a
            # rotated coordinate system, so the incremental offset needs no
            # further correction.
            cursor = text.getStartOfLine()
            text.moveCursor(box.x1 - cursor[0], baseline_y2 - cursor[1])

            # If reportlab reports this word as 0 units wide, the best option
            # seems to be to suppress the text.
            if font_width > 0:
                # Stretch or squeeze the glyphs to fill the word's box.
                text.setHorizScale(100 * box_width / font_width)
                text.textOut(elemtxt)
        pdf.drawText(text)
|||
|
|||
|
|||
if __name__ == "__main__":
    # Command-line entry point: convert one hOCR file into one PDF.
    cli = argparse.ArgumentParser(description='Convert hocr file to PDF')
    cli.add_argument(
        '-b',
        '--boundingboxes',
        help='Show bounding boxes borders',
        action="store_true",
        default=False,
    )
    cli.add_argument(
        '-r',
        '--resolution',
        help='Resolution of the image that was OCRed',
        type=int,
        default=300,
    )
    cli.add_argument(
        '-i',
        '--image',
        help='Path to the image to be placed above the text',
        default=None,
    )
    cli.add_argument(
        '--interword-spaces',
        help='Add spaces between words',
        action='store_true',
        default=False,
    )
    cli.add_argument('hocrfile', help='Path to the hocr file to be parsed')
    cli.add_argument('outputfile', help='Path to the PDF file to be generated')
    options = cli.parse_args()

    # Parse the hOCR document first, then render it with the chosen options.
    converter = HocrTransform(
        hocr_filename=options.hocrfile, dpi=options.resolution
    )
    converter.to_pdf(
        out_filename=options.outputfile,
        image_filename=options.image,
        show_bounding_boxes=options.boundingboxes,
        interword_spaces=options.interword_spaces,
    )
|||
|
@ -0,0 +1,518 @@ |
|||
#!/usr/bin/env python3
|||
# |
|||
# Copyright (c) 2010, Jonathan Brinley |
|||
# Original version from: https://github.com/jbrinley/HocrConverter |
|||
# |
|||
# Copyright (c) 2013-14, Julien Pfefferkorn |
|||
# Modifications |
|||
# |
|||
# Copyright (c) 2015-16, James R. Barlow |
|||
# Set text to transparent |
|||
|
|||
# Copyright (c) 2022, WordMord & Alex Roidl |
|||
# Set text back to visible and change bounding boxes |
|||
# |
|||
# Permission is hereby granted, free of charge, to any person obtaining a |
|||
# copy of this software and associated documentation files (the |
|||
# "Software"), to deal in the Software without restriction, including |
|||
# without limitation the rights to use, copy, modify, merge, publish, |
|||
# distribute, sublicense, and/or sell copies of the Software, and to |
|||
# permit persons to whom the Software is furnished to do so, subject to |
|||
# the following conditions: |
|||
# |
|||
# The above copyright notice and this permission notice shall be included |
|||
# in all copies or substantial portions of the Software. |
|||
# |
|||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS |
|||
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
|||
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. |
|||
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY |
|||
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, |
|||
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE |
|||
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
|||
|
|||
import argparse |
|||
import os |
|||
import re |
|||
from itertools import chain |
|||
from math import atan, cos, sin |
|||
from pathlib import Path |
|||
from typing import Any, NamedTuple, Optional, Tuple, Union |
|||
from xml.etree import ElementTree |
|||
|
|||
from reportlab.lib.colors import black, cyan, magenta, red |
|||
from reportlab.lib.units import inch |
|||
from reportlab.pdfgen.canvas import Canvas |
|||
from reportlab.pdfbase import pdfmetrics |
|||
from reportlab.pdfbase.ttfonts import TTFont |
|||
from reportlab.pdfbase.pdfmetrics import registerFontFamily |
|||
|
|||
|
|||
|
|||
# Tesseract language codes whose alphabets fit in ISO-8859-1 (Latin-1).
# reportlab's standard fonts can render these, so hOCR output for them can be
# turned into a PDF text layer. Being listed here does not guarantee that a
# Tesseract language pack exists for the code.
#
# NOTE(review): the descriptions for 'mal', 'mga' and 'sma' look doubtful
# (ISO 639-2 appears to assign them to Malayalam, Middle Irish and Southern
# Sami respectively) -- confirm before relying on these entries.
HOCR_OK_LANGS = frozenset(
    {
        # --- Fully covered by Latin-1 ---
        'afr',  # Afrikaans
        'alb',  # Albanian
        'ast',  # Leonese
        'baq',  # Basque
        'bre',  # Breton
        'cos',  # Corsican
        'eng',  # English
        'eus',  # Basque
        'fao',  # Faroese
        'gla',  # Scottish Gaelic
        'glg',  # Galician
        'glv',  # Manx
        'ice',  # Icelandic
        'ind',  # Indonesian
        'isl',  # Icelandic
        'ita',  # Italian
        'ltz',  # Luxembourgish
        'mal',  # Malay Rumi
        'mga',  # Irish
        'nor',  # Norwegian
        'oci',  # Occitan
        'por',  # Portuguese
        'roh',  # Romansh
        'sco',  # Scots
        'sma',  # Sami
        'spa',  # Spanish
        'sqi',  # Albanian
        'swa',  # Swahili
        'swe',  # Swedish
        'tgl',  # Tagalog
        'wln',  # Walloon
        # --- Covered by Latin-1 apart from a few rare characters that OCR is
        # --- unlikely to have been trained to recognise anyway ---
        'cat',  # Catalan
        'cym',  # Welsh
        'dan',  # Danish
        'deu',  # German
        'dut',  # Dutch
        'est',  # Estonian
        'fin',  # Finnish
        'fra',  # French
        'hun',  # Hungarian
        'kur',  # Kurdish
        'nld',  # Dutch
        'wel',  # Welsh
    }
)


# Short alias used in the type annotations below.
Element = ElementTree.Element
|||
|
|||
|
|||
class Rect(NamedTuple):  # pylint: disable=inherit-non-class
    """Axis-aligned box stored as two corner points: (x1, y1) and (x2, y2).

    The same tuple type carries hOCR pixel coordinates and their PDF-point
    equivalents, so the fields are deliberately untyped.
    """

    # First corner.
    x1: Any
    y1: Any
    # Opposite corner.
    x2: Any
    y2: Any
|||
|
|||
|
|||
class HocrTransformError(Exception):
    """Raised when an hOCR document cannot be converted."""
|||
|
|||
|
|||
class HocrTransform:
    """Convert a single-page hOCR document into a PDF with visible text.

    This variant draws the page image first and then prints the recognised
    words on top of it in black, using the TrueType face registered as
    'Greek'.

    For details of the hOCR format, see:
    http://kba.cloud/hocr-spec/
    """

    # Captures the four integer pixel coordinates of a ``bbox`` property.
    box_pattern = re.compile(r'bbox((\s+\d+){4})')
    # Captures the slope and the integer intercept of a ``baseline`` property.
    baseline_pattern = re.compile(
        r'''
        baseline \s+
        ([\-\+]?\d*\.?\d*) \s+  # +/- decimal float
        ([\-\+]?\d+)            # +/- int''',
        re.VERBOSE,
    )
    # Typographic ligatures mapped to their plain-letter spelling. The keys
    # must be single code points (str.maketrans rejects longer string keys),
    # so they are written as escapes to survive any text normalisation.
    ligatures = str.maketrans(
        {
            '\ufb00': 'ff',
            '\ufb03': 'ffi',
            '\ufb04': 'ffl',
            '\ufb01': 'fi',
            '\ufb02': 'fl',
        }
    )

    def __init__(self, *, hocr_filename: Union[str, Path], dpi: float):
        """Parse the hOCR file and read the page size from its ``ocr_page``.

        Arguments:
            hocr_filename: Path of the hOCR (XHTML) file to parse.
            dpi: Resolution of the OCRed image, used to convert pixels to
                PDF points.

        Raises:
            HocrTransformError: If the document has no ``ocr_page`` div.
        """
        self.dpi = dpi
        self.hocr = ElementTree.parse(os.fspath(hocr_filename))

        # If the hOCR file has a namespace, ElementTree requires its use to
        # find elements. This must be set before _child_xpath() is called.
        matches = re.match(r'({.*})html', self.hocr.getroot().tag)
        self.xmlns = matches.group(1) if matches else ''

        # Page dimensions in pt (not pixels). Only the first ocr_page is
        # used; any further pages are ignored.
        self.width, self.height = None, None
        page = self.hocr.find(self._child_xpath('div', 'ocr_page'))
        if page is not None:
            pt_coords = self.pt_from_pixel(self.element_coordinates(page))
            self.width = pt_coords.x2 - pt_coords.x1
            self.height = pt_coords.y2 - pt_coords.y1
        if self.width is None or self.height is None:
            raise HocrTransformError("hocr file is missing page dimensions")

    def __str__(self):  # pragma: no cover
        """Return the textual content of the HTML body."""
        if self.hocr is None:
            return ''
        body = self.hocr.find(self._child_xpath('body'))
        # Compare against None: an Element without child elements is falsy
        # even when it carries text.
        if body is not None:
            return self._get_element_text(body)
        return ''

    def _get_element_text(self, element: Element):
        """Return the text of the element and its descendants, plus its tail."""
        # itertext() walks text and child tails in document order; the tail of
        # the element itself is not part of that walk, so append it here.
        return ''.join(element.itertext()) + (element.tail or '')

    @classmethod
    def element_coordinates(cls, element: Element) -> Rect:
        """Return the element's ``bbox`` as a Rect of integer pixels.

        A Rect of four zeros is returned when the element has no ``title``
        attribute or the title has no ``bbox`` property.
        """
        matches = cls.box_pattern.search(element.attrib.get('title', ''))
        if matches is None:
            return Rect(0, 0, 0, 0)
        # The pattern guarantees exactly four integer tokens.
        return Rect(*(int(token) for token in matches.group(1).split()))

    @classmethod
    def baseline(cls, element: Element) -> Tuple[float, float]:
        """Return the baseline (slope, intercept) from the element's title.

        Returns (0.0, 0.0) when there is no ``baseline`` property or its
        slope is not a parseable number.
        """
        matches = cls.baseline_pattern.search(element.attrib.get('title', ''))
        if matches:
            try:
                return float(matches.group(1)), int(matches.group(2))
            except ValueError:
                # The slope group may match an empty or sign-only string.
                pass
        return (0.0, 0.0)

    def pt_from_pixel(self, pxl) -> Rect:
        """Convert four pixel quantities to PDF units (pt) using ``self.dpi``."""
        return Rect._make((c / self.dpi * inch) for c in pxl)

    def _child_xpath(self, html_tag: str, html_class: Optional[str] = None) -> str:
        """Build a descendant XPath for a tag, optionally filtered by class."""
        xpath = f".//{self.xmlns}{html_tag}"
        if html_class:
            xpath += f"[@class='{html_class}']"
        return xpath

    @classmethod
    def replace_unsupported_chars(cls, s: str) -> str:
        """Return ``s`` with ligature characters expanded to plain letters.

        Expanding ligatures keeps the text searchable in the PDF file.
        """
        return s.translate(cls.ligatures)

    def topdown_position(self, element):
        """Return the element's bottom edge in pt, for top-to-bottom sorting."""
        line_box = self.pt_from_pixel(self.element_coordinates(element))
        # Coordinates here are still in the hocr coordinate system, so 0 on
        # the y axis is the top of the page and increasing values of y move
        # towards the bottom of the page.
        return line_box.y2

    def to_pdf(
        self,
        *,
        out_filename: Path,
        image_filename: Optional[Path] = None,
        show_bounding_boxes: bool = False,
        fontname: str = "Helvetica",
        invisible_text: bool = False,
        interword_spaces: bool = False,
    ) -> None:
        """Create a PDF file with the OCR text printed on top of an image.

        The image, when given, is drawn first and scaled to fill the page;
        the text is then placed according to the bounding boxes in the hOCR
        file. The image need not be identical to the image used to create
        the hOCR file.

        Arguments:
            out_filename: Path of PDF to write.
            image_filename: Image to draw underneath the text. If omitted,
                only the text is drawn.
            show_bounding_boxes: If True, fill the bounding box of every
                non-empty paragraph in black.
            fontname: Accepted for interface compatibility; the text is
                always drawn in the 'Greek' font.
            invisible_text: Accepted for interface compatibility; the text is
                always visible.
            interword_spaces: Accepted for interface compatibility; no
                spaces are appended to words.
        """
        # NOTE(review): these font paths are relative to the current working
        # directory, so the call only succeeds when run from a directory
        # where '../styles/fonts' resolves -- confirm the intended layout.
        pdfmetrics.registerFont(TTFont('Greek', '../styles/fonts/greek.ttf'))
        pdfmetrics.registerFont(TTFont('GreekB', '../styles/fonts/greek-bold.ttf'))
        registerFontFamily('Greek', normal='Greek', bold='GreekB')

        # Create the PDF file; page size is in points (1/72 in.).
        pdf = Canvas(
            os.fspath(out_filename),
            pagesize=(self.width, self.height),
            pageCompression=1,
        )

        # Draw the image first so that everything else lands on top of it.
        if image_filename is not None:
            pdf.drawImage(
                os.fspath(image_filename), 0, 0, width=self.width, height=self.height
            )

        # Paragraph bounding boxes are filled in black with a 1 pt outline.
        pdf.setStrokeColor(black)
        pdf.setFillColor(black)
        pdf.setLineWidth(1)
        if show_bounding_boxes:  # pragma: no cover
            for elem in self.hocr.iterfind(self._child_xpath('p', 'ocr_par')):
                if not self._get_element_text(elem).rstrip():
                    continue
                pt = self.pt_from_pixel(self.element_coordinates(elem))
                # PDF y grows upwards, hOCR y grows downwards: flip it.
                pdf.rect(
                    pt.x1, self.height - pt.y2, pt.x2 - pt.x1, pt.y2 - pt.y1, fill=1
                )

        # Every line-like element on the page, ordered top to bottom.
        lines = sorted(
            chain(
                self.hocr.iterfind(self._child_xpath('span', 'ocr_header')),
                self.hocr.iterfind(self._child_xpath('span', 'ocr_line')),
                self.hocr.iterfind(self._child_xpath('span', 'ocr_textfloat')),
            ),
            key=self.topdown_position,
        )
        if not lines:
            # Tesseract did not report any lines (just words): treat the
            # whole page as a single line.
            lines = [self.hocr.find(self._child_xpath('div', 'ocr_page'))]
        for line in lines:
            self._do_line(
                pdf,
                line,
                "ocrx_word",
                fontname,
                invisible_text,
                interword_spaces,
                show_bounding_boxes,
            )

        # Finish up the page and save it.
        pdf.showPage()
        pdf.save()

    @classmethod
    def polyval(cls, poly, x):  # pragma: no cover
        """Evaluate the first-degree polynomial ``poly[0] * x + poly[1]``."""
        return x * poly[0] + poly[1]

    def _do_line(
        self,
        pdf: Canvas,
        line: Optional[Element],
        elemclass: str,
        fontname: str,
        invisible_text: bool,
        interword_spaces: bool,
        show_bounding_boxes: bool,
    ):
        """Draw the words of one hOCR line onto the canvas as visible text.

        Arguments:
            pdf: Canvas to draw on.
            line: Line element (or page element when no lines exist). Nothing
                is drawn when it is None or has no child elements.
            elemclass: Class of the ``span`` descendants that hold the words.
            fontname: Ignored; the 'Greek' font is always used.
            invisible_text: Ignored; the text is always visible.
            interword_spaces: Ignored; no spaces are appended.
            show_bounding_boxes: Ignored here; no baseline or word boxes are
                drawn.
        """
        # Same condition as the Element truth test (no children is falsy),
        # spelled out because truth-testing an Element is deprecated.
        if line is None or len(line) == 0:
            return
        # Font the words are drawn in; it is also used to measure them so
        # that the zero-width check below refers to the glyphs actually used.
        drawn_font = 'Greek'

        line_box = self.pt_from_pixel(self.element_coordinates(line))
        line_height = line_box.y2 - line_box.y1

        slope, pxl_intercept = self.baseline(line)
        if abs(slope) < 0.005:
            # Treat a nearly level baseline as exactly level.
            slope = 0.0
        cos_a = cos(atan(slope))

        text = pdf.beginText()
        intercept = pxl_intercept / self.dpi * inch

        # Start from the height between the baseline and the top of the line
        # box (corrected for the slope), then enlarge it by 20 %.
        fontsize = (line_height - abs(intercept)) / cos_a * 1.2
        text.setFont(drawn_font, fontsize)

        # Intercept is normally negative, so this places it above the bottom
        # of the line box.
        baseline_y2 = self.height - (line_box.y2 + intercept)

        # The line is not rotated: only its origin is set.
        text.setTextOrigin(line_box.x1, baseline_y2)
        pdf.setFillColor(black)  # text in black

        for elem in line.findall(self._child_xpath('span', elemclass)):
            elemtxt = self.replace_unsupported_chars(
                self._get_element_text(elem).strip()
            )
            if elemtxt == '':
                continue

            box = self.pt_from_pixel(self.element_coordinates(elem))
            font_width = pdf.stringWidth(elemtxt, drawn_font, fontsize)

            # Move to the left edge of the word. moveCursor() is relative to
            # the start of the text line, so use getStartOfLine() rather than
            # getCursor().
            cursor = text.getStartOfLine()
            text.moveCursor(box.x1 - cursor[0], baseline_y2 - cursor[1])

            # If reportlab reports this word as 0 units wide, the best option
            # seems to be to suppress the text. Words are drawn at their
            # natural width; no horizontal scaling is applied.
            if font_width > 0:
                text.textOut(elemtxt)
        pdf.drawText(text)
|||
|
|||
|
|||
if __name__ == "__main__":
    # Command-line entry point: read the options, then hand the hOCR file to
    # HocrTransform so it can be written out as a PDF.
    parser = argparse.ArgumentParser(description='Convert hocr file to PDF')

    # Optional arguments, registered in the order they are listed here.
    optional_arguments = (
        (
            ('-b', '--boundingboxes'),
            {
                'action': "store_true",
                'default': False,
                'help': 'Show bounding boxes borders',
            },
        ),
        (
            ('-r', '--resolution'),
            {
                'type': int,
                'default': 300,
                'help': 'Resolution of the image that was OCRed',
            },
        ),
        (
            ('-i', '--image'),
            {
                'default': None,
                'help': 'Path to the image to be placed above the text',
            },
        ),
        (
            ('--interword-spaces',),
            {
                'action': 'store_true',
                'default': False,
                'help': 'Add spaces between words',
            },
        ),
    )
    for option_flags, option_settings in optional_arguments:
        parser.add_argument(*option_flags, **option_settings)

    # Positional arguments: the input hOCR path first, the output PDF second.
    parser.add_argument('hocrfile', help='Path to the hocr file to be parsed')
    parser.add_argument('outputfile', help='Path to the PDF file to be generated')
    args = parser.parse_args()

    hocr = HocrTransform(hocr_filename=args.hocrfile, dpi=args.resolution)
    pdf_options = {
        'out_filename': args.outputfile,
        'image_filename': args.image,
        'show_bounding_boxes': args.boundingboxes,
        'interword_spaces': args.interword_spaces,
    }
    hocr.to_pdf(**pdf_options)
|||
|
@ -0,0 +1,23 @@ |
|||
<!DOCTYPE html> |
|||
<html lang="en"> |
|||
<head> |
|||
<meta charset="UTF-8"> |
|||
<meta name="viewport" content="width=device-width, initial-scale=1" /> |
|||
<link |
|||
rel="stylesheet" |
|||
href="{{ url_for('static', filename='styles.css') }}" |
|||
/> |
|||
|
|||
|
|||
<title>{% block title %} {% endblock %} WordMord</title> |
|||
</head> |
|||
<body> |
|||
<!-- <nav class="navbar navbar-expand-md navbar-light"> |
|||
<a href="/">Wordmord</a> |
|||
<a href="/about">About</a> |
|||
</nav> --> |
|||
<div class="content"> |
|||
{% block content %} {% endblock %} |
|||
</div> |
|||
</body> |
|||
</html> |
@ -0,0 +1,121 @@ |
|||
{% extends 'base.html' %} |
|||
|
|||
{% block content %} |
|||
|
|||
<!-- <iframe src="https://pad.vvvvvvaria.org/wordmord.test" height="800" width="800"></iframe> --> |
|||
|
|||
<table> |
|||
<tr> |
|||
<td colspan="2"> |
|||
<button onclick="decode()">provoli / show PDF</button> |
|||
<button id="button1" onclick="transform()">metalucksi PDF / dostort PDF</button> |
|||
<button id="button2" onclick="copy()"> antigrafi keimenou ponikou kodika / copy penal code text</button> |
|||
</td> |
|||
<td> |
|||
<div> |
|||
<span id="title3"> perioxi epikollimenou keimenou / pasted text area</span> |
|||
</div> |
|||
</td> |
|||
</tr> |
|||
<tr> |
|||
<td colspan="2"> |
|||
<!-- <input type="text" id="my_field" value="" wrap="soft"> --> |
|||
<textarea name="text" id="my_field" wrap="soft" placeholder="Paste the URL of the PDF here. Κάνε επικόλληση του λινκ εδώ. WordMord believes that the violence of language is not eradicated by merely deleting/erasing words, but rather by transversing their violent imposition through specific practices that trouble and disrupt grammatical consistency, semantic norms, ‘correct’ pronunciation, ‘proper’ bodily posture. The rupture of linguistic limits suggests the possibility of experiencing language in its materiality. |
|||
|
|||
WordMord poses questions on the relationship between language, technology, trauma and violence. The collective artistic research will evolve through workshops, presentations and artworks. Through collaborations with artists, activists and groups working on feminist coding, WordMord seeks to shape an online rhizomatic space as an active feminist archive. At the same time, the project will provide tools and methods towards a poetically subversive meta/para/re-writing of derogatory narratives and consequently of trauma and violence. |
|||
|
|||
#WordMord´s initial research group: Vassiliea Stylianidou aka Franck-Lee Alli-Tis, Angeliki Diakrousi, Christina Karagianni, Stylianos Benetos aka Oýto Arognos, Mounologies: Eleni Diamantouli and Anna Delimpasi. It started in collaboration with the #CNMFPP in 2019. |
|||
"></textarea> |
|||
</td> |
|||
<td rowspan="3"> |
|||
<textarea name="empty" id="empty_frame" wrap="soft" placeholder="Paste the copied text here. Κάνε επικόλληση του κειμένου εδώ"></textarea> |
|||
</td> |
|||
</tr> |
|||
<tr> |
|||
<td> |
|||
<span id="title1"> epishmeiomenos poinikos kodikas / annotated penal code</span> |
|||
</td> |
|||
<td> |
|||
<div> |
|||
<span id="title2">metalucktriomeno PDF / distorted PDF</span> |
|||
</div> |
|||
</td> |
|||
|
|||
</tr> |
|||
<tr> |
|||
<td> |
|||
<div id="initialpdf"> |
|||
<iframe id="inputpdf" src="" height="800rem" width="800rem"></iframe> |
|||
</div> |
|||
</td> |
|||
<td> |
|||
<iframe id="showmonster" src="{{url_for('static', filename='pdf/output.pdf')}}" height="800rem" width="800rem"></iframe> |
|||
</td> |
|||
</tr> |
|||
|
|||
|
|||
</table> |
|||
<!-- WordMord believes that the violence of language is not eradicated by merely deleting/erasing words, but rather by transversing their violent imposition through specific practices that trouble and disrupt grammatical consistency, semantic norms, ‘correct’ pronunciation, ‘proper’ bodily posture. The rupture of linguistic limits suggests the possibility of experiencing language in its materiality. |
|||
|
|||
WordMord poses questions on the relationship between language, technology, trauma and violence. The collective artistic research will evolve through workshops, presentations and artworks. Through collaborations with artists, activists and groups working on feminist coding, WordMord seeks to shape an online rhizomatic space as an active feminist archive. At the same time, the project will provide tools and methods towards a poetically subversive meta/para/re-writing of derogatory narratives and consequently of trauma and violence. |
|||
|
|||
#WordMord´s initial research group: Vassiliea Stylianidou aka Franck-Lee Alli-Tis, Angeliki Diakrousi, Christina Karagianni, Stylianos Benetos aka Oýto Arognos, Mounologies: Eleni Diamantouli and Anna Delimpasi. It started in collaboration with the #CNMFPP in 2019. |
|||
|
|||
Tags --> |
|||
|
|||
|
|||
|
|||
<script> |
|||
/**
 * Show the PDF whose URL is in the "my_field" textarea.
 *
 * Percent-decodes the textarea content, writes the decoded text back into
 * the textarea, loads it into the "inputpdf" iframe and sets the
 * "initialpdf", "button1", "button2" and "title1" elements to visible.
 * (Presumably the stylesheet hides those elements initially; styles.css is
 * not part of this template.)
 */
function decode() {
    const textarea = document.getElementById("my_field");

    // decodeURIComponent() throws URIError on a malformed escape such as a
    // lone "%". In that case keep the text exactly as it was entered instead
    // of aborting before anything is shown.
    let pdfUrl = textarea.value;
    try {
        pdfUrl = decodeURIComponent(pdfUrl);
    } catch (err) {
        if (!(err instanceof URIError)) {
            throw err;
        }
    }
    textarea.value = pdfUrl;

    // Read the value back from the textarea so the iframe receives exactly
    // what the field now holds.
    document.getElementById("inputpdf").src = textarea.value;

    // Reveal the source PDF frame, the follow-up buttons and the caption.
    ["initialpdf", "button1", "button2", "title1"].forEach(function (id) {
        document.getElementById(id).style.visibility = "visible";
    });
}
|||
|
|||
/**
 * Make the "showmonster" iframe and its "title2" caption visible.
 */
function transform() {
    for (const id of ["showmonster", "title2"]) {
        document.getElementById(id).style.visibility = "visible";
    }
}
|||
|
|||
/**
 * Make the "empty_frame" textarea and its "title3" caption visible.
 */
function copy() {
    for (const id of ["empty_frame", "title3"]) {
        document.getElementById(id).style.visibility = "visible";
    }
}
|||
|
|||
|
|||
// Empty every text input and textarea that sits inside a <form> once the
// DOM is ready. Written against the native DOM API because no jQuery
// <script> is included by this template or by base.html, so a call to `$`
// would raise ReferenceError when this script runs.
// NOTE(review): the textareas in this template are not wrapped in a <form>,
// so the selector currently matches nothing - confirm whether they should
// be cleared as well.
(function () {
    function clearFormFields() {
        document
            .querySelectorAll("form input[type='text'], form textarea")
            .forEach(function (field) {
                field.value = "";
            });
    }

    // DOMContentLoaded fires only once; run immediately if it already has.
    if (document.readyState === "loading") {
        document.addEventListener("DOMContentLoaded", clearFormFields);
    } else {
        clearFormFields();
    }
})();
|||
// extract the text of a pdf/ copy on clipboard |
|||
</script> |
|||
|
|||
{% endblock %} |
Loading…
Reference in new issue