commit 4f016b718b467e5d877c9dc0d7586c8147d05e34 Author: nglk Date: Mon Mar 21 11:50:41 2022 +0100 first commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..fdd9d63 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +venv/ +static/pdf/ diff --git a/__pycache__/app.cpython-37.pyc b/__pycache__/app.cpython-37.pyc new file mode 100644 index 0000000..2439b5f Binary files /dev/null and b/__pycache__/app.cpython-37.pyc differ diff --git a/__pycache__/hocrtransformpdf.cpython-37.pyc b/__pycache__/hocrtransformpdf.cpython-37.pyc new file mode 100644 index 0000000..bfe6abe Binary files /dev/null and b/__pycache__/hocrtransformpdf.cpython-37.pyc differ diff --git a/app.py b/app.py new file mode 100644 index 0000000..9357173 --- /dev/null +++ b/app.py @@ -0,0 +1,79 @@ +import os +import random +import shutil +import string +import subprocess +from pathlib import Path +from flask import Flask, flash, redirect, render_template, request, url_for +from hocrtransformpdf import * +from werkzeug.utils import secure_filename +from flask_basicauth import BasicAuth +import pdftotree + +UPLOAD_FOLDER = 'static/uploads' +ALLOWED_EXTENSIONS = {'pdf'} + +app = Flask(__name__) + +app.config['BASIC_AUTH_USERNAME'] = 'wordmord' +app.config['BASIC_AUTH_PASSWORD'] = 'tentacles' + +basic_auth = BasicAuth(app) + +app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER + + +@app.route('/', methods=['GET', 'POST']) +@basic_auth.required +def run_script(): + # the code below was made in case I was using a button upload but now I use the field input so this has to be uploaded and then transformed + if request.method == 'POST': + # check if the post request has the file part + if 'file' not in request.files: + flash('No file part') + return redirect(request.url) + file = request.files['file'] + # if user does not select file, browser also + # submit an empty part without filename + if file.filename == '': + flash('No selected file') + return redirect(request.url) + if file and 
allowed_file(file.filename): + filename = secure_filename(file.filename) + uploadfilepath=os.path.join(app.config['UPLOAD_FOLDER'], filename) + file.save(uploadfilepath) + # return redirect(url_for('uploaded_file', + # filename=filename)) + + hocr_result = pdftotree.parse(uploadfilepath) + app.logger.info("test") + hocr = HocrTransform(hocr_filename=hocr_result, dpi=300) + hocr.to_pdf( + out_filename='static/pdf/output-2.pdf', + image_filename='static/images/blank.png', + show_bounding_boxes=False, + interword_spaces=False, + ) + + hocrfile='static/hocr/gynaikoktonia.hocr' + #hocr = HocrTransform(hocr_filename=hocrfile, dpi=300) + #hocr = HocrTransform(hocr_filename=hocr_result, dpi=300) + #hocr.to_pdf( + # out_filename='static/pdf/output.pdf', + # image_filename='static/images/blank.png', + # show_bounding_boxes=False, + # interword_spaces=False, + #) + # result = subprocess.check_output("python3 hocrtransformpdf.py -i images/blank.png hocr/gynaikoktonia.hocr pdf/gynaikoktonia.pdf", shell=True) + return render_template('results.html', **locals()) + + + +def allowed_file(filename): + return '.' in filename and \ + filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS + + +if __name__ == "__main__": + app.run() + diff --git a/hocrtransform-visible-pdf.py b/hocrtransform-visible-pdf.py new file mode 100755 index 0000000..03ac255 --- /dev/null +++ b/hocrtransform-visible-pdf.py @@ -0,0 +1,518 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2010, Jonathan Brinley +# Original version from: https://github.com/jbrinley/HocrConverter +# +# Copyright (c) 2013-14, Julien Pfefferkorn +# Modifications +# +# Copyright (c) 2015-16, James R. 
Barlow +# Set text to transparent + +# Copyright (c) 2022, WordMord & Alex Roidl +# Set text back to visible and change bounding boxes +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to +# the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +import argparse +import os +import re +from itertools import chain +from math import atan, cos, sin +from pathlib import Path +from typing import Any, NamedTuple, Optional, Tuple, Union +from xml.etree import ElementTree + +from reportlab.lib.colors import black, cyan, magenta, red +from reportlab.lib.units import inch +from reportlab.pdfgen.canvas import Canvas +from reportlab.pdfbase import pdfmetrics +from reportlab.pdfbase.ttfonts import TTFont +from reportlab.pdfbase.pdfmetrics import registerFontFamily + + + +# According to Wikipedia these languages are supported in the ISO-8859-1 character +# set, meaning reportlab can generate them and they are compatible with hocr, +# assuming Tesseract has the necessary languages installed. 
Note that there may +# not be language packs for them. +HOCR_OK_LANGS = frozenset( + [ + # Languages fully covered by Latin-1: + 'afr', # Afrikaans + 'alb', # Albanian + 'ast', # Leonese + 'baq', # Basque + 'bre', # Breton + 'cos', # Corsican + 'eng', # English + 'eus', # Basque + 'fao', # Faoese + 'gla', # Scottish Gaelic + 'glg', # Galician + 'glv', # Manx + 'ice', # Icelandic + 'ind', # Indonesian + 'isl', # Icelandic + 'ita', # Italian + 'ltz', # Luxembourgish + 'mal', # Malay Rumi + 'mga', # Irish + 'nor', # Norwegian + 'oci', # Occitan + 'por', # Portugeuse + 'roh', # Romansh + 'sco', # Scots + 'sma', # Sami + 'spa', # Spanish + 'sqi', # Albanian + 'swa', # Swahili + 'swe', # Swedish + 'tgl', # Tagalog + 'wln', # Walloon + # Languages supported by Latin-1 except for a few rare characters that OCR + # is probably not trained to recognize anyway: + 'cat', # Catalan + 'cym', # Welsh + 'dan', # Danish + 'deu', # German + 'dut', # Dutch + 'est', # Estonian + 'fin', # Finnish + 'fra', # French + 'hun', # Hungarian + 'kur', # Kurdish + 'nld', # Dutch + 'wel', # Welsh + ] +) + + +Element = ElementTree.Element + + +class Rect(NamedTuple): # pylint: disable=inherit-non-class + """A rectangle for managing PDF coordinates.""" + + x1: Any + y1: Any + x2: Any + y2: Any + + +class HocrTransformError(Exception): + pass + + +class HocrTransform: + + """ + A class for converting documents from the hOCR format. 
+ For details of the hOCR format, see: + http://kba.cloud/hocr-spec/ + """ + + box_pattern = re.compile(r'bbox((\s+\d+){4})') + baseline_pattern = re.compile( + r''' + baseline \s+ + ([\-\+]?\d*\.?\d*) \s+ # +/- decimal float + ([\-\+]?\d+) # +/- int''', + re.VERBOSE, + ) + ligatures = str.maketrans( + {'ff': 'ff', 'ffi': 'f‌f‌i', 'ffl': 'f‌f‌l', 'fi': 'fi', 'fl': 'fl'} + ) + + def __init__(self, *, hocr_filename: Union[str, Path], dpi: float): + self.dpi = dpi + self.hocr = ElementTree.parse(os.fspath(hocr_filename)) + + # if the hOCR file has a namespace, ElementTree requires its use to + # find elements + matches = re.match(r'({.*})html', self.hocr.getroot().tag) + self.xmlns = '' + if matches: + self.xmlns = matches.group(1) + + # get dimension in pt (not pixel!!!!) of the OCRed image + self.width, self.height = None, None + for div in self.hocr.findall(self._child_xpath('div', 'ocr_page')): + coords = self.element_coordinates(div) + pt_coords = self.pt_from_pixel(coords) + self.width = pt_coords.x2 - pt_coords.x1 + self.height = pt_coords.y2 - pt_coords.y1 + # there shouldn't be more than one, and if there is, we don't want + # it + break + if self.width is None or self.height is None: + raise HocrTransformError("hocr file is missing page dimensions") + + def __str__(self): # pragma: no cover + """ + Return the textual content of the HTML body + """ + if self.hocr is None: + return '' + body = self.hocr.find(self._child_xpath('body')) + if body: + return self._get_element_text(body) + else: + return '' + + def _get_element_text(self, element: Element): + """ + Return the textual content of the element and its children + """ + text = '' + if element.text is not None: + text += element.text + for child in element: + text += self._get_element_text(child) + if element.tail is not None: + text += element.tail + return text + + @classmethod + def element_coordinates(cls, element: Element) -> Rect: + """ + Returns a tuple containing the coordinates of the bounding 
box around + an element + """ + out = Rect._make(0 for _ in range(4)) + if 'title' in element.attrib: + matches = cls.box_pattern.search(element.attrib['title']) + if matches: + coords = matches.group(1).split() + out = Rect._make(int(coords[n]) for n in range(4)) + return out + + @classmethod + def baseline(cls, element: Element) -> Tuple[float, float]: + """ + Returns a tuple containing the baseline slope and intercept. + """ + if 'title' in element.attrib: + matches = cls.baseline_pattern.search(element.attrib['title']) + if matches: + return float(matches.group(1)), int(matches.group(2)) + return (0.0, 0.0) + + def pt_from_pixel(self, pxl) -> Rect: + """ + Returns the quantity in PDF units (pt) given quantity in pixels + """ + return Rect._make((c / self.dpi * inch) for c in pxl) + + def _child_xpath(self, html_tag: str, html_class: Optional[str] = None) -> str: + xpath = f".//{self.xmlns}{html_tag}" + if html_class: + xpath += f"[@class='{html_class}']" + return xpath + + @classmethod + def replace_unsupported_chars(cls, s: str) -> str: + """ + Given an input string, returns the corresponding string that: + * is available in the Helvetica facetype + * does not contain any ligature (to allow easy search in the PDF file) + """ + return s.translate(cls.ligatures) + + def topdown_position(self, element): + pxl_line_coords = self.element_coordinates(element) + line_box = self.pt_from_pixel(pxl_line_coords) + # Coordinates here are still in the hocr coordinate system, so 0 on the y axis + # is the top of the page and increasing values of y will move towards the + # bottom of the page. + return line_box.y2 + + def to_pdf( + self, + *, + out_filename: Path, + image_filename: Optional[Path] = None, + show_bounding_boxes: bool = False, + fontname: str = "Helvetica", + invisible_text: bool = False, + interword_spaces: bool = False, + ) -> None: + """ + Creates a PDF file with an image superimposed on top of the text. 
+ Text is positioned according to the bounding box of the lines in + the hOCR file. + The image need not be identical to the image used to create the hOCR + file. + It can have a lower resolution, different color mode, etc. + + Arguments: + out_filename: Path of PDF to write. + image_filename: Image to use for this file. If omitted, the OCR text + is shown. + show_bounding_boxes: Show bounding boxes around various text regions, + for debugging. + fontname: Name of font to use. + invisible_text: If True, text is rendered invisible so that is + selectable but never drawn. If False, text is visible and may + be seen if the image is skipped or deleted in Acrobat. + interword_spaces: If True, insert spaces between words rather than + drawing each word without spaces. Generally this improves text + extraction. + """ + # create the PDF file + # page size in points (1/72 in.) + + pdfmetrics.registerFont(TTFont('Greek', 'static/fonts/greek.ttf')) + pdfmetrics.registerFont(TTFont('GreekB', 'static/fonts/greek-bold.ttf')) + registerFontFamily('Greek', normal='Greek', bold='GreekB') + + pdf = Canvas( + os.fspath(out_filename), + pagesize=(self.width, self.height), + pageCompression=1, + ) + + if image_filename is not None: + pdf.drawImage( + os.fspath(image_filename), 0, 0, width=self.width, height=self.height + ) + + # draw bounding box for each paragraph + # light blue for bounding box of paragraph + pdf.setStrokeColor(black) + # light blue for bounding box of paragraph + pdf.setFillColor(black) + pdf.setLineWidth(1) # no line for bounding box + for elem in self.hocr.iterfind(self._child_xpath('p', 'ocr_par')): + elemtxt = self._get_element_text(elem).rstrip() + if len(elemtxt) == 0: + continue + + pxl_coords = self.element_coordinates(elem) + pt = self.pt_from_pixel(pxl_coords) + + # draw the bbox border + if show_bounding_boxes: # pragma: no cover + pdf.rect( + pt.x1, self.height - pt.y2, pt.x2 - pt.x1, pt.y2 - pt.y1, fill=1 + ) + + found_lines = False + for line in 
sorted( + chain( + self.hocr.iterfind(self._child_xpath('span', 'ocr_header')), + self.hocr.iterfind(self._child_xpath('span', 'ocr_line')), + self.hocr.iterfind(self._child_xpath('span', 'ocr_textfloat')), + ), + key=self.topdown_position, + ): + found_lines = True + self._do_line( + pdf, + line, + "ocrx_word", + fontname, + invisible_text, + interword_spaces, + show_bounding_boxes, + ) + + if not found_lines: + # Tesseract did not report any lines (just words) + root = self.hocr.find(self._child_xpath('div', 'ocr_page')) + self._do_line( + pdf, + root, + "ocrx_word", + fontname, + invisible_text, + interword_spaces, + show_bounding_boxes, + ) + # put the image on the page, scaled to fill the page + + + # finish up the page and save it + pdf.showPage() + pdf.save() + + @classmethod + def polyval(cls, poly, x): # pragma: no cover + return x * poly[0] + poly[1] + + def _do_line( + self, + pdf: Canvas, + line: Optional[Element], + elemclass: str, + fontname: str, + invisible_text: bool, + interword_spaces: bool, + show_bounding_boxes: bool, + ): + if not line: + return + pxl_line_coords = self.element_coordinates(line) + line_box = self.pt_from_pixel(pxl_line_coords) + line_height = line_box.y2 - line_box.y1 + + slope, pxl_intercept = self.baseline(line) + if abs(slope) < 0.005: + slope = 0.0 + angle = atan(slope) + cos_a, sin_a = cos(angle), sin(angle) + + text = pdf.beginText() + intercept = pxl_intercept / self.dpi * inch + + # Don't allow the font to break out of the bounding box. Division by + # cos_a accounts for extra clearance between the glyph's vertical axis + # on a sloped baseline and the edge of the bounding box. 
+ fontsize = (line_height - abs(intercept)) / cos_a * 1.2 + #fontsize = 10.5 + text.setFont('Greek', fontsize) + #if invisible_text: + # text.setTextRenderMode(3) # Invisible (indicates OCR text) + + # Intercept is normally negative, so this places it above the bottom + # of the line box + baseline_y2 = self.height - (line_box.y2 + intercept) + + if False: # pragma: no cover + # draw the baseline in magenta, dashed + pdf.setDash() + pdf.setStrokeColor(magenta) + pdf.setLineWidth(0.5) + # negate slope because it is defined as a rise/run in pixel + # coordinates and page coordinates have the y axis flipped + pdf.line( + line_box.x1, + baseline_y2, + line_box.x2, + self.polyval((-slope, baseline_y2), line_box.x2 - line_box.x1), + ) + # light green for bounding box of word/line + pdf.setDash(6, 3) + pdf.setStrokeColor(red) + + #text.setTextTransform(cos_a, -sin_a, sin_a, cos_a, line_box.x1, baseline_y2) + text.setTextOrigin(line_box.x1, baseline_y2) + ##pdf.translate(line_box.x1, baseline_y2) + pdf.setFillColor(black) # text in black + + elements = line.findall(self._child_xpath('span', elemclass)) + for elem in elements: + elemtxt = self._get_element_text(elem).strip() + elemtxt = self.replace_unsupported_chars(elemtxt) + if elemtxt == '': + continue + + pxl_coords = self.element_coordinates(elem) + box = self.pt_from_pixel(pxl_coords) + if False: + # if `--interword-spaces` is true, append a space + # to the end of each text element to allow simpler PDF viewers + # such as PDF.js to better recognize words in search and copy + # and paste. Do not remove space from last word in line, even + # though it would look better, because it will interfere with + # naive text extraction. \n does not work either. 
+ elemtxt += ' ' + box = Rect._make( + ( + box.x1, + line_box.y1, + box.x2 + pdf.stringWidth(' ', fontname, line_height), + line_box.y2, + ) + ) + box_width = box.x2 - box.x1 + font_width = pdf.stringWidth(elemtxt, fontname, fontsize) + + # draw the bbox border + if False: # pragma: no cover + pdf.rect( + box.x1, self.height - line_box.y2, box_width, line_height, fill=0 + ) + + # Adjust relative position of cursor + # This is equivalent to: + # text.setTextOrigin(pt.x1, self.height - line_box.y2) + # but the former generates a full text reposition matrix (Tm) in the + # content stream while this issues a "offset" (Td) command. + # .moveCursor() is relative to start of the text line, where the + # "text line" means whatever reportlab defines it as. Do not use + # use .getCursor(), since moveCursor() rather unintuitively plans + # its moves relative to .getStartOfLine(). + # For skewed lines, in the text transform we set up a rotated + # coordinate system, so we don't have to account for the + # incremental offset. Surprisingly most PDF viewers can handle this. 
+ cursor = text.getStartOfLine() + dx = box.x1 - cursor[0] + dy = baseline_y2 - cursor[1] + text.moveCursor(dx, dy) + + # If reportlab tells us this word is 0 units wide, our best seems + # to be to suppress this text + if font_width > 0: + #text.setHorizScale(100 * box_width / font_width) + text.textOut(elemtxt) + pdf.drawText(text) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Convert hocr file to PDF') + parser.add_argument( + '-b', + '--boundingboxes', + action="store_true", + default=False, + help='Show bounding boxes borders', + ) + parser.add_argument( + '-r', + '--resolution', + type=int, + default=300, + help='Resolution of the image that was OCRed', + ) + parser.add_argument( + '-i', + '--image', + default=None, + help='Path to the image to be placed above the text', + ) + parser.add_argument( + '--interword-spaces', + action='store_true', + default=False, + help='Add spaces between words', + ) + parser.add_argument('hocrfile', help='Path to the hocr file to be parsed') + parser.add_argument('outputfile', help='Path to the PDF file to be generated') + args = parser.parse_args() + + hocr = HocrTransform(hocr_filename=args.hocrfile, dpi=args.resolution) + hocr.to_pdf( + out_filename=args.outputfile, + image_filename=args.image, + show_bounding_boxes=args.boundingboxes, + interword_spaces=args.interword_spaces, + ) + diff --git a/hocrtransform.py b/hocrtransform.py new file mode 100755 index 0000000..b4037e0 --- /dev/null +++ b/hocrtransform.py @@ -0,0 +1,511 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2010, Jonathan Brinley +# Original version from: https://github.com/jbrinley/HocrConverter +# +# Copyright (c) 2013-14, Julien Pfefferkorn +# Modifications +# +# Copyright (c) 2015-16, James R. 
Barlow +# Set text to transparent +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to +# the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +import argparse +import os +import re +from itertools import chain +from math import atan, cos, sin +from pathlib import Path +from typing import Any, NamedTuple, Optional, Tuple, Union +from xml.etree import ElementTree + +from reportlab.lib.colors import black, cyan, magenta, red +from reportlab.lib.units import inch +from reportlab.pdfgen.canvas import Canvas +from reportlab.pdfbase import pdfmetrics +from reportlab.pdfbase.ttfonts import TTFont +from reportlab.pdfbase.pdfmetrics import registerFontFamily + + + +# According to Wikipedia these languages are supported in the ISO-8859-1 character +# set, meaning reportlab can generate them and they are compatible with hocr, +# assuming Tesseract has the necessary languages installed. Note that there may +# not be language packs for them. 
+HOCR_OK_LANGS = frozenset( + [ + # Languages fully covered by Latin-1: + 'afr', # Afrikaans + 'alb', # Albanian + 'ast', # Leonese + 'baq', # Basque + 'bre', # Breton + 'cos', # Corsican + 'eng', # English + 'eus', # Basque + 'fao', # Faoese + 'gla', # Scottish Gaelic + 'glg', # Galician + 'glv', # Manx + 'ice', # Icelandic + 'ind', # Indonesian + 'isl', # Icelandic + 'ita', # Italian + 'ltz', # Luxembourgish + 'mal', # Malay Rumi + 'mga', # Irish + 'nor', # Norwegian + 'oci', # Occitan + 'por', # Portugeuse + 'roh', # Romansh + 'sco', # Scots + 'sma', # Sami + 'spa', # Spanish + 'sqi', # Albanian + 'swa', # Swahili + 'swe', # Swedish + 'tgl', # Tagalog + 'wln', # Walloon + # Languages supported by Latin-1 except for a few rare characters that OCR + # is probably not trained to recognize anyway: + 'cat', # Catalan + 'cym', # Welsh + 'dan', # Danish + 'deu', # German + 'dut', # Dutch + 'est', # Estonian + 'fin', # Finnish + 'fra', # French + 'hun', # Hungarian + 'kur', # Kurdish + 'nld', # Dutch + 'wel', # Welsh + ] +) + + +Element = ElementTree.Element + + +class Rect(NamedTuple): # pylint: disable=inherit-non-class + """A rectangle for managing PDF coordinates.""" + + x1: Any + y1: Any + x2: Any + y2: Any + + +class HocrTransformError(Exception): + pass + + +class HocrTransform: + + """ + A class for converting documents from the hOCR format. 
+ For details of the hOCR format, see: + http://kba.cloud/hocr-spec/ + """ + + box_pattern = re.compile(r'bbox((\s+\d+){4})') + baseline_pattern = re.compile( + r''' + baseline \s+ + ([\-\+]?\d*\.?\d*) \s+ # +/- decimal float + ([\-\+]?\d+) # +/- int''', + re.VERBOSE, + ) + ligatures = str.maketrans( + {'ff': 'ff', 'ffi': 'f‌f‌i', 'ffl': 'f‌f‌l', 'fi': 'fi', 'fl': 'fl'} + ) + + def __init__(self, *, hocr_filename: Union[str, Path], dpi: float): + self.dpi = dpi + self.hocr = ElementTree.parse(os.fspath(hocr_filename)) + + # if the hOCR file has a namespace, ElementTree requires its use to + # find elements + matches = re.match(r'({.*})html', self.hocr.getroot().tag) + self.xmlns = '' + if matches: + self.xmlns = matches.group(1) + + # get dimension in pt (not pixel!!!!) of the OCRed image + self.width, self.height = None, None + for div in self.hocr.findall(self._child_xpath('div', 'ocr_page')): + coords = self.element_coordinates(div) + pt_coords = self.pt_from_pixel(coords) + self.width = pt_coords.x2 - pt_coords.x1 + self.height = pt_coords.y2 - pt_coords.y1 + # there shouldn't be more than one, and if there is, we don't want + # it + break + if self.width is None or self.height is None: + raise HocrTransformError("hocr file is missing page dimensions") + + def __str__(self): # pragma: no cover + """ + Return the textual content of the HTML body + """ + if self.hocr is None: + return '' + body = self.hocr.find(self._child_xpath('body')) + if body: + return self._get_element_text(body) + else: + return '' + + def _get_element_text(self, element: Element): + """ + Return the textual content of the element and its children + """ + text = '' + if element.text is not None: + text += element.text + for child in element: + text += self._get_element_text(child) + if element.tail is not None: + text += element.tail + return text + + @classmethod + def element_coordinates(cls, element: Element) -> Rect: + """ + Returns a tuple containing the coordinates of the bounding 
box around + an element + """ + out = Rect._make(0 for _ in range(4)) + if 'title' in element.attrib: + matches = cls.box_pattern.search(element.attrib['title']) + if matches: + coords = matches.group(1).split() + out = Rect._make(int(coords[n]) for n in range(4)) + return out + + @classmethod + def baseline(cls, element: Element) -> Tuple[float, float]: + """ + Returns a tuple containing the baseline slope and intercept. + """ + if 'title' in element.attrib: + matches = cls.baseline_pattern.search(element.attrib['title']) + if matches: + return float(matches.group(1)), int(matches.group(2)) + return (0.0, 0.0) + + def pt_from_pixel(self, pxl) -> Rect: + """ + Returns the quantity in PDF units (pt) given quantity in pixels + """ + return Rect._make((c / self.dpi * inch) for c in pxl) + + def _child_xpath(self, html_tag: str, html_class: Optional[str] = None) -> str: + xpath = f".//{self.xmlns}{html_tag}" + if html_class: + xpath += f"[@class='{html_class}']" + return xpath + + @classmethod + def replace_unsupported_chars(cls, s: str) -> str: + """ + Given an input string, returns the corresponding string that: + * is available in the Helvetica facetype + * does not contain any ligature (to allow easy search in the PDF file) + """ + return s.translate(cls.ligatures) + + def topdown_position(self, element): + pxl_line_coords = self.element_coordinates(element) + line_box = self.pt_from_pixel(pxl_line_coords) + # Coordinates here are still in the hocr coordinate system, so 0 on the y axis + # is the top of the page and increasing values of y will move towards the + # bottom of the page. + return line_box.y2 + + def to_pdf( + self, + *, + out_filename: Path, + image_filename: Optional[Path] = None, + show_bounding_boxes: bool = False, + fontname: str = "Helvetica", + invisible_text: bool = False, + interword_spaces: bool = False, + ) -> None: + """ + Creates a PDF file with an image superimposed on top of the text. 
+ Text is positioned according to the bounding box of the lines in + the hOCR file. + The image need not be identical to the image used to create the hOCR + file. + It can have a lower resolution, different color mode, etc. + + Arguments: + out_filename: Path of PDF to write. + image_filename: Image to use for this file. If omitted, the OCR text + is shown. + show_bounding_boxes: Show bounding boxes around various text regions, + for debugging. + fontname: Name of font to use. + invisible_text: If True, text is rendered invisible so that is + selectable but never drawn. If False, text is visible and may + be seen if the image is skipped or deleted in Acrobat. + interword_spaces: If True, insert spaces between words rather than + drawing each word without spaces. Generally this improves text + extraction. + """ + # create the PDF file + # page size in points (1/72 in.) + pdfmetrics.registerFont(TTFont('Greek', 'static/fonts/greek.ttf')) + pdfmetrics.registerFont(TTFont('GreekB', 'static/fonts/greek-bold.ttf')) + registerFontFamily('Greek', normal='Greek', bold='GreekB') + + pdf = Canvas( + os.fspath(out_filename), + pagesize=(self.width, self.height), + pageCompression=1, + ) + + # draw bounding box for each paragraph + # light blue for bounding box of paragraph + pdf.setStrokeColor(cyan) + # light blue for bounding box of paragraph + pdf.setFillColor(cyan) + pdf.setLineWidth(0) # no line for bounding box + for elem in self.hocr.iterfind(self._child_xpath('p', 'ocr_par')): + elemtxt = self._get_element_text(elem).rstrip() + if len(elemtxt) == 0: + continue + + pxl_coords = self.element_coordinates(elem) + pt = self.pt_from_pixel(pxl_coords) + + # draw the bbox border + if show_bounding_boxes: # pragma: no cover + pdf.rect( + pt.x1, self.height - pt.y2, pt.x2 - pt.x1, pt.y2 - pt.y1, fill=1 + ) + + found_lines = False + for line in sorted( + chain( + self.hocr.iterfind(self._child_xpath('span', 'ocr_header')), + self.hocr.iterfind(self._child_xpath('span', 
'ocr_line')), + self.hocr.iterfind(self._child_xpath('span', 'ocr_textfloat')), + ), + key=self.topdown_position, + ): + found_lines = True + self._do_line( + pdf, + line, + "ocrx_word", + fontname, + invisible_text, + interword_spaces, + show_bounding_boxes, + ) + + if not found_lines: + # Tesseract did not report any lines (just words) + root = self.hocr.find(self._child_xpath('div', 'ocr_page')) + self._do_line( + pdf, + root, + "ocrx_word", + fontname, + invisible_text, + interword_spaces, + show_bounding_boxes, + ) + # put the image on the page, scaled to fill the page + if image_filename is not None: + pdf.drawImage( + os.fspath(image_filename), 0, 0, width=self.width, height=self.height + ) + + # finish up the page and save it + pdf.showPage() + pdf.save() + + @classmethod + def polyval(cls, poly, x): # pragma: no cover + return x * poly[0] + poly[1] + + def _do_line( + self, + pdf: Canvas, + line: Optional[Element], + elemclass: str, + fontname: str, + invisible_text: bool, + interword_spaces: bool, + show_bounding_boxes: bool, + ): + if not line: + return + pxl_line_coords = self.element_coordinates(line) + line_box = self.pt_from_pixel(pxl_line_coords) + line_height = line_box.y2 - line_box.y1 + + slope, pxl_intercept = self.baseline(line) + if abs(slope) < 0.005: + slope = 0.0 + angle = atan(slope) + cos_a, sin_a = cos(angle), sin(angle) + + text = pdf.beginText() + intercept = pxl_intercept / self.dpi * inch + + # Don't allow the font to break out of the bounding box. Division by + # cos_a accounts for extra clearance between the glyph's vertical axis + # on a sloped baseline and the edge of the bounding box. 
+ fontsize = (line_height - abs(intercept)) / cos_a * 1.2 + #fontsize = 10.5 + text.setFont('Greek', fontsize) + if invisible_text: + text.setTextRenderMode(3) # Invisible (indicates OCR text) + + # Intercept is normally negative, so this places it above the bottom + # of the line box + baseline_y2 = self.height - (line_box.y2 + intercept) + + if False: # pragma: no cover + # draw the baseline in magenta, dashed + pdf.setDash() + pdf.setStrokeColor(magenta) + pdf.setLineWidth(0.5) + # negate slope because it is defined as a rise/run in pixel + # coordinates and page coordinates have the y axis flipped + pdf.line( + line_box.x1, + baseline_y2, + line_box.x2, + self.polyval((-slope, baseline_y2), line_box.x2 - line_box.x1), + ) + # light green for bounding box of word/line + pdf.setDash(6, 3) + pdf.setStrokeColor(red) + + #text.setTextTransform(cos_a, -sin_a, sin_a, cos_a, line_box.x1, baseline_y2) + text.setTextOrigin(line_box.x1, baseline_y2) + pdf.setFillColor(black) # text in black + + elements = line.findall(self._child_xpath('span', elemclass)) + for elem in elements: + elemtxt = self._get_element_text(elem).strip() + elemtxt = self.replace_unsupported_chars(elemtxt) + if elemtxt == '': + continue + + pxl_coords = self.element_coordinates(elem) + box = self.pt_from_pixel(pxl_coords) + if interword_spaces: + # if `--interword-spaces` is true, append a space + # to the end of each text element to allow simpler PDF viewers + # such as PDF.js to better recognize words in search and copy + # and paste. Do not remove space from last word in line, even + # though it would look better, because it will interfere with + # naive text extraction. \n does not work either. 
+ elemtxt += ' ' + box = Rect._make( + ( + box.x1, + line_box.y1, + box.x2 + pdf.stringWidth(' ', fontname, line_height), + line_box.y2, + ) + ) + box_width = box.x2 - box.x1 + font_width = pdf.stringWidth(elemtxt, fontname, fontsize) + + # draw the bbox border + if False: # pragma: no cover + pdf.rect( + box.x1, self.height - line_box.y2, box_width, line_height, fill=0 + ) + + # Adjust relative position of cursor + # This is equivalent to: + # text.setTextOrigin(pt.x1, self.height - line_box.y2) + # but the former generates a full text reposition matrix (Tm) in the + # content stream while this issues a "offset" (Td) command. + # .moveCursor() is relative to start of the text line, where the + # "text line" means whatever reportlab defines it as. Do not use + # use .getCursor(), since moveCursor() rather unintuitively plans + # its moves relative to .getStartOfLine(). + # For skewed lines, in the text transform we set up a rotated + # coordinate system, so we don't have to account for the + # incremental offset. Surprisingly most PDF viewers can handle this. 
+ cursor = text.getStartOfLine() + dx = box.x1 - cursor[0] + dy = baseline_y2 - cursor[1] + text.moveCursor(dx, dy) + + # If reportlab tells us this word is 0 units wide, our best seems + # to be to suppress this text + if font_width > 0: + #text.setHorizScale(100 * box_width / font_width) + text.textOut(elemtxt) + pdf.drawText(text) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Convert hocr file to PDF') + parser.add_argument( + '-b', + '--boundingboxes', + action="store_true", + default=False, + help='Show bounding boxes borders', + ) + parser.add_argument( + '-r', + '--resolution', + type=int, + default=300, + help='Resolution of the image that was OCRed', + ) + parser.add_argument( + '-i', + '--image', + default=None, + help='Path to the image to be placed above the text', + ) + parser.add_argument( + '--interword-spaces', + action='store_true', + default=False, + help='Add spaces between words', + ) + parser.add_argument('hocrfile', help='Path to the hocr file to be parsed') + parser.add_argument('outputfile', help='Path to the PDF file to be generated') + args = parser.parse_args() + + hocr = HocrTransform(hocr_filename=args.hocrfile, dpi=args.resolution) + hocr.to_pdf( + out_filename=args.outputfile, + image_filename=args.image, + show_bounding_boxes=args.boundingboxes, + interword_spaces=args.interword_spaces, + ) + diff --git a/hocrtransformpdf.py b/hocrtransformpdf.py new file mode 100755 index 0000000..17ef814 --- /dev/null +++ b/hocrtransformpdf.py @@ -0,0 +1,518 @@ +#!venv/bin python3 +# +# Copyright (c) 2010, Jonathan Brinley +# Original version from: https://github.com/jbrinley/HocrConverter +# +# Copyright (c) 2013-14, Julien Pfefferkorn +# Modifications +# +# Copyright (c) 2015-16, James R. 
Barlow +# Set text to transparent + +# Copyright (c) 2022, WordMord & Alex Roidl +# Set text back to visible and change bounding boxes +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to +# the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +import argparse +import os +import re +from itertools import chain +from math import atan, cos, sin +from pathlib import Path +from typing import Any, NamedTuple, Optional, Tuple, Union +from xml.etree import ElementTree + +from reportlab.lib.colors import black, cyan, magenta, red +from reportlab.lib.units import inch +from reportlab.pdfgen.canvas import Canvas +from reportlab.pdfbase import pdfmetrics +from reportlab.pdfbase.ttfonts import TTFont +from reportlab.pdfbase.pdfmetrics import registerFontFamily + + + +# According to Wikipedia these languages are supported in the ISO-8859-1 character +# set, meaning reportlab can generate them and they are compatible with hocr, +# assuming Tesseract has the necessary languages installed. 
# Note that there may not be language packs for them.
HOCR_OK_LANGS = frozenset(
    [
        # Languages fully covered by Latin-1:
        'afr',  # Afrikaans
        'alb',  # Albanian
        'ast',  # Leonese
        'baq',  # Basque
        'bre',  # Breton
        'cos',  # Corsican
        'eng',  # English
        'eus',  # Basque
        'fao',  # Faroese
        'gla',  # Scottish Gaelic
        'glg',  # Galician
        'glv',  # Manx
        'ice',  # Icelandic
        'ind',  # Indonesian
        'isl',  # Icelandic
        'ita',  # Italian
        'ltz',  # Luxembourgish
        'mal',  # Malay Rumi
        'mga',  # Irish
        'nor',  # Norwegian
        'oci',  # Occitan
        'por',  # Portuguese
        'roh',  # Romansh
        'sco',  # Scots
        'sma',  # Sami
        'spa',  # Spanish
        'sqi',  # Albanian
        'swa',  # Swahili
        'swe',  # Swedish
        'tgl',  # Tagalog
        'wln',  # Walloon
        # Languages supported by Latin-1 except for a few rare characters that OCR
        # is probably not trained to recognize anyway:
        'cat',  # Catalan
        'cym',  # Welsh
        'dan',  # Danish
        'deu',  # German
        'dut',  # Dutch
        'est',  # Estonian
        'fin',  # Finnish
        'fra',  # French
        'hun',  # Hungarian
        'kur',  # Kurdish
        'nld',  # Dutch
        'wel',  # Welsh
    ]
)


Element = ElementTree.Element


class Rect(NamedTuple):  # pylint: disable=inherit-non-class
    """A rectangle for managing PDF coordinates."""

    x1: Any
    y1: Any
    x2: Any
    y2: Any


class HocrTransformError(Exception):
    """Raised when an hOCR document cannot be converted."""


class HocrTransform:

    """
    A class for converting documents from the hOCR format.
    For details of the hOCR format, see:
    http://kba.cloud/hocr-spec/
    """

    # Name under which to_pdf() registers the regular TrueType face; every
    # line of text is drawn (and measured) with this font.
    _DRAW_FONT = 'Greek'

    box_pattern = re.compile(r'bbox((\s+\d+){4})')
    baseline_pattern = re.compile(
        r'''
        baseline \s+
        ([\-\+]?\d*\.?\d*) \s+  # +/- decimal float
        ([\-\+]?\d+)            # +/- int''',
        re.VERBOSE,
    )
    # Keys must be single code points (str.maketrans rejects longer string
    # keys), so the ligature characters are spelled as escapes. The ffi/ffl
    # replacements keep zero-width non-joiners between the letters.
    ligatures = str.maketrans(
        {
            '\ufb00': 'ff',
            '\ufb03': 'f\u200cf\u200ci',
            '\ufb04': 'f\u200cf\u200cl',
            '\ufb01': 'fi',
            '\ufb02': 'fl',
        }
    )

    def __init__(self, *, hocr_filename: Union[str, Path], dpi: float):
        """
        Parse an hOCR file and record the page size in PDF points.

        Arguments:
            hocr_filename: Path of the hOCR file to parse.
            dpi: Resolution used to convert hOCR pixel coordinates to points.

        Raises:
            HocrTransformError: If the document has no ``ocr_page`` div.
        """
        self.dpi = dpi
        self.hocr = ElementTree.parse(os.fspath(hocr_filename))

        # if the hOCR file has a namespace, ElementTree requires its use to
        # find elements
        matches = re.match(r'({.*})html', self.hocr.getroot().tag)
        self.xmlns = matches.group(1) if matches else ''

        # get dimension in pt (not pixel!!!!) of the OCRed image; only the
        # first ocr_page is considered
        self.width, self.height = None, None
        page = self.hocr.find(self._child_xpath('div', 'ocr_page'))
        if page is None:
            raise HocrTransformError("hocr file is missing page dimensions")
        pt_coords = self.pt_from_pixel(self.element_coordinates(page))
        self.width = pt_coords.x2 - pt_coords.x1
        self.height = pt_coords.y2 - pt_coords.y1

    def __str__(self):  # pragma: no cover
        """
        Return the textual content of the HTML body
        """
        if self.hocr is None:
            return ''
        body = self.hocr.find(self._child_xpath('body'))
        # Compare against None: an Element without child elements is falsy
        # even when it carries text.
        if body is not None:
            return self._get_element_text(body)
        return ''

    def _get_element_text(self, element: Element):
        """
        Return the textual content of the element and its children
        (including the element's own tail text)
        """
        parts = [element.text or '']
        parts.extend(self._get_element_text(child) for child in element)
        parts.append(element.tail or '')
        return ''.join(parts)

    @classmethod
    def element_coordinates(cls, element: Element) -> Rect:
        """
        Returns a tuple containing the coordinates of the bounding box around
        an element, read from the ``bbox`` entry of its ``title`` attribute.
        A Rect of zeros is returned when there is no such entry.
        """
        matches = cls.box_pattern.search(element.attrib.get('title', ''))
        if matches:
            return Rect._make(int(value) for value in matches.group(1).split())
        return Rect(0, 0, 0, 0)

    @classmethod
    def baseline(cls, element: Element) -> Tuple[float, float]:
        """
        Returns a tuple containing the baseline slope and intercept, read from
        the ``baseline`` entry of the ``title`` attribute. ``(0.0, 0.0)`` is
        returned when the entry is absent or its slope is not a valid number.
        """
        matches = cls.baseline_pattern.search(element.attrib.get('title', ''))
        if matches:
            try:
                return float(matches.group(1)), int(matches.group(2))
            except ValueError:
                # The slope pattern also accepts a lone '.' or sign, which
                # float() rejects; treat that as "no baseline".
                pass
        return (0.0, 0.0)

    def pt_from_pixel(self, pxl) -> Rect:
        """
        Returns the quantity in PDF units (pt) given quantity in pixels
        """
        return Rect._make((c / self.dpi * inch) for c in pxl)

    def _child_xpath(self, html_tag: str, html_class: Optional[str] = None) -> str:
        """
        Build a descendant XPath for ``html_tag`` (namespace-qualified when
        the document has one), optionally restricted to an exact class value.
        """
        xpath = f".//{self.xmlns}{html_tag}"
        if html_class:
            xpath += f"[@class='{html_class}']"
        return xpath

    @classmethod
    def replace_unsupported_chars(cls, s: str) -> str:
        """
        Given an input string, returns the corresponding string with the
        ligature characters listed in ``ligatures`` expanded to plain letters
        (to allow easy search in the PDF file)
        """
        return s.translate(cls.ligatures)

    def topdown_position(self, element):
        """
        Sort key: the bottom edge (y2, in points) of the element's box.
        """
        pxl_line_coords = self.element_coordinates(element)
        line_box = self.pt_from_pixel(pxl_line_coords)
        # Coordinates here are still in the hocr coordinate system, so 0 on the y axis
        # is the top of the page and increasing values of y will move towards the
        # bottom of the page.
        return line_box.y2

    def to_pdf(
        self,
        *,
        out_filename: Path,
        image_filename: Optional[Path] = None,
        show_bounding_boxes: bool = False,
        fontname: str = "Helvetica",
        invisible_text: bool = False,
        interword_spaces: bool = False,
    ) -> None:
        """
        Creates a single-page PDF containing the hOCR text, drawn on top of an
        optional page-sized image.
        Text is positioned according to the bounding box of the lines in
        the hOCR file.
        The image need not be identical to the image used to create the hOCR
        file.
        It can have a lower resolution, different color mode, etc.

        Arguments:
            out_filename: Path of PDF to write.
            image_filename: Image drawn first, scaled to fill the page. If
                omitted, only the text is drawn.
            show_bounding_boxes: Draw a filled rectangle over the bounding box
                of every non-empty paragraph, for debugging.
            fontname: Accepted for interface compatibility. Text is always
                drawn with the 'Greek' font registered below.
            invisible_text: If True, text is rendered invisible so that is
                selectable but never drawn. If False, text is visible.
            interword_spaces: If True, append a space to every word rather
                than drawing each word without spaces. Generally this improves
                text extraction.
        """
        # Font files are looked up relative to the current working directory.
        pdfmetrics.registerFont(TTFont(self._DRAW_FONT, 'static/fonts/greek.ttf'))
        pdfmetrics.registerFont(TTFont('GreekB', 'static/fonts/greek-bold.ttf'))
        registerFontFamily(self._DRAW_FONT, normal=self._DRAW_FONT, bold='GreekB')

        # create the PDF file
        # page size in points (1/72 in.)
        pdf = Canvas(
            os.fspath(out_filename),
            pagesize=(self.width, self.height),
            pageCompression=1,
        )

        # The image goes down before any text so the text stays on top of it.
        if image_filename is not None:
            pdf.drawImage(
                os.fspath(image_filename), 0, 0, width=self.width, height=self.height
            )

        pdf.setStrokeColor(black)
        pdf.setFillColor(black)
        pdf.setLineWidth(1)

        # draw bounding box for each non-empty paragraph
        if show_bounding_boxes:  # pragma: no cover
            for elem in self.hocr.iterfind(self._child_xpath('p', 'ocr_par')):
                if not self._get_element_text(elem).rstrip():
                    continue
                pt = self.pt_from_pixel(self.element_coordinates(elem))
                pdf.rect(
                    pt.x1, self.height - pt.y2, pt.x2 - pt.x1, pt.y2 - pt.y1, fill=1
                )

        found_lines = False
        for line in sorted(
            chain(
                self.hocr.iterfind(self._child_xpath('span', 'ocr_header')),
                self.hocr.iterfind(self._child_xpath('span', 'ocr_line')),
                self.hocr.iterfind(self._child_xpath('span', 'ocr_textfloat')),
            ),
            key=self.topdown_position,
        ):
            found_lines = True
            self._do_line(
                pdf,
                line,
                "ocrx_word",
                fontname,
                invisible_text,
                interword_spaces,
                show_bounding_boxes,
            )

        if not found_lines:
            # Tesseract did not report any lines (just words)
            root = self.hocr.find(self._child_xpath('div', 'ocr_page'))
            self._do_line(
                pdf,
                root,
                "ocrx_word",
                fontname,
                invisible_text,
                interword_spaces,
                show_bounding_boxes,
            )

        # finish up the page and save it
        pdf.showPage()
        pdf.save()

    @classmethod
    def polyval(cls, poly, x):  # pragma: no cover
        """Evaluate the line ``poly[0] * x + poly[1]``."""
        return x * poly[0] + poly[1]

    def _do_line(
        self,
        pdf: "Canvas",  # quoted: only needed by type checkers, not at runtime
        line: Optional[Element],
        elemclass: str,
        fontname: str,
        invisible_text: bool,
        interword_spaces: bool,
        show_bounding_boxes: bool,
    ):
        """
        Draw the words of one hOCR line (or of the whole page) onto ``pdf``.

        Arguments:
            pdf: Canvas to draw on.
            line: Element whose descendant word spans are drawn. Nothing is
                drawn when it is None or has no child elements.
            elemclass: Class of the descendant ``span`` elements to draw.
            fontname: Unused; text is drawn with the 'Greek' font.
            invisible_text: If True, use text render mode 3 (invisible).
            interword_spaces: If True, append a space to every word.
            show_bounding_boxes: Unused; no per-line debug drawing is done.
        """
        # Explicit form of the Element truth test: missing or childless.
        if line is None or len(line) == 0:
            return
        line_box = self.pt_from_pixel(self.element_coordinates(line))
        line_height = line_box.y2 - line_box.y1

        slope, pxl_intercept = self.baseline(line)
        if abs(slope) < 0.005:
            slope = 0.0
        cos_a = cos(atan(slope))

        text = pdf.beginText()
        intercept = pxl_intercept / self.dpi * inch

        # Don't allow the font to break out of the bounding box. Division by
        # cos_a accounts for extra clearance between the glyph's vertical axis
        # on a sloped baseline and the edge of the bounding box.
        fontsize = (line_height - abs(intercept)) / cos_a * 1.2
        text.setFont(self._DRAW_FONT, fontsize)
        if invisible_text:
            text.setTextRenderMode(3)  # Invisible (indicates OCR text)

        # Intercept is normally negative, so this places it above the bottom
        # of the line box
        baseline_y2 = self.height - (line_box.y2 + intercept)

        # The line is not rotated; only its origin is set.
        text.setTextOrigin(line_box.x1, baseline_y2)
        pdf.setFillColor(black)  # text in black

        for elem in line.findall(self._child_xpath('span', elemclass)):
            elemtxt = self.replace_unsupported_chars(
                self._get_element_text(elem).strip()
            )
            if elemtxt == '':
                continue

            if interword_spaces:
                # if `--interword-spaces` is true, append a space
                # to the end of each text element to allow simpler PDF viewers
                # such as PDF.js to better recognize words in search and copy
                # and paste. Do not remove space from last word in line, even
                # though it would look better, because it will interfere with
                # naive text extraction. \n does not work either.
                elemtxt += ' '

            box = self.pt_from_pixel(self.element_coordinates(elem))

            # Adjust relative position of cursor.
            # .moveCursor() is relative to start of the text line, where the
            # "text line" means whatever reportlab defines it as. Do not use
            # .getCursor(), since moveCursor() rather unintuitively plans
            # its moves relative to .getStartOfLine().
            cursor = text.getStartOfLine()
            text.moveCursor(box.x1 - cursor[0], baseline_y2 - cursor[1])

            # If reportlab tells us this word is 0 units wide, our best seems
            # to be to suppress this text. Measure with the font that is
            # actually used for drawing.
            if pdf.stringWidth(elemtxt, self._DRAW_FONT, fontsize) > 0:
                text.textOut(elemtxt)
        pdf.drawText(text)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Convert hocr file to PDF')
    parser.add_argument(
        '-b',
        '--boundingboxes',
        action="store_true",
        default=False,
        help='Show bounding boxes borders',
    )
    parser.add_argument(
        '-r',
        '--resolution',
        type=int,
        default=300,
        help='Resolution of the image that was OCRed',
    )
    parser.add_argument(
        '-i',
        '--image',
        default=None,
        help='Path to the image to be placed above the text',
    )
    parser.add_argument(
        '--interword-spaces',
        action='store_true',
        default=False,
        help='Add spaces between words',
    )
    parser.add_argument('hocrfile', help='Path to the hocr file to be parsed')
    parser.add_argument('outputfile', help='Path to the PDF file to be generated')
    args = parser.parse_args()

    hocr = HocrTransform(hocr_filename=args.hocrfile, dpi=args.resolution)
    hocr.to_pdf(
        out_filename=args.outputfile,
        image_filename=args.image,
        show_bounding_boxes=args.boundingboxes,
        interword_spaces=args.interword_spaces,
    )

# Patch metadata that followed this file in the dump, kept verbatim:
# diff --git a/static/fonts/AC-Poiret.ttf b/static/fonts/AC-Poiret.ttf new file mode 100644 index 0000000..7005cd1 Binary files /dev/null and b/static/fonts/AC-Poiret.ttf differ
# diff --git a/static/fonts/Compagnon-Roman.otf b/static/fonts/Compagnon-Roman.otf new file mode 100644 index 0000000..e79cc36 Binary files /dev/null and b/static/fonts/Compagnon-Roman.otf differ
# diff --git a/static/fonts/SolideMirage-Etroit.otf b/static/fonts/SolideMirage-Etroit.otf new file mode 100644 index 0000000..b86b6b5 Binary files /dev/null and b/static/fonts/SolideMirage-Etroit.otf differ
# diff --git a/static/fonts/greek-bold.ttf
b/static/fonts/greek-bold.ttf new file mode 100644 index 0000000..ac0082c Binary files /dev/null and b/static/fonts/greek-bold.ttf differ diff --git a/static/fonts/greek.ttf b/static/fonts/greek.ttf new file mode 100755 index 0000000..1f6340f Binary files /dev/null and b/static/fonts/greek.ttf differ diff --git a/static/fonts/zarathustra-v01.otf b/static/fonts/zarathustra-v01.otf new file mode 100644 index 0000000..e4708f5 Binary files /dev/null and b/static/fonts/zarathustra-v01.otf differ diff --git a/static/hocr/anthropoktonia.hocr b/static/hocr/anthropoktonia.hocr new file mode 100644 index 0000000..e69de29 diff --git a/static/hocr/gynaikoktonia (another copy).hocr b/static/hocr/gynaikoktonia (another copy).hocr new file mode 100644 index 0000000..6cad21c --- /dev/null +++ b/static/hocr/gynaikoktonia (another copy).hocr @@ -0,0 +1,142 @@ + + + + + + + + + + +
+
+

+ ΕΓΚΛΗΜΑΤΑ ΚΑΤΑ ΤΗΣ ΖΩΗΣ ΚΑΙ ΠΡΟΣΒΟΛΕΣ ΤΩΝ ΘΥΛΗΚΟΤΗΤ* + + Ι. Εγκλήματα βλάβης της ζωής της ●●●●●●●● + +

+
+
+

+ Άρθρο 299 + + Γυναικοκτονία με δόλο + +

+
+
+

+ 1. Όποιος σκότωσε άλλη τιμωρείται με κάθειρξη ισόβια ή πρόσκαιρη τουλάχιστον δέκα ετών. + + 2. Αν η πράξη αποφασίστηκε και εκτελέστηκε σε βρασμό ψυχικής ορμής, επιβάλλεται κάθειρξη. + +

+
+
+

+ ●●●●● ●●● + + ●●●●●●●●●●●●● ●●●● ●●●●●●●● + +

+
+
+

+ ●●●●●● ●●●●●●●●● ●●● ●●●●●●●● ●●●●●●●●●●●●● ●●●●●● ●●● ●●●●●●●● ●●● ●●●●●●● ●●●●●●●● + + ●●● ●●●●●●● ●●● ●●● ●●●●● ●●● ●●●●● ●●● ●●●●●● ●●● ●●●●●● ●●●●●●●● ●●●●●●●●●● ●● ●●●●●●●●● + +

+
+
+

+ Άρθρο 301 + + Συμμετοχή σε αυτοκτονία + +

+
+
+

+ Όποιος κατέπεισε άλλην να αυτοκτονήσει, αν τελέστηκε η αυτοκτονία ή έγινε απόπειρά της, + + καθώς και όποιος έδωσε βοήθεια κατά την τέλεσή της, η οποία διαφορετικά δεν θα ήταν εφικτή, + + τιμωρείται με φυλάκιση. + +

+
+
+

+ Άρθρο 302 + + Γυναικοκτονία από θεσμική αμέλεια + +

+
+
+

+ Όποιος από αμέλεια σκότωσε άλλην, τιμωρείται με φυλάκιση τουλάχιστον τριών μηνών. + +

+
+
+

+ Άρθρο 303 + + Παιδοκτονία + +

+
+
+

+ Πατέρας που με πρόθεση σκότωσε την κόρη του κατά ή μετά τον τοκετό, αλλά ενώ εξακολουθούσε + + ακόμη η διατάραξη του οργανισμού της από αυτόν, τιμωρείται με κάθειρξη έως δέκα έτη. + +

+
+
+

+ ●●● ●●●●●●●●● ●●● ●●●●●●● + +

+
+
+

+ ●●●●● ●●● + + ●●●●●●● ●●● ●●●●●● + +

+
+
+

+ ●● ●●●●●● ●●●●● ●● ●●●●●●●●● ●●● ●●●●●● ●●●●●●●●● ●●● ●●●●● ●●● ●●●●●●●●●● ●● ●●●●●●●● ●●● + + ●●●● ●●●● + +

+ +

+ 2. Όποιος με τη συναίνεση της εγκύου ή των προσώπων που έχουν τη γονική μέριμνα ή + + επιμέλειά της αν αυτή είναι ανίκανη να συναινέσει, διακόπτει την εγκυ μοσύνη της, ●●●●●●●●●● ●● + + ●●●●●●●● ●●● ●●●● ●●● Σας καλούμε να συμμετέχετε σε μία συζήτηση διερώτησης του + νόμου μέσω της mailing list: https://we.lurk.org/mailman3/lists/wordmord.we.lurk.org/ + καμία* επισημείωση είναι μόνη*, WordMord, 2022, Free Art License + +

+
+
+

+ 78 + +

+
+
+ + diff --git a/static/hocr/gynaikoktonia.hocr b/static/hocr/gynaikoktonia.hocr new file mode 100644 index 0000000..2f17b51 --- /dev/null +++ b/static/hocr/gynaikoktonia.hocr @@ -0,0 +1,142 @@ + + + + + + + + + + +
+
+

+ ΕΓΚΛΗΜΑΤΑ ΚΑΤΑ ΤΗΣ ΖΩΗΣ ΚΑΙ ΠΡΟΣΒΟΛΕΣ ΤΩΝ ΘΥΛΗΚΟΤΗΤ* + + Ι. Εγκλήματα βλάβης της ζωής της ●●●●●●●● + +

+
+
+

+ Άρθρο 299 + + Γυναικοκτονία με δόλο + +

+
+
+

+ 1. Όποιος σκότωσε άλλη τιμωρείται με κάθειρξη ισόβια ή πρόσκαιρη τουλάχιστον δέκα ετών. + + 2. Αν η πράξη αποφασίστηκε και εκτελέστηκε σε βρασμό ψυχικής ορμής, επιβάλλεται κάθειρξη. + +

+
+
+

+ ●●●●● ●●● + + ●●●●●●●●●●●●● ●●●● ●●●●●●●● + +

+
+
+

+ ●●●●●● ●●●●●●●●● ●●● ●●●●●●●● ●●●●●●●●●●●●● ●●●●●● ●●● ●●●●●●●● ●●● ●●●●●●● ●●●●●●●● + + ●●● ●●●●●●● ●●● ●●● ●●●●● ●●● ●●●●● ●●● ●●●●●● ●●● ●●●●●● ●●●●●●●● ●●●●●●●●●● ●● ●●●●●●●●● + +

+
+
+

+ Άρθρο 301 + + Συμμετοχή σε αυτοκτονία + +

+
+
+

+ Όποιος κατέπεισε άλλην να αυτοκτονήσει, αν τελέστηκε η αυτοκτονία ή έγινε απόπειρά της, + + καθώς και όποιος έδωσε βοήθεια κατά την τέλεσή της, η οποία διαφορετικά δεν θα ήταν εφικτή, + + τιμωρείται με φυλάκιση. + +

+
+
+

+ Άρθρο 302 + + Γυναικοκτονία από θεσμική αμέλεια + +

+
+
+

+ Όποιος από αμέλεια σκότωσε άλλην, τιμωρείται με φυλάκιση τουλάχιστον τριών μηνών. + +

+
+
+

+ Άρθρο 303 + + Παιδοκτονία + +

+
+
+

+ Πατέρας που με πρόθεση σκότωσε την κόρη του κατά ή μετά τον τοκετό, αλλά ενώ εξακολουθούσε + + ακόμη η διατάραξη του οργανισμού της από αυτόν, τιμωρείται με κάθειρξη έως δέκα έτη. + +

+
+
+

+ ●●● ●●●●●●●●● ●●● ●●●●●●● + +

+
+
+

+ ●●●●● ●●● + + ●●●●●●● ●●● ●●●●●● + +

+
+
+

+ ●● ●●●●●● ●●●●● ●● ●●●●●●●●● ●●● ●●●●●● ●●●●●●●●● ●●● ●●●●● ●●● ●●●●●●●●●● ●● ●●●●●●●● ●●● + + ●●●● ●●●● + +

+ +

+ ●● ●●●●●● ●● ●● ●●●●●●●●● ●●● ●●●●●● ●●● ●●●●●●●● ●●● ●●●●● ●● ●●●●●● ●●●●●●● + + ●●●●●●●●● ●●● ●● ●●●● ●●●●● ●●●●●●● ●● ●●●●●●●●●●● ●●●●●●●●● ●●● ●●●● ●●●●●●● ●●●● ●●●●●●●●●● ●● + + ●●●●●●●● ●●● ●●●● ●●● ●●●●●●●●● ●●●●● ●●● ●● ●●●●●●● ●●●● ●●●●●●●●● ●● ●●●●●●●● + + ●●●●●●●●●●● ●●● ●●●● ●●● ●●●●●●●●● ●●●●●● ●● ●●● ●●●● ●●●●●● ●●●●●●●● ●●●● ●● ●●●●● ●●● + +

+
+
+

+ 78 + +

+
+
+ + diff --git a/static/images/anthropoktonia.png b/static/images/anthropoktonia.png new file mode 100755 index 0000000..5cdc73c Binary files /dev/null and b/static/images/anthropoktonia.png differ diff --git a/static/images/blank.png b/static/images/blank.png new file mode 100755 index 0000000..c7748dc Binary files /dev/null and b/static/images/blank.png differ diff --git a/static/images/closed.gif b/static/images/closed.gif new file mode 100644 index 0000000..f9cb50b Binary files /dev/null and b/static/images/closed.gif differ diff --git a/static/images/open.gif b/static/images/open.gif new file mode 100644 index 0000000..64f2800 Binary files /dev/null and b/static/images/open.gif differ diff --git a/static/images/tongue-emoji.png b/static/images/tongue-emoji.png new file mode 100644 index 0000000..e65649d Binary files /dev/null and b/static/images/tongue-emoji.png differ diff --git a/static/styles (copy).css b/static/styles (copy).css new file mode 100644 index 0000000..d53ee19 --- /dev/null +++ b/static/styles (copy).css @@ -0,0 +1,87 @@ +@font-face { + font-family: Compagnon; + src: url(fonts/Compagnon-Roman.otf); + } + + +@font-face { + font-family: ACPoiret; + src: url(fonts/AC-Poiret.ttf); + } + + @font-face { + font-family: Solide-Mirage; + src: url(fonts/SolideMirage-Etroit.otf); + } + + @font-face { + font-family: Zarathustra; + src: url(fonts/zarathustra-v01.otf); + } + + + +body { + font-family:ACPoiret; +} + +#title1, #title2, #title3 { + font-family: Compagnon; + font-size: 1.2em; + text-align: right; +} + +#my_field { + height: 15rem; + width: 100rem; + word-wrap: break-word; + word-break: break-all; + font-size: 1.2em; + line-height: 1.6em; +} + + +#initialpdf, #showmonster, #empty_frame, #button1, #button2, #title1, #title2, #title3 { + visibility: hidden; +} + + +#empty_frame { + height: 68rem; + width: 40rem; + word-wrap: break-word; + word-break: break-all; + font-size: 1.2em; + line-height: 1.6em; +} + +button { + padding: 20px 20px; + 
text-align: center; + text-decoration: none; + display: inline-block; + margin: 0.2em 0.2em; + cursor: pointer; + font-size: 1em; + border-radius: 25px; + font-family: Solide-Mirage; + font-weight: bold; + box-shadow: 12px 3px rgba(253, 105, 179,0.4); + /* box-shadow: 8px 3px rgba(254, 223, 46,0.4); */ + background-color: transparent; +} + +td { +vertical-align:top; + +} + +textarea { + font-family: Zarathustra; + border-radius: 25px; + background: linear-gradient(to right,rgba(128, 128, 128,0.2), white); +} + +iframe { + border-radius: 25px; +} diff --git a/static/styles.css b/static/styles.css new file mode 100644 index 0000000..2581249 --- /dev/null +++ b/static/styles.css @@ -0,0 +1,112 @@ +@font-face { + font-family: Compagnon; + src: url(fonts/Compagnon-Roman.otf); + } + + +@font-face { + font-family: ACPoiret; + src: url(fonts/AC-Poiret.ttf); + } + + @font-face { + font-family: Solide-Mirage; + src: url(fonts/SolideMirage-Etroit.otf); + } + + @font-face { + font-family: Zarathustra; + src: url(fonts/zarathustra-v01.otf); + } + + + +body { + font-family:ACPoiret; +} + +#title1, #title2, #title3 { + font-family: Compagnon; + font-size: 1.2em; + text-align: right; +} + +#my_field { + height: 15rem; + width: 98%; + word-wrap: break-word; + word-break: break-all; + font-size: 1.2em; + line-height: 1.6em; +padding:1em; +} + + +#initialpdf, #showmonster, #empty_frame, #button1, #button2, #button4, #button5, #title1, #title2, #title3 { + visibility: hidden; +} + +#button3, #button4 { + color: #22A7A7 !important; + box-shadow: none !important; +} + +#empty_frame { + height: 50rem; + width: 95%; + word-wrap: break-word; + word-break: break-all; + font-size: 1em; + line-height: 1.6em; +padding:1em; +} + + + +button { + padding: 20px 20px; + text-align: center; + color: #8184A4; + text-decoration: none; + display: inline-block; + margin: 0.2em 0.2em; + cursor: pointer; + font-size: 1em; + border-radius: 25px; + font-family: Solide-Mirage; + font-weight: bold; + 
box-shadow: 12px 3px rgba(0, 64, 128,0.8); + /* box-shadow: 8px 3px rgba(254, 223, 46,0.4); */ + background-color: transparent; +} + + +td { +vertical-align:top; +overflow: hidden; + text-overflow: ellipsis; + word-wrap: break-word; + padding-right:1em; +} + +textarea { + font-family: Zarathustra; + border-radius: 25px; + border: 2px solid #004080; + /*background: linear-gradient(to right,rgba(128, 128, 128,0.2), white);*/ +} + +iframe { + border-radius: 25px; + width: 95%; + padding: 1em; +} + +table { + table-layout:fixed; + width:100%; +} + +/* #empty_frame, #title3 { +width:20rem; +} */ diff --git a/static/ΔΙΑΒΑΣΕΜΕ.txt b/static/ΔΙΑΒΑΣΕΜΕ.txt new file mode 100644 index 0000000..915f019 --- /dev/null +++ b/static/ΔΙΑΒΑΣΕΜΕ.txt @@ -0,0 +1,63 @@ +ΔΙΑΒΑΣΕΜΕ.txt + + +Οδηγίες χρήσης για το "καμία* επισημείωση δεν είναι μόνη" (καλύτερη προβολή σε desktop) + + +* Για να δείτε τις επισημειώσεις: + + * αντιγράψτε το μακροσκελές σύνδεσμο που θα βρείτε στο τέλος του κειμένου αυτού + + + * μπείτε στο παρακάτω διαδικτυακό εργαλείο μέσω του σύνδεσμου: http://poinikos.wordmord-ur.la/ + + + * κάντε επικόλληση του μακροσκελούς συνδέσμου στο άδειο παράθυρο + + + * πατήστε το κουμπί "ΠΡΟΒΟΛΗ PDF", για να αποκωδικοποιήσετε το μακροσκελές url + + + * πατήστε το κουμπί "ΑΝΤΙΓΡΑΦΗ ΠΟΙΝΙΚΟΥ ΚΩΔΙΚΑ" + Επιλέξτε το περιεχόμενο του PDF και αντιγράψτε με ctrl+C (cmnd+C) + Κάνετε επικόλληση με ctrl+v (cmnd+v) στο κενό παράθυρο που αναδύθηκε για να διαβάσετε την επισημείωση + Πάρτε το χρόνο σας + Μπορείτε να στείλετε τις τροποποιήσεις που προτείνετε στο mailing list του WordMord + + + * πατήστε το κουμπί "ΜΕΤΑΛΛΑΓΜΕΝΟΣ ΠΟΙΝΙΚΟΣ ΚΩΔΙΚΑΣ" για προβολή του τροποποιημένου PDF όπου η επισημείωση που προτείνει το WordMord αντικαθιστά το πρωτότυπο κείμενο του Ποινικού Κώδικα + + + * Για να συμμετέχετε στη συζήτηση μπορείτε να γραφτείτε στο mailing list εδώ: + + https://we.lurk.org/mailman3/lists/wordmord.we.lurk.org/ + + Θα συζητήσουμε και θα μοιραστούμε γνώσεις πάνω στο ερώτημα της στατικότητας της 
γλώσσας του νόμου, διερευνώντας τρόπους μετάλλαξής του + + + * Κοινοποιήστε αυτό το μήνυμα μαζί με τον σύνδεσμο του PDF κατά βούληση + + +Η καμία* επισημείωση δεν είναι μόνη είναι ένα εγχείρημα παρέμβασης στο νομικό λόγο εντός ενός ημι-δημόσιου διαλόγου που συμβαίνει μέσω του mailing list του WordMord. +Σας καλούμε να συμμετέχετε σε αυτή τη συζήτηση διερώτησης του νόμου και συγκεκριμένα του Δέκατου Πέμπτου Κεφαλαίου του Ποινικού Κώδικα, που αφορά στα Εγκλήματα κατά της Ζωής. +Το PDF του Ποινικού Κώδικα περιέχει κάποιες επισημειώσεις οι οποίες δεν είναι ορατές από την αρχή. Αυτή είναι η πρώτη από μια σειρά μεταλλάξεων που ανοίγει τη συζήτηση. + + +Με επισημειω(μα)τικούς χαιρετισμούς, + + +WordMord, + +http://wordmord-ur.la + + + +Επισημειωμένο PDF του Δέκατου Πέμπτου Κεφαλαίου του Ποινικού Κώδικα + +https://wordmord-ur.la/pdf/Dear_lovers,_this_is_a_virus._She*_invites_you_to_be_part_of_a_discussion_seeking_to_risk_the_limits_of_the_greek_penal_code/%ce%9c%cf%80%cf%81%ce%bf%cf%83%cf%84%ce%ac_%ce%bc%ce%b1%cf%82_%cf%83%cf%84%ce%b7_%ce%bc%ce%bf%ce%b9%cf%81%ce%b1%cf%83%ce%bc%ce%ad%ce%bd%ce%b7_%ce%bf%ce%b8%cf%8c%ce%bd%ce%b7_%ce%bb%ce%ac%ce%bc%cf%80%ce%b5%ce%b9_%ce%b1%cf%80%cf%8c_%cf%84%ce%bf_%cf%86%cf%89%cf%82_%cf%84%cf%89%ce%bd_%ce%bb%ce%ac%cf%80%cf%84%ce%bf%cf%80_%ce%bc%ce%b1%cf%82_%cf%84%ce%bf_%ce%94%ce%ad%ce%ba%ce%b1%cf%84%ce%bf_%ce%a0%ce%ad%ce%bc%cf%80%cf%84%ce%bf_%ce%86%cf%81%ce%b8%cf%81%ce%bf_%cf%84%ce%bf%cf%85_%ce%a0%ce%bf%ce%b9%ce%bd%ce%b9%ce%ba%ce%bf%cf%8d_%ce%9a%cf%8e%ce%b4%ce%b9%ce%ba%ce%b1/%ce%9c%cf%80%ce%bf%cf%81%ce%bf%cf%8d%ce%bc%ce%b5_%ce%bd%ce%b1_%ce%ba%ce%ac%ce%bd%ce%bf%cf%85%ce%bc%ce%b5_%cf%84%cf%81%ce%bf%cf%80%ce%bf%cf%80%ce%bf%ce%b9%ce%ae%cf%83%ce%b5%ce%b9%cf%82_%cf%80%ce%bf%cf%85_%ce%b1%ce%bd%ce%b1%cf%84%ce%b1%cf%81%ce%ac%cf%83%ce%bf%cf%85%ce%bd_%cf%84%ce%bf%ce%bd_%cf%80%ce%bf%ce%b9%ce%bd%ce%b9%ce%ba%cf%8c_%ce%ba%cf%8e%ce%b4%ce%b9%ce%ba%ce%b1,_%ce%b5%ce%ba%ce%b8%ce%ad%cf%84%ce%bf%ce%bd%cf%84%ce%ac%cf%82_%cf%84%ce%bf%ce%bd_%cf%83%ce%b5_%cf%83%cf%85%ce%
b6%ce%ae%cf%84%ce%b7%cf%83%ce%b7/%ce%a0%cf%8e%cf%82_%cf%84%ce%bf_%ce%b5%ce%af%cf%80%ce%b5%cf%82;/%ce%9c%ce%b5%cf%84%ce%b1%ce%bb%ce%bb%ce%ac%ce%be%ce%b5%ce%b9%cf%82_%cf%84%ce%bf%cf%85_%cf%80%ce%bf%ce%b9%ce%bd%ce%b9%ce%ba%ce%bf%cf%8d_%ce%ba%cf%8e%ce%b4%ce%b9%ce%ba%ce%b1...Do_you_copy;_%ce%a4%ce%bf%ce%bd_%ce%ba%ce%ac%ce%bd%ce%b5%ce%b9%cf%82_%cf%83%ce%b5%ce%bb%ce%ad%ce%ba%cf%84_%ce%ba%ce%b1%ce%b9_%ce%ba%cf%8c%cf%80%ce%b9_%ce%b1%cf%80%cf%8c_%cf%84%ce%bf_pdf/%ce%9d%ce%b1%ce%b9!_%ce%9a%ce%b1%ce%b9_%cf%84%cf%8e%cf%81%ce%b1_%cf%84%ce%b9;_%ce%a4%ce%b9_%ce%ba%ce%ac%ce%bd%cf%89_%ce%bc%ce%b5_%ce%b1%cf%85%cf%84%cf%8c;/%ce%9a%ce%ac%cf%84%cf%83%ce%b5_%ce%bc%ce%b9%cf%83%cf%8c,_%cf%87%cf%8e%cf%83%cf%84%ce%bf_%cf%83%cf%84%ce%bf_%ce%b3%ce%bf%cf%85%cf%8c%cf%81%ce%bd%cf%84.../M%ce%bc%ce%bc_(%ce%bc%ce%bc%ce%bc%ce%b7%ce%b7%ce%b7!)_%cf%84%ce%bf_Word_%ce%b4%ce%b5%ce%bd_%ce%b1%ce%bd%ce%b1%ce%b3%ce%bd%cf%89%cf%81%ce%af%ce%b6%ce%b5%ce%b9_%cf%84%ce%bf%cf%85%cf%82_%cf%87%ce%b1%cf%81%ce%b1%ce%ba%cf%84%ce%ae%cf%81%ce%b5%cf%82,_%ce%b4%ce%b5_%cf%83%cf%85%ce%b3%ce%ba%cf%81%ce%b1%cf%84%ce%b5%ce%af_%cf%84%ce%bf%cf%85_%cf%84%cf%8d%cf%80%ce%bf%cf%85%cf%82_(%cf%84%ce%bf_%cf%83%ce%ba%cf%8c%cf%84%cf%89%cf%83%ce%b5!_%ce%ba%cf%84%ce%ae%ce%bd%ce%bf%cf%82!_%ce%b2%ce%b9%ce%ac%ce%b6%ce%b5%ce%b9%cf%82_%cf%84%ce%b7_%ce%b3%ce%bb%cf%8e%cf%83%cf%83%ce%b1_%ce%bc%ce%bf%cf%85!)/_%ce%92%ce%bb%ce%ad%cf%80%cf%89_(%cf%84%ce%bf_%ce%b2%ce%bb%ce%ad%cf%80%cf%89,_%ce%b5%ce%af%ce%bc%ce%b1%ce%b9_%ce%bc%ce%ac%cf%81%cf%84%cf%85%cf%81%ce%b1%cf%82_Mord)_%ce%ba%ce%ac%cf%84%ce%b9_%cf%80%ce%b5%cf%81%ce%af%ce%b5%cf%81%ce%b3%ce%b1_%cf%83%cf%8d%ce%bc%ce%b2%ce%bf%ce%bb%ce%b1/(%ce%a3-%ce%95-%ce%99-%ce%92_%ce%9c-%ce%99!!!)/%ce%94%ce%b5%ce%bd_%ce%be%ce%ad%cf%81%cf%89_%ce%b1%ce%bd_%ce%bc%cf%80%ce%bf%cf%81%cf%8e._%ce%a4%ce%97_%ce%93%ce%9b%ce%a9%ce%a3%ce%a3%ce%91_%ce%a3%ce%9f%ce%a5_%ce%9c%ce%95%ce%a3%ce%91!/%ce%98%ce%b1_%cf%84%ce%bf_%ce%ba%ce%ac%ce%bd%cf%89_%cf%80%ce%ad%ce%b9%cf%83%cf%84_%cf%83%cf%84%ce%bf_%cf%84%cf%83%ce%b1%cf%84/%ce%91!_%ce%92%ce%
bb%ce%ad%cf%80%cf%89_%ce%ac%ce%bb%ce%bb%ce%bf_%ce%ba%ce%b5%ce%af%ce%bc%ce%b5%ce%bd%ce%bf_%ce%b5%ce%b4%cf%8e._%ce%94%ce%bf%cf%8d%ce%bb%ce%b5%cf%88%ce%b5_%cf%84%ce%bf_%ce%b5%cf%81%ce%b3%ce%b1%ce%bb%ce%b5%ce%af%ce%bf!/%ce%9b%ce%af%ce%b3%ce%bf_%ce%ac%cf%84%ce%bf%ce%bd%ce%bf_%cf%84%ce%bf_%ce%ba%cf%8c%ce%b2%cf%89/%ce%9d%ce%b1%ce%b9_%ce%bf%ce%b9_%cf%84%cf%8c%ce%bd%ce%bf%ce%b9_%ce%b5%ce%af%ce%bd%ce%b1%ce%b9_%cf%83%cf%84%ce%bf_%cf%83%cf%87%ce%bf%ce%bb%ce%b5%ce%b9%ce%bf/%ce%bc%ce%b1%cf%85%cf%83%cf%89%ce%bb%ce%b5%ce%b9%ce%bf/%ce%bc%ce%bf%cf%85%cf%83%ce%b5%ce%b9%ce%bf/irinodikio/%cf%83%cf%86%ce%b1%ce%b3%ce%b5%ce%b9%ce%bf/%ce%bd%ce%b5%ce%ba%cf%81%ce%bf%cf%84%ce%b1%cf%86%ce%b5%ce%b9%ce%bf/%cf%83%cf%85%ce%bb%ce%bb%ce%bf%ce%b3%ce%b9%ce%ba%ce%b7_%ce%bc%ce%bd%ce%b7%ce%bc%ce%b7.pdf + + + +καμία* επισημείωση δεν είναι μόνη, +WordMord(ɔ), 2022, Free Art Licence, +http://artlibre.org/licence/lal/en/ diff --git a/templates/base.html b/templates/base.html new file mode 100644 index 0000000..b0c899c --- /dev/null +++ b/templates/base.html @@ -0,0 +1,24 @@ + + + + + + + + + + {% block title %} {% endblock %} Ποινικός Κώδικας WordMord + + + +
+ {% block content %} {% endblock %} +
+ + diff --git a/templates/results (copy).html b/templates/results (copy).html new file mode 100644 index 0000000..b4070cc --- /dev/null +++ b/templates/results (copy).html @@ -0,0 +1,121 @@ +{% extends 'base.html' %} + +{% block content %} + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + +
+ perioxi epikollimenou keimenou / pasted text area +
+
+ + + + +
+ epishmeiomenos poinikos kodikas / annotated penal code + +
+ metalucktriomeno PDF / distorted PDF +
+
+
+ +
+
+ +
+ + + + + + +{% endblock %} diff --git a/templates/results.html b/templates/results.html new file mode 100644 index 0000000..ccbe22a --- /dev/null +++ b/templates/results.html @@ -0,0 +1,116 @@ +{% extends 'base.html' %} + +{% block content %} + + + + + + + + + + + + + + + + + + + +
+ + +
+ + + + + + + +
+
+ +
+
+ + + + + +
+ + + +{% endblock %} diff --git a/templates/results_2.html b/templates/results_2.html new file mode 100644 index 0000000..df40383 --- /dev/null +++ b/templates/results_2.html @@ -0,0 +1,101 @@ +{% extends 'base.html' %} + +{% block content %} + + + + + + + + + +
+ + + + + + +
+ perioxi epikollimenou keimenou / pasted text area + + epishmeiomenos poinikos kodikas / annotated penal code +
+ +
+ + metalucktriomeno PDF / distorted PDF + + +
+
+ + + + + + +{% endblock %}