data-workers-publication/functions.py

#! /etc/bin/python3

import random, re, subprocess
from math import sin

from hyphen import Hyphenator
import textwrap
from textwrap2 import fill

import nltk
from nltk.tokenize import RegexpTokenizer
# Module-wide tokenizer shared by selfwritten_linebreaks() and
# counting_pattern(). The pattern matches either: one arbitrary character,
# a run of word characters and one more arbitrary character; a run of word
# characters at the very start of the text; or a run of word characters at
# the very end of the text.
tokenizer = RegexpTokenizer(r'[\s\W\w]\w+[\s\W\w\.]|^\w+|\w+$') # initialize tokenizer

# Language switch for the whole module: insert_linebreaks() uses it to pick
# the hyphenation dictionary and add_headers() to pick the figlet line width.
# Only 'fr' and 'en' are handled explicitly by those functions.
language = 'fr'
# language = 'en'

def selfwritten_linebreaks(string, linelength):
	"""Greedy fallback line breaker.

	The text is cut into tokens with the module-level ``tokenizer``; a text
	containing 'http' is not tokenized and is walked character by character
	instead. Tokens are appended to the current line, and the line is closed
	with a newline as soon as the running length counter (which starts at 1)
	reaches ``linelength``. The last token always closes the output without
	a trailing newline.

	Returns the re-broken text as a single string.
	"""
	tokens = string if 'http' in string else tokenizer.tokenize(string)
	final_index = len(tokens) - 1

	output = ''
	current_line = ''
	used = 1
	for index, token in enumerate(tokens):
		# The counter uses the token length *before* any space is stripped.
		used += len(token)

		if not current_line:
			# A fresh line must not start with a space.
			if token[0] == ' ':
				token = token[1:]
			if token == ' ':
				continue

		current_line += token
		if index == final_index:
			# Flush whatever is pending; no newline after the last line.
			output += current_line
		elif used >= linelength:
			output += current_line + '\n'
			current_line = ''
			used = 1
	return output

def insert_linebreaks(string, linelength, type='character', double_linebreaks=False):
	"""Break ``string`` into lines of at most ``linelength`` characters.

	Parameters:
		string: the text to break. In 'word' and 'wrap' mode every newline
			in the input separates two paragraphs that are wrapped
			independently.
		linelength: maximum line width.
		type: one of
			'character' - hard cut after every ``linelength`` characters,
			'word'      - hyphenating wrap through ``fill`` (textwrap2) with
			              a Hyphenator for the module-level ``language``;
			              falls back to selfwritten_linebreaks() when that
			              raises,
			'wrap'      - plain ``textwrap.wrap``.
		double_linebreaks: when true, paragraphs ('word' / 'wrap' mode) are
			separated by an empty line instead of a single newline. Nothing
			is appended after the last paragraph.

	Returns the re-broken text, or None for an unknown ``type`` (unchanged
	from the previous behaviour).
	"""
	# Paragraphs are glued back with this separator; joining (rather than
	# appending) guarantees nothing follows the last paragraph.
	separator = '\n\n' if double_linebreaks else '\n'

	if type == 'word':
		if language == 'en':
			hyphenator = Hyphenator('en_US')
		if language == 'fr':
			hyphenator = Hyphenator('fr_FR')
		wrapped = []
		for paragraph in string.split('\n'):
			try:
				broken = fill(paragraph, width=linelength, use_hyphenator=hyphenator)
			except Exception as e:
				# Deliberate best effort: any failure of the hyphenating
				# wrapper (including an unsupported ``language``, which
				# leaves ``hyphenator`` unbound) falls back to the simple
				# breaker.
				print('Error:', e)
				print('>>> Hyphenator didn\'t work, selfwritten_linebreaks used instead.')
				broken = selfwritten_linebreaks(paragraph, linelength-3) # Calibration
			wrapped.append(broken)
		return separator.join(wrapped)

	if type == 'wrap':
		return separator.join(
			'\n'.join(textwrap.wrap(paragraph, width=linelength))
			for paragraph in string.split('\n')
		)

	if type == 'character':
		# A non-positive width cannot be honoured; hand the text back as is.
		if linelength < 1:
			return string
		# Slice instead of counting by hand: the previous loop dropped the
		# character that triggered each break and usually lost the trailing
		# partial line.
		return '\n'.join(
			string[start:start + linelength]
			for start in range(0, len(string), linelength)
		)

	# Unknown mode: callers have always received None here.
	return None

def fill_page(string, page_height=70):
	"""Pad ``string`` with empty lines up to a whole number of pages.

	Parameters:
		string: the text; lines are counted by splitting on newlines.
		page_height: number of lines on one page (70 by default, the value
			that used to be hard-coded).

	Returns the padded text followed by one extra newline. Progress
	figures are printed to stdout.

	Raises ValueError when ``page_height`` is smaller than 1.
	"""
	if page_height < 1:
		raise ValueError('page_height must be at least 1')

	print('--- fill_page() starts ---')
	total_lines = len(string.split('\n'))
	print('    total_lines   :', total_lines)
	# Integer arithmetic; the former int(total_lines / 70) went through a float.
	total_pages, overflow = divmod(total_lines, page_height)
	print('    total_pages   :', total_pages)
	print('    full_pages    :', page_height * total_pages)

	# Lines missing to complete the last, partially filled page.
	fill_up_lines = page_height - overflow if overflow else 0
	print('    fill_up_lines :', fill_up_lines)

	# Each appended newline adds exactly one line, so the result is always a
	# multiple of page_height lines. The old "safety check" that truncated
	# the text could never trigger and has been removed.
	page = string + '\n' * fill_up_lines

	print('    page(s) length:', len(page.split('\n')))
	print('--- fill_page() ends ---')
	return page + '\n'

def insert_text_block(string, inserted, left, width):
	"""Place the lines of ``inserted`` to the right of the lines of ``string``.

	Row by row, the left line is padded with spaces up to column ``left`` and
	the matching line of ``inserted`` is appended. A left line that is already
	``left`` characters or longer (or that does not exist) is replaced by
	``left`` spaces in that row. Rows without a right line keep the left line
	untouched.

	``width`` is accepted but not used.

	Returns the merged rows joined by newlines, without a trailing newline.
	"""
	left_rows = string.split('\n')
	right_rows = inserted.split('\n')

	merged = []
	for index in range(max(len(left_rows), len(right_rows))):
		# Build the left part of the row: padded text, or pure spaces.
		if index < len(left_rows) and len(left_rows[index]) < left:
			gutter = left_rows[index].ljust(left)
		else:
			gutter = ' ' * left

		if index < len(right_rows):
			merged.append(gutter + right_rows[index])
		else:
			# No right column line left: the left line passes through as is.
			merged.append(left_rows[index])

	return '\n'.join(merged)

def insert_symbol_background(string, linelength, symbols, multiplier):
	"""Scatter random symbols over the blank areas of ``string``.

	Parameters:
		string: the text; processed line by line (split on newlines).
		linelength: every non-empty line is followed by one space and then
			filled with random symbols from column ``len(line)`` up to
			this column.
		symbols: sequence (list or string) the random symbols are drawn
			from. It is no longer modified.
		multiplier: for line number x (1-based), ``int(x * multiplier)``
			extra spaces are added to the symbol pool. The additions
			accumulate, so symbols thin out further down the text.

	Inside a line, a space is replaced by a random symbol only when both
	its neighbours are spaces. The last character of a line is never
	replaced. Empty lines stay empty.

	Returns the decorated text; every line, including the last, ends with
	a newline.
	"""
	# Private copy: the former ``symbols += ...`` extended a caller-supplied
	# list in place, so the caller's list kept growing with every call.
	pool = list(symbols)
	pieces = []

	for line_number, line in enumerate(string.split('\n'), start=1):
		# Apply the multiplier, to create a gradient effect :)
		pool.extend(' ' * int(line_number * multiplier))

		last_index = len(line) - 1
		for c, character in enumerate(line):
			# NOTE: for c == 0, line[c - 1] wraps around to the last
			# character of the line; this matches the previous behaviour.
			if (
				c != last_index
				and character == ' '
				and line[c - 1] == ' '
				and line[c + 1] == ' '
			):
				character = random.choice(pool)
			pieces.append(character)

		# Fill the line on the right of the text
		if line:
			pieces.append(' ')
			pieces.extend(random.choice(pool) for _ in range(len(line), linelength))

		pieces.append('\n')

	return ''.join(pieces)

def char_swap(some_string):
	"""Replace typographic and accented characters by plain ASCII look-alikes.

	The replacements are applied one after another, in the order listed.
	Returns the converted string.
	"""
	replacements = {
		'–': '-',
		'“': '"',
		'”': '"',
		'ù': 'u',
		"’": "'",
		'à': 'a',
		'â': 'a',
		'é': 'e',
		'è': 'e',
		'î': 'i',
	}
	for fancy, plain in replacements.items():
		some_string = some_string.replace(fancy, plain)
	return some_string

def convert_to_figlet_font(string, linelength, font='shadow', alignment='left'):
	"""Render ``string`` as a figlet banner.

	The text is first cleaned with char_swap() (not all figlet fonts include
	the French characters), the marker '(edition vinyle)' is removed, and
	the result is wrapped to ``linelength`` characters. Each wrapped line is
	rendered by the external ``figlet`` program with an output width of
	``linelength * 6``, using ``font`` and the requested ``alignment``
	('left', 'right' or 'center').

	Returns the banner rows, each terminated by a newline; rows that hold
	only whitespace are left out.

	Raises KeyError for an unknown ``alignment``; errors from the figlet
	call propagate from subprocess.check_output().
	"""
	cleaned = char_swap(string).replace('(edition vinyle)', '') # For Javier's titles
	wrapped = insert_linebreaks(cleaned, linelength, type='wrap', double_linebreaks=False)

	alignment_flags = {
		'left': '-l',
		'right' : '-r',
		'center' : '-c'
	}
	flag = alignment_flags[alignment]
	output_width = str(linelength * 6)
	has_ink = re.compile(r'[^\s]')

	banner = ''
	for text_line in wrapped.split('\n'):
		command = ['figlet', text_line, '-w', output_width, '-n', '-f', font, '-p', flag]
		rendered = subprocess.check_output(command).decode() + '\n'

		# Do not include empty linebreaks in the figlet header
		banner += ''.join(
			row + '\n'
			for row in rendered.split('\n')
			if has_ink.search(row)
		)

	return banner

def align(string, linewidth, aligment='center'):
	"""Centre ``string`` inside ``linewidth`` columns.

	The same number of spaces, half of the free width rounded towards
	zero, is put on both sides. A string wider than ``linewidth`` is
	returned unchanged. ``aligment`` is accepted but not used.
	"""
	free_width = linewidth - len(string)
	padding = ' ' * int(free_width / 2)
	return ''.join((padding, string, padding))

def check_element(element):
	"""Turn one parsed element into its plain-text representation.

	Reads ``element.name`` and, where needed, ``element.text``:
	an 'hr' becomes a three-dash rule followed by a newline, an element
	without a name becomes an empty string, a 'b' has its text wrapped in
	angle brackets, and anything else yields its text unchanged.
	"""
	tag = element.name
	if tag == 'hr':
		return '-' * 3 + '\n'
	if tag is None:
		return ''
	if tag == 'b':
		return '<' + element.text + '>'
	return element.text

def add_headers(section_type, element):
	"""Render ``element`` as text, with section-specific header styling.

	``section_type`` is matched by substring, first match wins:

	* 'stories' / 'récits': an h2 is framed with dashes and blank lines; an
	  h3 is upper-cased, split over two lines and centred in 56 columns
	  with align(); any other element carrying a class produces an empty
	  string; everything else goes through check_element().
	* 'works': an element with the class 'lemmaheader' becomes a centred
	  figlet title (font 'ogre') on a sparse background of '0' symbols;
	  everything else goes through check_element().
	* 'glossary': an h2 is placed, upper-cased, inside a fixed shade
	  pattern; everything else goes through check_element().
	* any other section: check_element().

	The check_element() results get a trailing newline. Returns the
	rendered string.
	"""
	if 'stories' in section_type or 'récits' in section_type:
		if 'h2' in element.name:
			return '\n' + '--- ' + element.text + ' ---\n' + '\n'
		if 'h3' in element.name:
			title = element.text.upper()
			title = title.replace('STORIES ABOUT', 'STORIES\nABOUT')
			title = title.replace('RÉCITS CONTEXTUALISÉS AUTOUR', ' RÉCITS CONTEXTUALISÉS\nAUTOUR')
			centred = ''.join(align(row, 56) + '\n' for row in title.split('\n'))
			return centred + '\n\n'
		if element.get('class'):
			# Elements with a class (the 'toc' among them) add nothing here.
			# NOTE(review): non-'toc' classes are dropped as well - confirm
			# that this is intended.
			return ''
		return check_element(element) + '\n'

	if 'works' in section_type:
		# Figlet wrap width depends on the publication language.
		linewidth = 11 if language == 'en' else 10

		if not (element.get('class') and 'lemmaheader' in element['class']):
			return check_element(element) + '\n'

		blank_row = ' ' * 55 + '\n'
		banner = '\n' + blank_row + blank_row
		banner += convert_to_figlet_font(element.text, linewidth, font='ogre', alignment='center')
		banner += blank_row
		# One '0' among fifteen spaces keeps the background sparse.
		return insert_symbol_background(banner, 55, ['0'] + [' '] * 15, 0)

	if 'glossary' in section_type:
		if 'h2' in element.name:
			frame = '''\
              ░
    ░
 ░           ░  ░  ░
        ░  ░   ░     ░
          ░     ░
  ░            ░
   ░    {}   ░
   ░
░       ░          ░
  ░    ░         ░  ░
              ░
      ░
 ░
'''
			return frame.format(element.text.upper()) + '\n'
		return check_element(element) + '\n'

	return check_element(element) + '\n'


def apply_zigzag(string, pattern_width):
    """Indent the lines of ``string`` in a zigzag.

    The indentation starts at 0 and grows by one space per line. The line
    printed with more than ``pattern_width`` spaces turns the direction
    around, after which the indentation shrinks by one per line. The line
    reached with a negative counter (printed without indentation) turns it
    around again.

    Returns the indented lines, each terminated by a newline.
    """
    indent = 0
    step = 1
    shifted = []
    for row in string.split('\n'):
        # A negative indent multiplies to an empty prefix.
        shifted.append(' ' * indent + row + '\n')

        # Turn around once the counter has left the [0, pattern_width] band.
        if step == 1 and indent > pattern_width:
            step = -1
        elif step == -1 and indent < 0:
            step = 1
        indent += step
    return ''.join(shifted)

# def text_to_pattern(string, template):
# 	template = template.split('\n')
# 	character_position = 0
# 	new = ''
# 	for line_number, line in enumerate(template):
# 		for character in line:
# 			if character == '░':
# 				new += string[character_position]
# 				character_position += 1
# 			else:
# 				new += ' '
# 		new += '\n'
# 	return new

def counting_pattern(string, linelength):
	"""Build a growing-line pattern from the tokens of ``string``.

	The text is tokenized with the module-level ``tokenizer``. Before each
	token is processed, the line collected so far is written to the output
	followed by a newline, so every partial line shows up once per token.
	Newlines inside a token are doubled. The collected line is cleared when
	the running length counter (which starts at 1) reaches ``linelength``.

	The state after the last token is never written out. Returns the
	pattern as one string.
	"""
	snapshots = []
	running_line = ''
	used = 1
	for token in tokenizer.tokenize(string):
		snapshots.append(running_line + '\n')
		# The counter uses the token length before newlines are doubled.
		used += len(token)
		running_line += token.replace('\n', '\n\n')
		if used >= linelength:
			running_line = ''
			used = 1
	return ''.join(snapshots)

def insert_counters_page():
	"""Build a page covered with a repeating ' 123456789' ruler.

	The page has 69 rows of 109 characters. The ruler is not restarted at
	the beginning of a row, so it shifts from one row to the next. The
	result is passed through fill_page() before it is returned.
	"""
	row_count = 69
	column_count = 109
	ruler = ' 123456789'

	page = ''
	for row in range(row_count):
		# Position of the row's first character in the continuous stream.
		offset = row * column_count
		page += ''.join(
			ruler[(offset + column) % 10]
			for column in range(column_count)
		) + '\n'
	return fill_page(page)

def insert_pagenumbers(pages):
	"""Write page numbers on every 70th line of ``pages``.

	Each line whose 1-based position is a multiple of 70 ends a page. That
	line is replaced by the page number, indented by two spaces per page,
	except on the first page, on the very last line of the text, and from
	page 56 onwards.

	Returns all lines, each followed by a newline.
	"""
	rows = pages.split('\n')
	last_position = len(rows)

	page = 0
	stamped = []
	for position, row in enumerate(rows, start=1):
		if position % 70 == 0:
			page += 1
			unnumbered = page == 1 or position == last_position or page >= 56
			if not unnumbered:
				row = (' ' * page * 2) + str(page)
		stamped.append(row + '\n')
	return ''.join(stamped)

def sinus_jj():
	"""Build a page of digits spaced out along a sine wave.

	For x in 0 .. 110 * 70 - 1 the digit ``x % 10`` is written, followed by
	``int((sin(5 * x) + 1) * 10)`` spaces; that number is also printed to
	stdout. The resulting stream is cut into rows of 110 characters, of
	which the first 69 are kept.

	Returns the rows joined by newlines, with a trailing newline.
	"""
	line_width = 110
	line_height = 70

	pieces = []
	for x in range(line_width * line_height):
		gap = int((sin(5 * x) + 1) * 10)
		print(gap)
		pieces.append(str(x % 10) + (' ' * gap))
	out = ''.join(pieces)

	# Cut the stream into rows; the empty entry mirrors what splitting the
	# newline-terminated rows would leave at the end.
	rows = [out[start:start + line_width] for start in range(0, len(out), line_width)]
	rows.append('')

	return '\n'.join(rows[:69]) + '\n'