data-workers-publication/functions.py


								#! /etc/bin/python3


								import random, re, subprocess


								from hyphen import Hyphenator

								import textwrap

								from textwrap2 import fill


								import nltk

								from nltk.tokenize import RegexpTokenizer

								# Shared tokenizer, used by selfwritten_linebreaks() and counting_pattern().
								# The pattern has three alternatives: any single character followed by a run
								# of word characters and one more character of any kind; a run of word
								# characters anchored at `^`; or a run of word characters anchored at `$`.
								tokenizer = RegexpTokenizer(r'[\s\W\w]\w+[\s\W\w\.]|^\w+|\w+$') # initialize tokenizer

								# Publication language. insert_linebreaks() uses it to pick the hyphenation
								# dictionary ('en' or 'fr') and add_headers() uses it to pick the figlet
								# wrap width. Swap the two lines below to switch language.
								# language = 'fr'
								language = 'en'


								def selfwritten_linebreaks(string, linelength):

									count = 1

									tmp = ''

									new = ''

									if not 'http' in string:

										string = tokenizer.tokenize(string)

									for line_number, word in enumerate(string):

										count += len(word)

										if tmp == '':

											if word[0] == ' ':

												word = word[1:]

											if word == ' ':

												continue

										if line_number == len(string) - 1:

											tmp += word

											new += tmp

										elif count < linelength:

											tmp += word

										else:

											tmp += word

											new += tmp + '\n'

											tmp = ''

											count = 1

									return new


								def insert_linebreaks(string, linelength, type='character', double_linebreaks=False):
									"""Break ``string`` into lines using one of three strategies.

									Parameters:
										string: the text to wrap.
										linelength: the target line width.
										type: ``'word'`` wraps each paragraph with the hyphenating
											``fill()`` and falls back to selfwritten_linebreaks() when
											that raises; ``'wrap'`` wraps each paragraph with
											``textwrap.wrap``; ``'character'`` (default) cuts the text
											into fixed-size pieces regardless of word boundaries.
										double_linebreaks: in ``'word'`` and ``'wrap'`` mode, separate
											paragraphs with an empty line instead of a single newline.

									Returns:
										The wrapped text without a trailing newline, or ``None`` when
										``type`` is not one of the three known values.

									In ``'character'`` mode each line holds ``linelength - 1``
									characters, the width the original loop produced. Earlier versions
									dropped the character at every break and lost the trailing partial
									line; every character is now kept.
									"""
									# Paragraphs are separated by one newline, or by an empty line on request.
									separator = '\n\n' if double_linebreaks else '\n'

									if type == 'word':
										# NOTE: for any other language `hyphenator` stays unbound; the
										# resulting error is caught below and triggers the fallback too.
										if language == 'en':
											hyphenator = Hyphenator('en_US')
										if language == 'fr':
											hyphenator = Hyphenator('fr_FR')

										wrapped = []
										for paragraph in string.split('\n'):
											try:
												tmp = fill(paragraph, width=linelength, use_hyphenator=hyphenator)
											except Exception as e:
												print('Error:', e)
												print('>>> Hyphenator didn\'t work, selfwritten_linebreaks used instead.')
												tmp = selfwritten_linebreaks(paragraph, linelength-3) # Calibration
											wrapped.append(tmp)
										# The last paragraph never gets a separator after it
										return separator.join(wrapped)

									if type == 'wrap':
										return separator.join(
											'\n'.join(textwrap.wrap(paragraph, width=linelength))
											for paragraph in string.split('\n')
										)

									if type == 'character':
										# Keep the historical width of linelength - 1, but never go
										# below 1 so the slicing step stays valid.
										width = max(1, linelength - 1)
										return '\n'.join(
											string[start:start + width]
											for start in range(0, len(string), width)
										)


								def fill_page(string, lines_per_page=70):
									"""Pad ``string`` with newlines so it fills a whole number of pages.

									Parameters:
										string: the page content, lines separated by ``'\\n'``.
										lines_per_page: number of lines on one page (default 70).

									Returns:
										``string`` followed by enough newlines to make its line count
										a multiple of ``lines_per_page``, plus one final newline.

									Raises:
										ValueError: if ``lines_per_page`` is smaller than 1.

									Progress information is printed to stdout.
									"""
									if lines_per_page < 1:
										raise ValueError('lines_per_page must be at least 1')

									print('--- fill_page() starts ---')
									total_lines = len(string.split('\n'))
									print('    total_lines   :', total_lines)
									total_pages, leftover = divmod(total_lines, lines_per_page)
									print('    total_pages   :', total_pages)
									print('    full_pages    :', lines_per_page * total_pages)

									# Lines missing on the last, partially filled page (0 when it is full)
									fill_up_lines = lines_per_page - leftover if leftover else 0
									print('    fill_up_lines :', fill_up_lines)

									# Padding always yields an exact multiple of lines_per_page, so
									# nothing ever has to be cut off afterwards.
									page = string + ('\n' * fill_up_lines)

									print('    page(s) length:', total_lines + fill_up_lines)
									print('--- fill_page() ends ---')
									return page + '\n'


								def insert_text_block(string, inserted, left, width):
									"""Place ``inserted`` as a column to the right of ``string``.

									Parameters:
										string: text of the left column, lines separated by ``'\\n'``.
										inserted: text of the right column, lines separated by ``'\\n'``.
										left: column (in characters) at which the right column starts.
										width: accepted for interface compatibility; not used.

									Returns:
										The merged text without a trailing newline. Rows that have a
										right-column line consist of the left line fitted to exactly
										``left`` characters followed by the right line. Rows without a
										right-column line are the unchanged left line.

									A left line that is as wide as or wider than ``left`` used to be
									replaced by blanks. It is now kept and cut at ``left`` characters so
									the right column still lines up.
									"""
									left_column_lines = string.split('\n')
									right_column_lines = inserted.split('\n')
									column = max(left, 0)

									merged = []
									for line_number in range(max(len(left_column_lines), len(right_column_lines))):
										# Past the end of the left column there is only padding
										if line_number < len(left_column_lines):
											left_line = left_column_lines[line_number]
										else:
											left_line = ''

										if line_number < len(right_column_lines):
											# Fit the left line to the column width, then append the right line
											merged.append(left_line[:column].ljust(column) + right_column_lines[line_number])
										else:
											# No right-column line left: keep the left line untouched
											merged.append(left_line)

									return '\n'.join(merged)


								def insert_symbol_background(string, linelength, symbols, multiplier):
									"""Scatter random symbols over the blank areas of ``string``.

									Parameters:
										string: the text, lines separated by ``'\\n'``.
										linelength: column up to which each non-empty line is filled.
										symbols: sequence of candidate symbols to pick from at random.
										multiplier: for line number ``x`` (1-based),
											``int(x * multiplier)`` extra spaces are added to the
											candidates, and they accumulate from line to line, so the
											symbols thin out further down.

									Returns:
										The decorated text; every line, including the last, ends
										with ``'\\n'``.

									Within a line, a space whose two neighbours are both spaces is
									replaced by a random candidate. Each non-empty line is then followed
									by one space and random candidates up to ``linelength``. Empty lines
									are left empty. ``symbols`` is copied first, so a list passed by the
									caller is no longer extended in place.
									"""
									# Work on a private copy: `symbols += ...` used to grow the caller's list
									pool = list(symbols)
									out = []

									for x, line in enumerate(string.split('\n'), start=1):
										# Apply the multiplier, to create a gradient effect :)
										pool.extend(' ' * int(x * multiplier))

										last = len(line) - 1
										for c, character in enumerate(line):
											# The last character is always kept. For c == 0 the
											# "previous" character is line[-1], i.e. the last one.
											if c != last and line[c-1] == ' ' and line[c+1] == ' ':
												character = character.replace(' ', random.choice(pool))
											out.append(character)

										# Fill the line on the right of the text
										if line:
											out.append(' ')
											out.extend(random.choice(pool) for _ in range(len(line), linelength))

										out.append('\n')

									return ''.join(out)


								def char_swap(some_string):
									"""Return ``some_string`` with a fixed set of typographic and accented
									characters replaced by plain ASCII look-alikes."""
									# Every swap is one character for one character, so a single
									# translation table covers them all.
									replacements = str.maketrans({
										'–': '-',
										'“': '"',
										'”': '"',
										'ù': 'u',
										"’": "'",
										'à': 'a',
										'â': 'a',
										'é': 'e',
										'è': 'e',
										'î': 'i',
									})
									return some_string.translate(replacements)


								def convert_to_figlet_font(string, linelength, font='shadow', alignment='left'):
									"""Render ``string`` as a figlet banner.

									The text is first passed through char_swap(), the marker
									``'(edition vinyle)'`` is removed, and the result is wrapped to
									``linelength`` characters. Each wrapped line is rendered by the
									external ``figlet`` program with the given ``font``, an output width
									of ``linelength * 6`` and the flag matching ``alignment``
									(``'left'``, ``'right'`` or ``'center'``). Output rows that contain
									only whitespace are dropped.

									Returns the remaining rows, each terminated by ``'\\n'``.
									"""
									alignment_flags = {
										'left': '-l',
										'right' : '-r',
										'center' : '-c'
									}

									# remove French characters in figlet titles (not all fonts include them...)
									cleaned = char_swap(string)
									# For Javier's titles
									cleaned = cleaned.replace('(edition vinyle)', '')
									wrapped = insert_linebreaks(cleaned, linelength, type='wrap', double_linebreaks=False)

									rendered = []
									for text_line in wrapped.split('\n'):
										command = ['figlet', text_line, '-w', str(linelength * 6), '-n', '-f', font, '-p', alignment_flags[alignment]]
										banner = subprocess.check_output(command).decode()

										# Do not include empty linebreaks in the figlet header
										for row in banner.split('\n'):
											if re.search(r'[^\s]', row):
												rendered.append(row + '\n')

									return ''.join(rendered)


								def align(string, linewidth, aligment='center'):
									"""Pad ``string`` with spaces inside a line of ``linewidth`` characters.

									Parameters:
										string: the text to position.
										linewidth: the width of the line the text is placed in.
										aligment: ``'left'`` pads on the right, ``'right'`` pads on the
											left; any other value (default ``'center'``) puts the same
											margin on both sides.

									Returns:
										The padded text. When centring an odd amount of free space the
										result is one character shorter than ``linewidth``. Text that
										is already wider than ``linewidth`` is returned unchanged.

									The ``aligment`` argument used to be ignored; centring is still the
									behaviour for every value except ``'left'`` and ``'right'``.
									"""
									# Multiplying a string by a negative number gives '', so clamp for clarity
									free = max(linewidth - len(string), 0)

									if aligment == 'left':
										return string + (' ' * free)
									if aligment == 'right':
										return (' ' * free) + string

									margin = free // 2
									return (' ' * margin) + string + (' ' * margin)


								def check_element(element):
									"""Return the plain-text rendering of one element.

									``hr`` becomes a three-dash rule followed by a newline, an element
									without a name becomes an empty string, ``b`` has its text wrapped
									in angle brackets, and anything else yields its text unchanged.
									"""
									tag = element.name

									if tag == 'hr':
										return ('-' * 3) + '\n'
									if tag is None:
										return ''
									if tag == 'b':
										return '<' + element.text + '>'
									return element.text


								def add_headers(section_type, element):
									"""Return the text for ``element``, decorated according to its section.

									``section_type`` is matched by substring:

									* ``'stories'`` / ``'récits'``: an ``h2`` is framed as
									  ``--- text ---`` between empty lines; an ``h3`` is upper-cased,
									  split at fixed phrases and each part centred with
									  ``align(line, 56)``; an element carrying a ``class`` contributes
									  nothing; everything else goes through check_element().
									* ``'works'``: an element whose classes include ``'lemmaheader'``
									  becomes a figlet banner (font ``ogre``) on a 55-wide symbol
									  background; everything else goes through check_element().
									* ``'glossary'``: an ``h2`` is upper-cased and placed inside a fixed
									  pattern of ``░`` characters; everything else goes through
									  check_element().
									* any other section: check_element().

									NOTE(review): ``element`` is presumably a BeautifulSoup-style node
									exposing ``.name``, ``.text``, ``.get()`` and item access - confirm
									against the caller.
									"""
									string = ''
									# print('  ----> element:', element)

									if 'stories' in section_type or 'récits' in section_type:
										if 'h2' in element.name:
											string += '\n'
											string += '--- ' + element.text + ' ---\n'
											# string += '^' * len(element.text)
											string += '\n'
										elif 'h3' in element.name:
											# Force a line break inside the two known long section titles
											header = element.text.upper().replace('STORIES ABOUT', 'STORIES\nABOUT').replace('RÉCITS CONTEXTUALISÉS AUTOUR', ' RÉCITS CONTEXTUALISÉS\nAUTOUR').split('\n')
											for line in header:
												string += align(line, 56) + '\n'
											string += '\n\n'
										elif element.get('class'):
											# Any element that carries a class adds nothing here,
											# whether or not 'toc' is among its classes.
											if 'toc' in element['class']:
												pass
										else:
											string += check_element(element) + '\n'

									elif 'works' in section_type:
										# Wrap width handed to convert_to_figlet_font()
										if language == 'en':
											linewidth = 11
										else:
											linewidth = 10

										if element.get('class'):
											if 'lemmaheader' in element['class']:
												# Lines of 55 spaces (rather than empty lines) surround
												# the banner; insert_symbol_background() leaves empty lines empty.
												tmp_string = '\n'
												tmp_string += ' ' * 55 + '\n'
												tmp_string += ' ' * 55 + '\n'
												tmp_string += convert_to_figlet_font(element.text, linewidth, font='ogre', alignment='center')
												tmp_string += ' ' * 55 + '\n'
												# Replaces (does not extend) whatever `string` held so far
												string = insert_symbol_background(tmp_string, 55, ['0', ' ', ' ', ' ',' ', ' ', ' ',' ', ' ', ' ',' ',' ', ' ', ' ', ' ', ' '], 0)
											else:
												string += check_element(element) + '\n'
										else:
											string += check_element(element) + '\n'

									elif 'glossary' in section_type:
										if 'h2' in element.name:
											# The literal below is whitespace-sensitive; `{}` receives the upper-cased title
											string += '''\

								              ░

								    ░

								 ░           ░  ░  ░

								        ░  ░   ░     ░

								          ░     ░

								  ░            ░

								   ░    {}   ░

								   ░

								░       ░          ░

								  ░    ░         ░  ░

								              ░

								      ░

								 ░

								'''.format(element.text.upper())
											string += '\n'
										else:
											string += check_element(element) + '\n'
									else:
										string += check_element(element) + '\n'

									return string


								def apply_zigzag(string, pattern_width):
								    """Indent the lines of ``string`` in a zigzag.

								    The indent starts at 0 and grows by one space per line. The first
								    line whose indent exceeds ``pattern_width`` turns the direction
								    around, and the indent shrinks by one per line. The first line whose
								    indent would be negative (rendered without indent) turns it around
								    again. Every output line, including the last, ends with ``'\\n'``.
								    """
								    indent = 0
								    step = 1  # +1 while moving right, -1 while moving left
								    shifted = []

								    for line in string.split('\n'):
								        # A negative indent multiplies to an empty string
								        shifted.append((' ' * indent) + line + '\n')

								        # Reverse once the indent has overshot either end
								        if step == 1 and indent > pattern_width:
								            step = -1
								        elif step == -1 and indent < 0:
								            step = 1

								        indent += step

								    return ''.join(shifted)


								# def text_to_pattern(string, template):

								# 	template = template.split('\n')

								# 	character_position = 0

								# 	new = ''

								# 	for line_number, line in enumerate(template):

								# 		for character in line:

								# 			if character == '░':

								# 				new += string[character_position]

								# 				character_position += 1

								# 			else:

								# 				new += ' '

								# 		new += '\n'

								# 	return new


								def counting_pattern(string, linelength):
									"""Build a pattern of growing lines from the tokens of ``string``.

									The text is split with the module-level ``tokenizer``. Before each
									token is handled, the line accumulated so far is written out followed
									by a newline. The token is then added to that line, with every
									newline inside it doubled. Once the running count (which starts at 1
									and grows by the original token length) reaches ``linelength``, the
									accumulated line and the count are reset.
									"""
									snapshots = []
									running = ''
									width = 1

									for word in tokenizer.tokenize(string):
										# Emit the line as it stood before this token
										snapshots.append(running + '\n')

										# The count uses the token length before newlines are doubled
										width += len(word)
										running += word.replace('\n', '\n\n')

										if width >= linelength:
											running = ''
											width = 1

									return ''.join(snapshots)


								def insert_counters_page():
									"""Return a page covered with a repeating counter.

									The page has 69 rows of 109 characters. Read as one continuous
									stream across the rows, the characters repeat the ten-character
									sequence of a space followed by the digits 1 to 9. Each row ends with
									a newline and the result is passed through fill_page().
									"""
									ruler = ' 123456789'
									rows = []
									position = 0  # index into the continuous stream at the start of a row

									for _ in range(69):
										rows.append(''.join(ruler[(position + offset) % 10] for offset in range(109)))
										position += 109

									return fill_page(''.join(row + '\n' for row in rows))


								def insert_pagenumbers(pages):
									"""Write page numbers onto every 70th line of ``pages``.

									Each line whose 1-based number is a multiple of 70 closes a page.
									From the second page on, that line is replaced by the page number,
									indented by two spaces per page. This is skipped when the line is the
									very last one or the page number is 56 or higher. Every output line,
									including the last, ends with ``'\\n'``.
									"""
									rows = pages.split('\n')
									row_total = len(rows)
									current_page = 0

									# Visit only the closing line of each page
									for position in range(70, row_total + 1, 70):
										current_page += 1
										if current_page != 1 and position != row_total and current_page < 56:
											rows[position - 1] = (' ' * current_page * 2) + str(current_page)

									return ''.join(row + '\n' for row in rows)