data-workers-publication/functions.py

#!/usr/bin/env python3

import random, re, subprocess

from hyphen import Hyphenator
import textwrap
from textwrap2 import fill

import nltk
from nltk.tokenize import RegexpTokenizer
# Shared tokenizer used by selfwritten_linebreaks() and counting_pattern().
# The pattern has three alternatives: any single character followed by one or
# more word characters and one more character of any kind; a run of word
# characters at the very start of the text; or a run of word characters at
# the very end of the text.
# NOTE(review): text matching none of the alternatives (e.g. a run of
# punctuation) presumably produces no token and is then missing from the
# wrapped output -- confirm against nltk's RegexpTokenizer behaviour.
tokenizer = RegexpTokenizer(r'[\s\W\w]\w+[\s\W\w\.]|^\w+|\w+$') # initialize tokenizer

# Publication language. It selects the hyphenation dictionary in
# insert_linebreaks() and the figlet wrap width in add_headers().
# Switch language by swapping which of the two assignments is commented out.
# language = 'fr'
language = 'en'

def selfwritten_linebreaks(string, linelength):
	"""Wrap ``string`` into lines of roughly ``linelength`` characters.

	Fallback wrapper used when the hyphenating wrapper fails. The text is
	split into tokens with the module-level ``tokenizer``; text containing
	'http' is not tokenized and is processed character by character instead.
	Tokens are appended to the current line until the running length
	(which starts at 1) reaches ``linelength``; the token that crosses the
	limit still ends up on the current line, so lines may be slightly longer
	than ``linelength``.

	Returns the wrapped text; the last line carries no trailing linebreak.
	"""
	# Iterating over a str yields single characters, which is what happens
	# for text that contains a URL.
	pieces = string if 'http' in string else tokenizer.tokenize(string)
	last_index = len(pieces) - 1

	finished = []
	current = ''
	used = 1
	for index, piece in enumerate(pieces):
		# The length is counted before any leading space is stripped.
		used += len(piece)
		if not current:
			# A fresh line should not start with a space.
			if piece[0] == ' ':
				piece = piece[1:]
			if piece == ' ':
				continue
		current += piece
		if index == last_index:
			finished.append(current)
		elif used >= linelength:
			finished.append(current + '\n')
			current = ''
			used = 1
	return ''.join(finished)

def insert_linebreaks(string, linelength, type='character', double_linebreaks=False):
	"""Break ``string`` into lines of at most ``linelength`` characters.

	Parameters:
	  string            -- the text to wrap.
	  linelength        -- maximum number of characters per line.
	  type              -- wrapping mode:
	      'character' : cut the text every ``linelength`` characters, ignoring
	                    word boundaries and existing linebreaks.
	      'wrap'      : wrap every newline-separated paragraph with
	                    textwrap.wrap().
	      'word'      : wrap every paragraph with textwrap2's fill() and a
	                    hyphenator for the module-level ``language``; if that
	                    fails, the paragraph is wrapped with
	                    selfwritten_linebreaks() instead.
	  double_linebreaks -- in 'word' and 'wrap' mode, separate paragraphs by an
	                       empty line instead of a single linebreak.

	Returns the wrapped text. The last paragraph/line never gets a trailing
	linebreak.

	Raises ValueError for an unknown ``type`` or, in 'character' mode, for a
	``linelength`` smaller than 1.
	"""
	if type == 'character':
		if linelength < 1:
			raise ValueError('linelength must be at least 1, got %r' % (linelength,))
		# Slice instead of counting by hand, so that no character is lost at
		# the wrap points and the final, shorter line is kept as well.
		chunks = [string[start:start + linelength] for start in range(0, len(string), linelength)]
		return '\n'.join(chunks)

	if type not in ('word', 'wrap'):
		raise ValueError("unknown linebreak type %r (expected 'character', 'word' or 'wrap')" % (type,))

	hyphenator = None
	if type == 'word':
		if language == 'en':
			hyphenator = Hyphenator('en_US')
		elif language == 'fr':
			hyphenator = Hyphenator('fr_FR')

	wrapped_paragraphs = []
	for paragraph in string.split('\n'):
		if type == 'wrap':
			wrapped = '\n'.join(textwrap.wrap(paragraph, width=linelength))
		else:
			try:
				if hyphenator is None:
					raise ValueError('no hyphenation dictionary configured for language %r' % (language,))
				wrapped = fill(paragraph, width=linelength, use_hyphenator=hyphenator)
			except Exception as e:
				# Best effort: any hyphenation problem falls back to the
				# home-made wrapper instead of aborting the whole layout.
				print('Error:', e)
				print('>>> Hyphenator didn\'t work, selfwritten_linebreaks used instead.')
				wrapped = selfwritten_linebreaks(paragraph, linelength-3) # Calibration
		wrapped_paragraphs.append(wrapped)

	# No separator after the last paragraph on the page.
	separator = '\n\n' if double_linebreaks else '\n'
	return separator.join(wrapped_paragraphs)

def fill_page(string, lines_per_page=70):
	"""Pad ``string`` with empty lines so that it fills whole pages.

	Parameters:
	  string         -- the page content, lines separated by linebreaks.
	  lines_per_page -- number of lines on one page (70 by default).

	Returns ``string`` followed by as many linebreaks as are needed to make
	the number of lines a multiple of ``lines_per_page``, plus one closing
	linebreak. Text that already fills its pages exactly only receives the
	closing linebreak.

	Progress information is printed to stdout.

	Raises ValueError if ``lines_per_page`` is smaller than 1.
	"""
	if lines_per_page < 1:
		raise ValueError('lines_per_page must be at least 1, got %r' % (lines_per_page,))

	print('--- fill_page() starts ---')
	total_lines = len(string.split('\n'))
	print('    total_lines   :', total_lines)
	total_pages = total_lines // lines_per_page
	print('    total_pages   :', total_pages)
	full_pages_lines = lines_per_page * total_pages
	print('    full_pages    :', full_pages_lines)

	# Lines on the last, partially filled page (0 when everything fits).
	leftover_lines = total_lines - full_pages_lines
	fill_up_lines = lines_per_page - leftover_lines if leftover_lines else 0
	print('    fill_up_lines :', fill_up_lines)

	# total_lines + fill_up_lines is a multiple of lines_per_page by
	# construction, so the padded text never has to be cut afterwards.
	page = string + ('\n' * fill_up_lines)

	print('    page(s) length:', len(page.split('\n')))
	print('--- fill_page() ends ---')
	return page + '\n'

def insert_text_block(string, inserted, left, width):
	"""Place the lines of ``inserted`` to the right of the lines of ``string``.

	Parameters:
	  string   -- text of the left column, lines separated by linebreaks.
	  inserted -- text of the right column, lines separated by linebreaks.
	  left     -- column (in characters) at which the right column starts.
	  width    -- unused; kept so that existing calls keep working.

	For every line that has a right-column line, the left-column line is
	padded with spaces up to ``left`` characters (or cut at ``left``
	characters when it is longer) and the right-column line is appended.
	Left-column lines without a right-column partner are passed through
	unchanged; right-column lines without a left-column partner are indented
	by ``left`` spaces.

	Returns the merged text without a trailing linebreak.
	"""
	left_column_lines = string.split('\n')
	right_column_lines = inserted.split('\n')
	# A negative offset behaves like 0 (no left column at all).
	left = max(left, 0)

	merged = []
	for line_number in range(max(len(left_column_lines), len(right_column_lines))):
		if line_number < len(left_column_lines):
			left_text = left_column_lines[line_number]
		else:
			left_text = ''

		if line_number < len(right_column_lines):
			# Keep the left text (cut at the column border) instead of
			# blanking lines that are exactly ``left`` characters or wider.
			merged.append(left_text[:left].ljust(left) + right_column_lines[line_number])
		else:
			merged.append(left_text)
	return '\n'.join(merged)

def insert_symbol_background(string, linelength, symbols, multiplier):
	"""Scatter random symbols over the empty areas of a block of text.

	Parameters:
	  string     -- the text, lines separated by linebreaks.
	  linelength -- width up to which every non-empty line is filled.
	  symbols    -- str or sequence of strings to pick the symbols from.
	                The argument itself is never modified.
	  multiplier -- for line number x (counting from 1), int(x * multiplier)
	                extra spaces are added to the symbol pool, so symbols
	                get rarer further down (a gradient effect).

	A space is replaced by a random symbol when both of its neighbours are
	spaces; the last character of a line is never replaced. After the last
	character of a non-empty line one space is added and the line is filled
	with random symbols up to ``linelength``. Empty lines stay empty.

	Returns the decorated text; every line, including the last one, ends
	with a linebreak.
	"""
	# Private copy: growing the pool must not leak into the caller's list.
	pool = list(symbols)
	pieces = []

	for line_number, line in enumerate(string.split('\n')):
		# Apply the multiplier, to create a gradient effect :)
		pool.extend(' ' * int((line_number + 1) * multiplier))

		last = len(line) - 1
		for c, character in enumerate(line):
			# For the first character, line[c - 1] wraps around to the last
			# character of the line.
			if c != last and line[c - 1] == ' ' and line[c + 1] == ' ':
				character = character.replace(' ', random.choice(pool))
			pieces.append(character)

		# Fill the line on the right of the text
		if line:
			pieces.append(' ')
			pieces.extend(random.choice(pool) for _ in range(len(line), linelength))

		pieces.append('\n')

	return ''.join(pieces)

def char_swap(some_string):
	"""Return ``some_string`` with typographic and accented characters
	replaced by plain ASCII look-alikes (dash, quotes, apostrophe and a few
	French accented vowels)."""
	replacements = {
		'–': '-',
		'“': '"',
		'”': '"',
		'ù': 'u',
		'’': "'",
		'à': 'a',
		'â': 'a',
		'é': 'e',
		'è': 'e',
		'î': 'i',
	}
	return some_string.translate(str.maketrans(replacements))

def convert_to_figlet_font(string, linelength, font='shadow', alignment='left'):
	"""Render ``string`` as a figlet banner.

	The text is first cleaned with char_swap() (not all figlet fonts include
	French characters), the marker '(edition vinyle)' is removed, and the
	result is wrapped to ``linelength`` characters. Every wrapped line is
	rendered by the external ``figlet`` program with the given ``font`` and
	``alignment`` ('left', 'right' or 'center') at an output width of
	``linelength * 6``.

	Returns the banner rows, each followed by a linebreak; rows that contain
	only whitespace are left out.
	"""
	alignment_flags = {
		'left': '-l',
		'right' : '-r',
		'center' : '-c'
	}
	cleaned = char_swap(string).replace('(edition vinyle)', '') # For Javier's titles
	wrapped = insert_linebreaks(cleaned, linelength, type='wrap', double_linebreaks=False)

	banner_rows = []
	for text_line in wrapped.split('\n'):
		command = ['figlet', text_line, '-w', str(linelength * 6), '-n', '-f', font, '-p', alignment_flags[alignment]]
		rendered = subprocess.check_output(command).decode() + '\n'
		# Do not include empty linebreaks in the figlet header
		banner_rows.extend(row + '\n' for row in rendered.split('\n') if re.search(r'[^\s]', row))

	return ''.join(banner_rows)

def align(string, linewidth, aligment='center'):
	"""Pad ``string`` with spaces to position it in a line of ``linewidth``.

	Parameters:
	  string    -- the text to position.
	  linewidth -- the width of the line, in characters.
	  aligment  -- 'center' (default), 'left' or 'right'. Any other value is
	               treated as 'center'.

	For 'center' the same margin is added on both sides, so the result is
	one character shorter than ``linewidth`` when the free space is odd.
	For 'left' and 'right' the text is padded to exactly ``linewidth``.
	Text that is longer than ``linewidth`` is returned unchanged.
	"""
	free_space = linewidth - len(string)
	if aligment == 'left':
		return string + (' ' * free_space)
	if aligment == 'right':
		return (' ' * free_space) + string
	margin = int(free_space / 2)
	return (' ' * margin) + string + (' ' * margin)

def check_element(element):
	"""Return the plain-text rendering of ``element``.

	``element`` needs a ``name`` attribute and, for anything but 'hr' and
	nameless elements, a ``text`` attribute:
	  'hr'        -> a rule of three dashes plus a linebreak
	  no name     -> an empty string
	  'b'         -> the text wrapped in angle brackets
	  other names -> the text as it is
	"""
	tag = element.name
	if tag == 'hr':
		return ('-' * 3) + '\n'
	if tag is None:
		return ''
	if tag == 'b':
		return ''.join(('<', element.text, '>'))
	return element.text

def add_headers(section_type, element):
	"""Render one element of a section as text, with section-specific headers.

	Parameters:
	  section_type -- searched with ``in`` for 'stories', 'récits', 'works'
	                  and 'glossary' (presumably the section's name or class
	                  list -- TODO confirm against the caller).
	  element      -- object offering ``name``, ``text``, ``get('class')``
	                  and ``['class']`` (presumably a BeautifulSoup tag --
	                  TODO confirm).

	Returns the text for this element. Elements without special treatment are
	rendered with check_element() followed by a linebreak.

	NOTE(review): ``'h2' in element.name`` raises TypeError when
	``element.name`` is None, although check_element() handles such elements
	-- verify that callers only pass named elements for the story and
	glossary sections.
	"""
	string = ''
	# print('  ----> element:', element)

	# Story sections (English or French title)
	if 'stories' in section_type or 'récits' in section_type:
		if 'h2' in element.name:
			# Sub-header: '--- title ---' between two empty lines
			string += '\n'
			string += '--- ' + element.text + ' ---\n'
			# string += '^' * len(element.text)
			string += '\n'
		elif 'h3' in element.name:
			# Section title: upper case, broken into two lines at a fixed
			# phrase, each line centered with align() for a width of 56
			header = element.text.upper().replace('STORIES ABOUT', 'STORIES\nABOUT').replace('RÉCITS CONTEXTUALISÉS AUTOUR', ' RÉCITS CONTEXTUALISÉS\nAUTOUR').split('\n')
			for line in header:
				string += align(line, 56) + '\n'
			string += '\n\n'
		elif element.get('class'):
			# NOTE(review): every element that has a class lands here, but
			# only 'toc' is handled (skipped); elements with another class
			# produce no output either -- confirm that this is intended.
			if 'toc' in element['class']:
				pass
		else:
			string += check_element(element) + '\n'

	# Works sections
	elif 'works' in section_type:
		# Wrap width (in characters) handed to convert_to_figlet_font()
		if language == 'en':
			linewidth = 11
		else:
			linewidth = 10

		if element.get('class'):
			if 'lemmaheader' in element['class']:
				# Title as a centered figlet banner in the 'ogre' font,
				# framed by 55-character blank lines ...
				tmp_string = '\n'
				tmp_string += ' ' * 55 + '\n'
				tmp_string += ' ' * 55 + '\n'
				tmp_string += convert_to_figlet_font(element.text, linewidth, font='ogre', alignment='center')
				tmp_string += ' ' * 55 + '\n'
				# ... then sprinkled with '0' symbols (1 '0' among 15 spaces
				# in the pool). This assigns rather than appends; ``string``
				# is still empty at this point.
				string = insert_symbol_background(tmp_string, 55, ['0', ' ', ' ', ' ',' ', ' ', ' ',' ', ' ', ' ',' ',' ', ' ', ' ', ' ', ' '], 0)
			else:
				string += check_element(element) + '\n'
		else:
			string += check_element(element) + '\n'

	# Glossary sections
	elif 'glossary' in section_type:
		if 'h2' in element.name:
			# Upper-cased title placed at the {} inside a fixed cloud of
			# '░' characters; the whitespace of the template is significant.
			string += '''\
              ░    
    ░     
 ░           ░  ░  ░ 
        ░  ░   ░     ░ 
          ░     ░
  ░            ░
   ░    {}   ░
   ░          
░       ░          ░
  ░    ░         ░  ░ 
              ░  
      ░        
 ░  
'''.format(element.text.upper())
			string += '\n'
		else:
			string += check_element(element) + '\n'
	# Any other section: plain rendering
	else:
		string += check_element(element) + '\n'

	return string


def apply_zigzag(string, pattern_width):
    """Indent the lines of ``string`` in a zigzag pattern.

    The indentation starts at 0 and grows by one space per line. Once it has
    exceeded ``pattern_width`` it shrinks by one space per line; once it has
    dropped below zero (which renders as no indentation) it grows again.

    Returns the indented text; every line, including the last one, ends with
    a linebreak.
    """
    indented = []
    indent = 0
    forward = True
    for line in string.split('\n'):
        # A negative indent multiplies to an empty string.
        indented.append((' ' * indent) + line + '\n')
        # Turn around after the line on which a limit has been overshot.
        if forward and indent > pattern_width:
            forward = False
        elif not forward and indent < 0:
            forward = True
        indent += 1 if forward else -1
    return ''.join(indented)

# def text_to_pattern(string, template):
# 	template = template.split('\n')
# 	character_position = 0
# 	new = ''
# 	for line_number, line in enumerate(template):
# 		for character in line:
# 			if character == '░':
# 				new += string[character_position]
# 				character_position += 1
# 			else:
# 				new += ' '
# 		new += '\n'
# 	return new

def counting_pattern(string, linelength):
	"""Build a pattern of growing lines out of the tokens of ``string``.

	The text is split with the module-level ``tokenizer``. Before each token
	is processed, the line collected so far is written to the pattern,
	followed by a linebreak; the token is then added to that line, with every
	linebreak inside it doubled. When the running length (which starts at 1
	and counts the tokens before the doubling) reaches ``linelength``, the
	collected line is emptied and the count starts over.

	Returns the pattern. The line as it stands after the last token is not
	part of it.
	"""
	rows = []
	current = ''
	used = 1
	for token in tokenizer.tokenize(string):
		rows.append(current + '\n')
		used += len(token)
		current += token.replace('\n', '\n\n')
		if used >= linelength:
			current = ''
			used = 1
	return ''.join(rows)

def insert_counters_page():
	"""Return a page filled with a repeating counter pattern.

	The page consists of 69 lines of 109 characters. The characters run
	through the sequence ' 123456789' over and over; the sequence continues
	across the linebreaks instead of restarting on every line. The result is
	passed through fill_page() to complete the page.
	"""
	sequence = ' 123456789'
	columns = 109
	rows = []
	for row in range(69):
		# Position in the endless sequence at which this line starts.
		offset = row * columns
		rows.append(''.join(sequence[(offset + column) % 10] for column in range(columns)))
	return fill_page(''.join(row + '\n' for row in rows))

def insert_pagenumbers(pages):
	"""Write page numbers on the last line of every 70-line page.

	Every 70th line of ``pages`` is replaced by the page number, indented by
	two spaces per page. No number is written on page 1, on the very last
	line of the text, or from page 56 onwards; those lines keep their
	content.

	Returns the text with a linebreak after every line, including the last.
	"""
	rows = pages.split('\n')
	total = len(rows)
	numbered = []
	for position, row in enumerate(rows, start=1):
		if position % 70 == 0:
			page_number = position // 70
			unnumbered = page_number == 1 or position == total or page_number >= 56
			if not unnumbered:
				row = (' ' * (page_number * 2)) + str(page_number)
		numbered.append(row + '\n')
	return ''.join(numbered)
first commit 6 years ago			`#! /etc/bin/python3`

			`import random, re, subprocess`

			`from hyphen import Hyphenator`
			`import textwrap`
			`from textwrap2 import fill`

			`import nltk`
			`from nltk.tokenize import RegexpTokenizer`
			`tokenizer = RegexpTokenizer(r'[\s\W\w]\w+[\s\W\w\.]\|^\w+\|\w+$') # initialize tokenizer`

			`# language = 'fr'`
			`language = 'en'`

			`def selfwritten_linebreaks(string, linelength):`
			`count = 1`
			`tmp = ''`
			`new = ''`
			`if not 'http' in string:`
			`string = tokenizer.tokenize(string)`
			`for line_number, word in enumerate(string):`
			`count += len(word)`
			`if tmp == '':`
			`if word[0] == ' ':`
			`word = word[1:]`
			`if word == ' ':`
			`continue`
			`if line_number == len(string) - 1:`
			`tmp += word`
			`new += tmp`
			`elif count < linelength:`
			`tmp += word`
			`else:`
			`tmp += word`
			`new += tmp + '\n'`
			`tmp = ''`
			`count = 1`
			`return new`

			`def insert_linebreaks(string, linelength, type='character', double_linebreaks=False):`
			`count = 1`
			`tmp = ''`
			`new = ''`
			`if type == 'word':`
			`if language == 'en':`
			`hyphenator = Hyphenator('en_US')`
			`if language == 'fr':`
			`hyphenator = Hyphenator('fr_FR')`
			`paragraphs = string.split('\n')`
			`for i, paragraph in enumerate(paragraphs):`
			`try:`
			`tmp = fill(paragraph, width=linelength, use_hyphenator=hyphenator)`
			`except Exception as e:`
			`tmp = ''`
			`print('Error:', e)`
			`print('>>> Hyphenator didn\'t work, selfwritten_linebreaks used instead.')`
			`tmp = selfwritten_linebreaks(paragraph, linelength-3) # Calibration`

			`if i + 1 == len(paragraphs): # No double linebreaks when the paragraph is the last one on the page`
			`new += tmp`
			`elif double_linebreaks == True:`
			`new += tmp + '\n\n'`
			`else:`
			`new += tmp + '\n'`
			`return new`
			`if type == 'wrap':`
			`paragraphs = string.split('\n')`
			`new = ''`
			`for i, paragraph in enumerate(paragraphs):`
			`tmp = textwrap.wrap(paragraph, width=linelength)`
			`tmp = '\n'.join(tmp)`

			`if i + 1 == len(paragraphs): # No double linebreaks when the paragraph is the last one on the page`
			`new += tmp`
			`elif double_linebreaks == True:`
			`new += tmp + '\n\n'`
			`else:`
			`new += tmp + '\n'`
			`return new`
			`if type == 'character':`
			`for character in string:`
			`if count == len(string):`
			`tmp += character`
			`new += tmp`
			`elif count < linelength:`
			`tmp += character`
			`count += 1`
			`else:`
			`new += tmp + '\n'`
			`tmp = ''`
			`count = 1`
			`return new`

			`def fill_page(string):`
			`print('--- fill_page() starts ---')`
			`lines = string.split('\n')`
			`total_lines = len(lines)`
			`print(' total_lines :', total_lines)`
			`total_pages = int(total_lines / 70)`
			`print(' total_pages :', total_pages)`
			`full_pages_lines = 70 * total_pages`
			`print(' full_pages :', full_pages_lines)`
			`if (total_lines - full_pages_lines) == 0:`
			`print(' fill_up_lines :', 0)`
			`page = '\n'.join(lines[:total_lines])`
			`else:`
			`fill_up_lines = 70 - (total_lines - full_pages_lines)`
			`print(' fill_up_lines :', fill_up_lines)`
			`page = string + ('\n' * (fill_up_lines))`
			`page_lines = page.split('\n')`

			`# Safety check, to see if the string can be divided by 70 lines`
			`if len(page_lines) % 70 != 0:`
			`print('>>> Careful! The modulo is cutting lines from the pages...', total_lines - full_pages_lines)`
			`page = '\n'.join(page_lines[:full_pages_lines])`

			`print(' page(s) length:', len(page.split('\n')))`
			`print('--- fill_page() ends ---')`
			`return page + '\n'`

			`def insert_text_block(string, inserted, left, width):`
			`left_column_lines = string.split('\n')`
			`right_column_lines = inserted.split('\n')`
			`tmp = False`
			`if len(right_column_lines) > len(left_column_lines):`
			`leading_iterator = right_column_lines`
			`follower = left_column_lines`
			`# print('> right = leader')`
			`else:`
			`leading_iterator = left_column_lines`
			`follower = right_column_lines`
			`# print('> left = leader')`
			`new = ''`
			`for line_number, _ in enumerate(leading_iterator):`
			`# print('Inserting_text_block() ... line_number:', line_number)`
			`# Check if there are still left_column_lines to add`
			`# And count the number of characters of that line`
			`if line_number < len(left_column_lines):`
			`left_column_length = len(left_column_lines[line_number])`
			`# If there is no line anymore, follow the length of the "left" variable`
			`else:`
			`left_column_length = left`

			`# Fill left_column_line up to the "left" variable`
			`if left_column_length < left:`
			`fill_up_spaces = left - len(left_column_lines[line_number])`
			`left_column_line = left_column_lines[line_number] + (' ' * (fill_up_spaces))`
			`else:`
			`left_column_line = ' ' * left`

			`# Append the left_ and right_column_line to the same line`
			`if line_number + 1 <= len(right_column_lines):`
			`new += left_column_line + right_column_lines[line_number] + '\n'`
			`# Unless there is no right_column_line anymore`
			`else:`
			`new += left_column_lines[line_number] + '\n'`

			`if new.endswith('\n'):`
			`new = new[:-1]`
			`return new`

			`def insert_symbol_background(string, linelength, symbols, multiplier):`
			`new = ''`
			`lines = string.split('\n')`

			`for line_number, line in enumerate(lines):`
			`x = line_number + 1`

			`# Apply the multiplier, to create a gradient effect :)`
			`symbols += ' ' * int(x * multiplier)`

			`for c, character in enumerate(line):`
			`try:`
			`# if this is the last character in the line, just add it`
			`if c + 1 == len(line):`
			`character = character`
			`# if previous and next character is a space, add a symbol`
			`elif line[c-1] == ' ' and line[c+1] == ' ':`
			`character = character.replace(' ', random.choice(symbols))`
			`except:`
			`character = character.replace(' ', random.choice(symbols))`
			`new += character`

			`# Fill the line on the right of the text`
			`if c + 1 == len(line):`
			`new += ' '`
			`for _ in range(c + 1, linelength):`
			`new += random.choice(symbols)`

			`new += '\n'`

			`return new`

			`def char_swap(some_string):`
			`swaps = [('–','-'), ('“','"'),('”','"'),('ù','u'), ("’","'"), ('à','a'), ('â','a'),('é','e'),('è','e'),('î','i')]`
			`for swap in swaps:`
			`some_string = some_string.replace(swap[0], swap[1])`
			`return some_string`

			`def convert_to_figlet_font(string, linelength, font='shadow', alignment='left'):`
			`string = char_swap(string) # remove French characters in figlet titles (not all fonts include them...)`
			`string = string.replace('(edition vinyle)', '') # For Javier's titles`
			`text = insert_linebreaks(string, linelength, type='wrap', double_linebreaks=False)`
			`# print('figlet text:', text)`
			`string = ''`
			`aligments = {`
			`'left': '-l',`
			`'right' : '-r',`
			`'center' : '-c'`
			`}`
			`for line in text.split('\n'):`
			`figlet_string = subprocess.check_output(['figlet', line, '-w', str(linelength * 6), '-n', '-f', font, '-p', aligments[alignment]]).decode() + '\n'`

			`# Do not include empty linebreaks in the figlet header`
			`for figlet_line in figlet_string.split('\n'):`
			`non_empty_line = re.search(r'[^\s]', figlet_line)`
			`if non_empty_line:`
			`string += figlet_line + '\n'`

			`return string`

			`def align(string, linewidth, aligment='center'):`
			`len_string = len(string)`
			`margin = int((linewidth - len_string) / 2)`
			`return (' ' * margin) + string + (' ' * margin)`

			`def check_element(element):`
			`if element.name == 'hr':`
			`string = ('-' * 3) + '\n'`
			`elif element.name == None:`
			`string = ''`
			`elif element.name == 'b':`
			`string = '<' + element.text + '>'`
			`else:`
			`string = element.text`
			`return string`

			`def add_headers(section_type, element):`
			`string = ''`
			`# print(' ----> element:', element)`

			`if 'stories' in section_type or 'récits' in section_type:`
			`if 'h2' in element.name:`
			`string += '\n'`
			`string += '--- ' + element.text + ' ---\n'`
			`# string += '^' * len(element.text)`
			`string += '\n'`
			`elif 'h3' in element.name:`
			`header = element.text.upper().replace('STORIES ABOUT', 'STORIES\nABOUT').replace('RÉCITS CONTEXTUALISÉS AUTOUR', ' RÉCITS CONTEXTUALISÉS\nAUTOUR').split('\n')`
			`for line in header:`
			`string += align(line, 56) + '\n'`
			`string += '\n\n'`
			`elif element.get('class'):`
			`if 'toc' in element['class']:`
			`pass`
			`else:`
			`string += check_element(element) + '\n'`

			`elif 'works' in section_type:`
			`if language == 'en':`
			`linewidth = 11`
			`else:`
			`linewidth = 10`

			`if element.get('class'):`
			`if 'lemmaheader' in element['class']:`
			`tmp_string = '\n'`
			`tmp_string += ' ' * 55 + '\n'`
			`tmp_string += ' ' * 55 + '\n'`
			`tmp_string += convert_to_figlet_font(element.text, linewidth, font='ogre', alignment='center')`
			`tmp_string += ' ' * 55 + '\n'`
			`string = insert_symbol_background(tmp_string, 55, ['0', ' ', ' ', ' ',' ', ' ', ' ',' ', ' ', ' ',' ',' ', ' ', ' ', ' ', ' '], 0)`
			`else:`
			`string += check_element(element) + '\n'`
			`else:`
			`string += check_element(element) + '\n'`

			`elif 'glossary' in section_type:`
			`if 'h2' in element.name:`
			`string += '''\`
			`░`
			`░`
			`░ ░ ░ ░`
			`░ ░ ░ ░`
			`░ ░`
			`░ ░`
			`░ {} ░`
			`░`
			`░ ░ ░`
			`░ ░ ░ ░`
			`░`
			`░`
			`░`
			`'''.format(element.text.upper())`
			`string += '\n'`
			`else:`
			`string += check_element(element) + '\n'`
			`else:`
			`string += check_element(element) + '\n'`

			`return string`


			`def apply_zigzag(string, pattern_width):`
			`count = 0`
			`string_lines = [line for line in string.split('\n')]`
			`new = ''`
			`fwd = True`
			`for line in string_lines:`
			`if fwd == True:`
			`if count <= pattern_width:`
			`new += (' ' * count) + line + '\n'`
			`count += 1`
			`else:`
			`fwd = False`
			`new += (' ' * count) + line + '\n'`
			`count -= 1`
			`else:`
			`if count >= 0:`
			`new += (' ' * count) + line + '\n'`
			`count -= 1`
			`else:`
			`fwd = True`
			`new += (' ' * count) + line + '\n'`
			`count += 1`
			`return new`

			`# def text_to_pattern(string, template):`
			`# template = template.split('\n')`
			`# character_position = 0`
			`# new = ''`
			`# for line_number, line in enumerate(template):`
			`# for character in line:`
			`# if character == '░':`
			`# new += string[character_position]`
			`# character_position += 1`
			`# else:`
			`# new += ' '`
			`# new += '\n'`
			`# return new`

			`def counting_pattern(string, linelength):`
			`count = 1`
			`pattern = ''`
			`tmp = ''`
			`string = tokenizer.tokenize(string)`
			`for line_number, word in enumerate(string):`
			`pattern += tmp + '\n'`
			`count += len(word)`
			`if '\n' in word:`
			`word = word.replace('\n','\n\n')`
			`if line_number == len(string):`
			`tmp += word`
			`elif count < linelength:`
			`tmp += word`
			`else:`
			`tmp += word`
			`# pattern += tmp + '\n'`
			`tmp = ''`
			`count = 1`
			`return pattern`

			`def insert_counters_page():`
			`page = ''`
			`num = 0`
			`count = 2`
			`for line in range(1,70):`
			`for i in range(1,110):`
			`if num == 0:`
			`page += ' '`
			`count += 1`
			`num += 1`
			`elif num < 10:`
			`page += str(num)`
			`num += 1`
			`else:`
			`num = 0`
			`if count == 10:`
			`count = 0`
			`page += ' '`
			`num += 1`
			`count += 1`
			`page += '\n'`
			`return fill_page(page)`

			`def insert_pagenumbers(pages):`
			`new = ''`
			`page = 0`
			`lines = pages.split('\n')`
			`for i, line in enumerate(lines):`
			`line_number = i + 1`
			`if line_number % 70 == 0:`
			`page += 1`
			`if page != 1 and line_number != len(lines) and page < 56:`
			`line = (' ' * page * 2) + str(page)`
			`new += line + '\n'`
			`return new`