data-workers-publication/functions.py

426 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
import random, re, subprocess
from math import sin
from hyphen import Hyphenator
import textwrap
from textwrap2 import fill
import nltk
from nltk.tokenize import RegexpTokenizer
# Module-level tokenizer used by selfwritten_linebreaks() and counting_pattern().
# The pattern has three alternatives: any single character followed by one or
# more word characters and one more character; a run of word characters at the
# very start of the text; a run of word characters at the very end.
# NOTE(review): assuming RegexpTokenizer returns non-overlapping matches, text
# that is not covered by any alternative (e.g. a one-letter word whose
# neighbouring spaces were consumed by adjacent tokens) would be dropped -
# confirm against real input.
tokenizer = RegexpTokenizer(r'[\s\W\w]\w+[\s\W\w\.]|^\w+|\w+$') # initialize tokenizer
# Publication language: read by insert_linebreaks() to pick the hyphenation
# dictionary and by add_headers() to pick the figlet line width.
language = 'fr'
# language = 'en'
def selfwritten_linebreaks(string, linelength):
    """Break *string* into lines of roughly *linelength* characters.

    Text that does not contain 'http' is split into pieces with the
    module-level ``tokenizer``; text containing 'http' is walked character
    by character. A piece is always appended to the current line before the
    length is checked, so a line can end up longer than *linelength*.

    Returns the re-broken text; the last line carries no trailing newline.
    """
    # NOTE(review): the indentation of this file was lost; the loop is assumed
    # to run for both tokenized and character-wise input - confirm.
    pieces = string if 'http' in string else tokenizer.tokenize(string)
    last_index = len(pieces) - 1
    finished = ''
    current = ''
    used = 1
    for index, piece in enumerate(pieces):
        used += len(piece)
        if current == '':
            # A fresh line should not start with a space.
            if piece[0] == ' ':
                piece = piece[1:]
            # NOTE(review): assumed to apply only at the start of a line
            # (nesting could not be recovered from the source) - confirm.
            if piece == ' ':
                continue
        if index == last_index:
            # Final piece: flush whatever is pending, without a newline.
            current += piece
            finished += current
        elif used < linelength:
            current += piece
        else:
            # Line is full: close it with this piece and start a new one.
            finished += current + piece + '\n'
            current = ''
            used = 1
    return finished
def insert_linebreaks(string, linelength, type='character', double_linebreaks=False):
    """Re-break *string* so that lines are at most *linelength* wide.

    Args:
        string: Text to wrap. In 'word' and 'wrap' mode every '\\n' separates
            a paragraph that is wrapped on its own.
        linelength: Maximum line width in characters.
        type: 'word' (hyphenating wrap via ``fill`` with a ``Hyphenator`` for
            the module-level ``language``, falling back to
            ``selfwritten_linebreaks``), 'wrap' (plain ``textwrap.wrap``) or
            'character' (hard break every *linelength* characters).
        double_linebreaks: Separate paragraphs with a blank line instead of a
            single newline ('word' and 'wrap' mode only).

    Returns:
        The wrapped text without a trailing newline, or ``None`` for an
        unknown *type* (unchanged from the previous behaviour).

    Raises:
        ValueError: In 'character' mode when *linelength* is smaller than 1.
    """
    # The last paragraph never gets a separator, so a plain join is enough.
    separator = '\n\n' if double_linebreaks else '\n'

    if type == 'word':
        hyphenator = None
        if language == 'en':
            hyphenator = Hyphenator('en_US')
        elif language == 'fr':
            hyphenator = Hyphenator('fr_FR')
        wrapped = []
        for paragraph in string.split('\n'):
            try:
                if hyphenator is None:
                    raise ValueError('no hyphenation dictionary for language %r' % (language,))
                tmp = fill(paragraph, width=linelength, use_hyphenator=hyphenator)
            except Exception as e:
                # Deliberate best-effort: report the problem and fall back to
                # the home-grown line breaker instead of aborting the page.
                print('Error:', e)
                print('>>> Hyphenator didn\'t work, selfwritten_linebreaks used instead.')
                tmp = selfwritten_linebreaks(paragraph, linelength-3) # Calibration
            wrapped.append(tmp)
        return separator.join(wrapped)

    if type == 'wrap':
        return separator.join(
            '\n'.join(textwrap.wrap(paragraph, width=linelength))
            for paragraph in string.split('\n')
        )

    if type == 'character':
        if linelength < 1:
            raise ValueError('linelength must be at least 1')
        # Slice into fixed-size chunks. The previous counter-based loop lost
        # the character at every break and usually the whole last line.
        return '\n'.join(
            string[start:start + linelength]
            for start in range(0, len(string), linelength)
        )

    # Unknown mode: callers have always received None here.
    return None
def fill_page(string, lines_per_page=70):
    """Pad *string* with empty lines up to a whole number of pages.

    Every element of ``string.split('\\n')`` counts as one line. If the line
    count is not a multiple of *lines_per_page*, newlines are appended until
    it is. One final newline is always added to the result. Progress
    information is printed to stdout.

    Args:
        string: Page content.
        lines_per_page: Number of lines on one page (70 by default).

    Returns:
        The padded text followed by a single newline.

    Raises:
        ValueError: If *lines_per_page* is smaller than 1.
    """
    if lines_per_page < 1:
        raise ValueError('lines_per_page must be at least 1')
    print('--- fill_page() starts ---')
    total_lines = len(string.split('\n'))
    print(' total_lines :', total_lines)
    total_pages, remainder = divmod(total_lines, lines_per_page)
    print(' total_pages :', total_pages)
    full_pages_lines = lines_per_page * total_pages
    print(' full_pages :', full_pages_lines)
    fill_up_lines = lines_per_page - remainder if remainder else 0
    print(' fill_up_lines :', fill_up_lines)
    # After padding the line count is always a multiple of lines_per_page,
    # so the former "safety check" that truncated the page could never run.
    page = string + ('\n' * fill_up_lines)
    print(' page(s) length:', len(page.split('\n')))
    print('--- fill_page() ends ---')
    return page + '\n'
def insert_text_block(string, inserted, left, width):
    """Place the lines of *inserted* to the right of the lines of *string*.

    Each output line is the left line padded with spaces to *left* columns,
    followed by the matching right line. When the right column has run out,
    the left line is emitted unchanged. *width* is accepted but not used.

    Returns the combined lines joined by newlines, without a trailing newline.
    """
    left_lines = string.split('\n')
    right_lines = inserted.split('\n')
    blank = ' ' * left
    combined = []
    for index in range(max(len(left_lines), len(right_lines))):
        if index >= len(right_lines):
            # Nothing to insert on this row any more: keep the left text as is.
            combined.append(left_lines[index])
            continue
        if index < len(left_lines) and len(left_lines[index]) < left:
            left_part = left_lines[index].ljust(left)
        else:
            # NOTE(review): a left line that is `left` characters or longer is
            # replaced by blanks here, exactly as before - confirm intended.
            left_part = blank
        combined.append(left_part + right_lines[index])
    return '\n'.join(combined)
def insert_symbol_background(string, linelength, symbols, multiplier):
    """Scatter random symbols into the empty areas around a block of text.

    For every line, a space whose neighbours on both sides are spaces is
    replaced by a random choice from *symbols*. Each non-empty line is then
    followed by one space and random symbols up to *linelength* columns.
    Empty lines are kept empty.

    Args:
        string: Text block, lines separated by '\\n'.
        linelength: Column up to which each non-empty line is filled.
        symbols: Sequence of one-character strings to choose from. It is
            copied, so the caller's object is never modified.
        multiplier: For the n-th line (counting from 1),
            ``int(n * multiplier)`` extra spaces are added to the pool of
            choices; the pool keeps growing from line to line, which thins
            the symbols out towards the bottom.

    Returns:
        The decorated text; every line, including the last, ends with '\\n'.
    """
    # Work on a copy: `symbols += ...` used to extend the caller's list.
    pool = list(symbols)
    pieces = []
    for line_number, line in enumerate(string.split('\n'), start=1):
        # Apply the multiplier, to create a gradient effect :)
        pool.extend(' ' * int(line_number * multiplier))
        last = len(line) - 1
        for c, character in enumerate(line):
            # The last character is always kept as it is. For the first
            # character, line[c - 1] wraps around to the end of the line
            # (behaviour kept from the original implementation).
            if c != last and line[c - 1] == ' ' and line[c + 1] == ' ':
                character = character.replace(' ', random.choice(pool))
            pieces.append(character)
        if line:
            # Fill the line on the right of the text
            pieces.append(' ')
            pieces.extend(random.choice(pool) for _ in range(len(line), linelength))
        pieces.append('\n')
    return ''.join(pieces)
def char_swap(some_string):
    """Replace typographic and accented characters by plain ASCII look-alikes.

    Returns a copy of *some_string* in which every character listed below is
    swapped for its replacement; all other characters are left untouched.
    """
    # NOTE(review): four source characters of the original table were lost
    # (they showed up as empty strings, which made str.replace insert the
    # replacement between every character). They are assumed to have been
    # the en dash and the curly quotes/apostrophe - confirm.
    swaps = {
        '\u2013': '-',   # en dash
        '\u201c': '"',   # left double quotation mark
        '\u201d': '"',   # right double quotation mark
        '\u2019': "'",   # right single quotation mark / apostrophe
        'ù': 'u',
        'à': 'a',
        'â': 'a',
        'é': 'e',
        'è': 'e',
        'î': 'i',
    }
    # All keys are single characters, so one translate pass does every swap.
    return some_string.translate(str.maketrans(swaps))
def convert_to_figlet_font(string, linelength, font='shadow', alignment='left'):
    """Render *string* as ASCII-art by calling the external ``figlet`` program.

    The text is first passed through char_swap(), stripped of the literal
    '(edition vinyle)' and wrapped to *linelength* characters with
    insert_linebreaks(type='wrap'). Each wrapped line is rendered separately
    with an output width of ``linelength * 6``; rendered lines that contain
    only whitespace are left out.

    *alignment* must be 'left', 'right' or 'center' (KeyError otherwise).
    Returns the rendered lines, each terminated by a newline.
    """
    alignment_flags = {
        'left': '-l',
        'right': '-r',
        'center': '-c',
    }
    # remove French characters in figlet titles (not all fonts include them...)
    cleaned = char_swap(string)
    cleaned = cleaned.replace('(edition vinyle)', '') # For Javier's titles
    wrapped = insert_linebreaks(cleaned, linelength, type='wrap', double_linebreaks=False)
    rendered_lines = []
    for line in wrapped.split('\n'):
        command = ['figlet', line, '-w', str(linelength * 6), '-n', '-f', font, '-p', alignment_flags[alignment]]
        rendered = subprocess.check_output(command).decode() + '\n'
        # Do not include empty linebreaks in the figlet header
        for rendered_line in rendered.split('\n'):
            if re.search(r'[^\s]', rendered_line):
                rendered_lines.append(rendered_line + '\n')
    return ''.join(rendered_lines)
def align(string, linewidth, aligment='center'):
    """Center *string* within *linewidth* columns.

    The same number of spaces (half of the free room, rounded towards zero)
    is put on both sides, so the result can be one character short of
    *linewidth*. A string longer than *linewidth* is returned unchanged.
    The *aligment* argument is accepted but currently ignored.
    """
    padding = ' ' * int((linewidth - len(string)) / 2)
    return padding + string + padding
def check_element(element):
    """Return the plain-text rendering of a single markup element.

    *element* needs a ``name`` and a ``text`` attribute (presumably a
    BeautifulSoup node - verify against the caller).

    - 'hr' becomes a line of three dashes followed by a newline,
    - an element without a name becomes an empty string,
    - 'b' becomes its text wrapped in angle brackets,
    - anything else becomes its text.
    """
    tag = element.name
    if tag == 'hr':
        return ('-' * 3) + '\n'
    if tag is None:
        return ''
    if tag == 'b':
        return '<' + element.text + '>'
    return element.text
def add_headers(section_type, element):
    """Return the text for *element*, with section-specific header styling.

    *section_type* is tested with ``in`` for 'stories'/'récits', 'works' and
    'glossary'; any other section falls back to check_element(). *element*
    must offer ``name``, ``text``, ``get('class')`` and ``['class']``
    (presumably a BeautifulSoup tag - verify against the caller).

    NOTE(review): the indentation of this file was lost and the nesting below
    was reconstructed; see the note in the stories branch. The exact
    whitespace inside the glossary template could not be recovered either.
    """
    string = ''
    # print(' ----> element:', element)
    if 'stories' in section_type or 'récits' in section_type:
        if 'h2' in element.name:
            # Second-level title: '--- title ---' with a blank line around it.
            string += '\n'
            string += '--- ' + element.text + ' ---\n'
            # string += '^' * len(element.text)
            string += '\n'
        elif 'h3' in element.name:
            # Third-level title: upper-cased, split over two lines at a fixed
            # phrase and centered with align() in a 56 column field.
            header = element.text.upper().replace('STORIES ABOUT', 'STORIES\nABOUT').replace('RÉCITS CONTEXTUALISÉS AUTOUR', ' RÉCITS CONTEXTUALISÉS\nAUTOUR').split('\n')
            for line in header:
                string += align(line, 56) + '\n'
            string += '\n\n'
        elif element.get('class'):
            # Elements carrying the 'toc' class are left out.
            if 'toc' in element['class']:
                pass
        # NOTE(review): this `else` is assumed to close the outer if/elif
        # chain, so elements without a class are emitted and elements with
        # any class are skipped. It could also belong to the `'toc'` test
        # above - confirm against the original file.
        else:
            string += check_element(element) + '\n'
    elif 'works' in section_type:
        # Line width (in characters) used to wrap the figlet title.
        if language == 'en':
            linewidth = 11
        else:
            linewidth = 10
        if element.get('class'):
            if 'lemmaheader' in element['class']:
                # Title of a work: figlet banner framed by 55-column blank
                # lines, then decorated by insert_symbol_background().
                tmp_string = '\n'
                tmp_string += ' ' * 55 + '\n'
                tmp_string += ' ' * 55 + '\n'
                tmp_string += convert_to_figlet_font(element.text, linewidth, font='ogre', alignment='center')
                tmp_string += ' ' * 55 + '\n'
                # The result replaces (does not extend) `string`.
                string = insert_symbol_background(tmp_string, 55, ['0', ' ', ' ', ' ',' ', ' ', ' ',' ', ' ', ' ',' ',' ', ' ', ' ', ' ', ' '], 0)
            else:
                string += check_element(element) + '\n'
        else:
            string += check_element(element) + '\n'
    elif 'glossary' in section_type:
        if 'h2' in element.name:
            # Glossary title: upper-cased text placed inside a fixed pattern.
            string += '''\
░ ░ ░ ░
░ ░ ░ ░
░ ░
░ ░
{}
░ ░ ░
░ ░ ░ ░
'''.format(element.text.upper())
            string += '\n'
        else:
            string += check_element(element) + '\n'
    else:
        string += check_element(element) + '\n'
    return string
def apply_zigzag(string, pattern_width):
    """Indent the lines of *string* in a zigzag pattern.

    The indentation starts at zero and grows by one space per line until it
    has passed *pattern_width*, then shrinks by one per line until it has
    dropped below zero (a negative indent renders as no indent), then grows
    again, and so on.

    Returns the indented lines, each terminated by a newline.
    """
    indent = 0
    step = 1
    shifted = []
    for line in string.split('\n'):
        shifted.append((' ' * indent) + line + '\n')
        # Turn around only after the limit has been overshot by one.
        if step == 1 and indent > pattern_width:
            step = -1
        elif step == -1 and indent < 0:
            step = 1
        indent += step
    return ''.join(shifted)
# def text_to_pattern(string, template):
# template = template.split('\n')
# character_position = 0
# new = ''
# for line_number, line in enumerate(template):
# for character in line:
# if character == '░':
# new += string[character_position]
# character_position += 1
# else:
# new += ' '
# new += '\n'
# return new
def counting_pattern(string, linelength):
    """Build a pattern of growing lines from the tokens of *string*.

    The text is split with the module-level ``tokenizer``. Before each token
    is processed, the line collected so far is written out followed by a
    newline, so successive output lines grow token by token. Once the running
    length reaches *linelength* the collected line starts over. Newlines
    inside a token are doubled. The line collected after the last token is
    not written out.
    """
    tokens = tokenizer.tokenize(string)
    rows = []
    current = ''
    used = 1
    for token in tokens:
        rows.append(current + '\n')
        used += len(token)
        current += token.replace('\n', '\n\n')
        if used >= linelength:
            # pattern += tmp + '\n'
            current = ''
            used = 1
    return ''.join(rows)
def insert_counters_page():
    """Return a page filled with a repeating ' 123456789' counter.

    69 lines of 109 characters are generated; the counter runs on across
    line ends. The result is handed to fill_page() for padding.
    """
    # NOTE(review): nesting reconstructed from a source without indentation;
    # one character is assumed to be written per inner-loop step - confirm.
    rows = []
    position = 0
    for _ in range(1, 70):
        cells = []
        for _ in range(1, 110):
            digit = position % 10
            # Every tenth position is a blank instead of a zero.
            cells.append(' ' if digit == 0 else str(digit))
            position += 1
        rows.append(''.join(cells) + '\n')
    return fill_page(''.join(rows))
def insert_pagenumbers(pages):
    """Write page numbers onto every 70th line of *pages*.

    Each 70th line marks the end of a page. From page 2 up to page 55 that
    line is replaced by the page number, indented by two spaces per page.
    The very last line of the input is never replaced.

    Returns all lines, each followed by a newline.
    """
    lines = pages.split('\n')
    total = len(lines)
    page = 0
    numbered = []
    for line_number, line in enumerate(lines, start=1):
        if line_number % 70 == 0:
            page += 1
            if 1 < page < 56 and line_number != total:
                line = (' ' * (page * 2)) + str(page)
        numbered.append(line + '\n')
    return ''.join(numbered)
def sinus_jj():
    """Return a page of digits spaced out along a sine wave.

    For x = 0 .. 110*70 - 1 the digit ``x % 10`` is written, followed by
    ``int((sin(5 * x) + 1) * 10)`` spaces (that number is also printed to
    stdout). The resulting stream is cut into rows of 110 characters, of
    which the first 69 are returned, each terminated by a newline.
    """
    line_width = 110
    line_height = 70
    pieces = []
    for x in range(line_width * line_height):
        s = int((sin(5 * x) + 1) * 10)
        print(s)
        pieces.append(str(x % 10) + (' ' * s))
    out = ''.join(pieces)
    rows = [out[start:start + line_width] for start in range(0, len(out), line_width)]
    # Every step writes at least one character, so there are always more
    # than 69 rows to take from.
    return '\n'.join(rows[:69]) + '\n'