Files for the publication & poster for Data Workers, an exhibition by Algolit.
http://www.algolit.net/index.php/Data_Workers
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
398 lines
11 KiB
398 lines
11 KiB
6 years ago
|
#! /etc/bin/python3
|
||
|
|
||
|
import random, re, subprocess
|
||
|
|
||
|
from hyphen import Hyphenator
|
||
|
import textwrap
|
||
|
from textwrap2 import fill
|
||
|
|
||
|
import nltk
|
||
|
from nltk.tokenize import RegexpTokenizer
|
||
|
# Module-wide tokenizer shared by selfwritten_linebreaks() and counting_pattern().
# The pattern matches a run of word characters together with one neighbouring
# character on each side, or a bare word at the very start / end of the text.
tokenizer = RegexpTokenizer(r'[\s\W\w]\w+[\s\W\w\.]|^\w+|\w+$') # initialize tokenizer

# Language of the publication. It is read by insert_linebreaks() to pick the
# hyphenation dictionary and by add_headers() to pick the figlet line width.
# Switch the commented line to build the French edition.
# language = 'fr'
language = 'en'
|
||
|
|
||
|
def selfwritten_linebreaks(string, linelength):
    """Break ``string`` into lines without using the hyphenator.

    insert_linebreaks() calls this as a fallback when the hyphenating
    ``fill()`` raises. Pieces of text are appended to the current line and a
    newline is emitted once the running character count reaches
    ``linelength``.

    NOTE(review): the indentation of this function was lost in the copy under
    review. The loop is assumed to sit outside the ``'http'`` check (so texts
    containing a URL are walked character by character), and the ``continue``
    is assumed to belong to the start-of-line check -- confirm both against
    the repository.
    """
    count = 1
    tmp = ''
    new = ''
    # Texts containing a URL are not tokenized; the loop below then iterates
    # over the raw string instead of over tokens.
    if not 'http' in string:
        string = tokenizer.tokenize(string)
    for line_number, word in enumerate(string):
        count += len(word)
        # At the start of a new output line, drop one leading space.
        if tmp == '':
            if word[0] == ' ':
                word = word[1:]
            if word == ' ':
                continue
        # Last piece: append it and flush the unfinished line.
        if line_number == len(string) - 1:
            tmp += word
            new += tmp
        elif count < linelength:
            tmp += word
        else:
            # Line is full: emit it and start counting again.
            tmp += word
            new += tmp + '\n'
            tmp = ''
            count = 1
    return new
|
||
|
|
||
|
def _join_paragraphs(paragraphs, double_linebreaks):
    """Glue wrapped paragraphs together; nothing is added after the last one."""
    separator = '\n\n' if double_linebreaks else '\n'
    return separator.join(paragraphs)


def insert_linebreaks(string, linelength, type='character', double_linebreaks=False):
    """Break ``string`` into lines of at most ``linelength`` characters.

    Parameters:
        string: the text to break; existing newlines separate paragraphs in
            the 'word' and 'wrap' modes.
        linelength: maximum number of characters per output line.
        type: 'word' wraps with the hyphenating ``fill()`` (falling back to
            selfwritten_linebreaks() when that raises), 'wrap' uses
            ``textwrap.wrap``, 'character' cuts every ``linelength``
            characters regardless of word boundaries.
        double_linebreaks: separate paragraphs with an empty line
            ('word' and 'wrap' modes only).

    Returns:
        The re-broken text, or None for an unsupported ``type``.

    Raises:
        ValueError: in 'character' mode when ``linelength`` is smaller than 1.
    """
    if type == 'word':
        # Pick the hyphenation dictionary for the publication language. For
        # any other language ``hyphenator`` stays unbound; the resulting
        # NameError is caught below and triggers the fallback.
        if language == 'en':
            hyphenator = Hyphenator('en_US')
        if language == 'fr':
            hyphenator = Hyphenator('fr_FR')
        wrapped = []
        for paragraph in string.split('\n'):
            try:
                tmp = fill(paragraph, width=linelength, use_hyphenator=hyphenator)
            except Exception as e:
                print('Error:', e)
                print('>>> Hyphenator didn\'t work, selfwritten_linebreaks used instead.')
                tmp = selfwritten_linebreaks(paragraph, linelength-3) # Calibration
            wrapped.append(tmp)
        return _join_paragraphs(wrapped, double_linebreaks)

    if type == 'wrap':
        wrapped = [
            '\n'.join(textwrap.wrap(paragraph, width=linelength))
            for paragraph in string.split('\n')
        ]
        return _join_paragraphs(wrapped, double_linebreaks)

    if type == 'character':
        if linelength < 1:
            raise ValueError('linelength must be at least 1')
        # Slice instead of counting by hand: the previous loop dropped the
        # character that triggered each break and never flushed the tail.
        return '\n'.join(
            string[start:start + linelength]
            for start in range(0, len(string), linelength)
        )

    # Unsupported mode: kept as a silent None for backward compatibility.
    return None
|
||
|
|
||
|
def fill_page(string):
    """Pad ``string`` with empty lines up to a whole number of 70-line pages.

    Progress information is printed along the way. The returned text always
    ends with a newline, so every page consists of 70 newline-terminated lines.
    """
    print('--- fill_page() starts ---')
    rows = string.split('\n')
    row_count = len(rows)
    print(' total_lines :', row_count)
    page_count, leftover = divmod(row_count, 70)
    print(' total_pages :', page_count)
    whole_page_rows = page_count * 70
    print(' full_pages :', whole_page_rows)

    if leftover:
        # The last page is incomplete: top it up with empty lines.
        missing = 70 - leftover
        print(' fill_up_lines :', missing)
        page = string + '\n' * missing
    else:
        print(' fill_up_lines :', 0)
        page = '\n'.join(rows[:row_count])

    # Safety check, to see if the string can be divided by 70 lines
    padded_rows = page.split('\n')
    if len(padded_rows) % 70:
        print('>>> Careful! The modulo is cutting lines from the pages...', leftover)
        page = '\n'.join(padded_rows[:whole_page_rows])

    print(' page(s) length:', len(page.split('\n')))
    print('--- fill_page() ends ---')
    return page + '\n'
|
||
|
|
||
|
def insert_text_block(string, inserted, left, width):
    """Place ``inserted`` as a right-hand column next to ``string``.

    Parameters:
        string: text of the left column, one row per line.
        inserted: text of the right column, one row per line.
        left: column at which the right-hand text starts; left rows are
            padded with spaces (or cut) to exactly this width whenever a
            right-hand row is attached to them.
        width: accepted for interface compatibility; not used.

    Returns:
        The combined rows joined with newlines (no trailing newline). Rows
        without a right-hand counterpart are returned unchanged.
    """
    left_column_lines = string.split('\n')
    right_column_lines = inserted.split('\n')
    column_width = max(left, 0)

    rows = []
    for line_number in range(max(len(left_column_lines), len(right_column_lines))):
        if line_number < len(left_column_lines):
            left_text = left_column_lines[line_number]
        else:
            left_text = ''

        if line_number < len(right_column_lines):
            # Previously a left row of ``left`` or more characters was
            # replaced by blanks; keep its text, cut to the column width.
            rows.append(left_text[:column_width].ljust(column_width)
                        + right_column_lines[line_number])
        else:
            # No right-hand row anymore: keep the left row as it is.
            rows.append(left_text)

    return '\n'.join(rows)
|
||
|
|
||
|
def insert_symbol_background(string, linelength, symbols, multiplier):
    """Sprinkle random symbols into the blank areas of ``string``.

    Parameters:
        string: the text, one row per line.
        linelength: column up to which each non-empty row is filled with
            random symbols on the right of its text.
        symbols: a string or list of single-character strings to pick from.
            It is copied, so the caller's object is never modified.
        multiplier: for row number x (counted from 1), ``int(x * multiplier)``
            extra spaces are added to the pool, so symbols become rarer
            further down (gradient effect).

    Returns:
        The decorated text; every input row is terminated with a newline.
    """
    # Work on a private copy: ``symbols += ...`` on a list would otherwise
    # grow the caller's list in place.
    pool = list(symbols)
    pieces = []

    for row_number, line in enumerate(string.split('\n'), start=1):
        # Apply the multiplier, to create a gradient effect :)
        pool.extend(' ' * int(row_number * multiplier))

        last = len(line) - 1
        for c, character in enumerate(line):
            # The last character is copied as is. Any other character is
            # swapped for a symbol when it is a space surrounded by spaces.
            # NOTE(review): for c == 0, line[c-1] wraps around to the last
            # character of the line; kept as in the original.
            if c != last and line[c-1] == ' ' and line[c+1] == ' ':
                character = character.replace(' ', random.choice(pool))
            pieces.append(character)

        # Fill the line on the right of the text (empty rows stay empty).
        if line:
            pieces.append(' ')
            pieces.extend(random.choice(pool) for _ in range(len(line), linelength))

        pieces.append('\n')

    return ''.join(pieces)
|
||
|
|
||
|
def char_swap(some_string):
    """Return ``some_string`` with typographic and accented characters
    replaced by plain ASCII look-alikes."""
    ascii_lookalikes = str.maketrans({
        '–': '-',
        '“': '"',
        '”': '"',
        'ù': 'u',
        '’': "'",
        'à': 'a',
        'â': 'a',
        'é': 'e',
        'è': 'e',
        'î': 'i',
    })
    return some_string.translate(ascii_lookalikes)
|
||
|
|
||
|
def convert_to_figlet_font(string, linelength, font='shadow', alignment='left'):
    """Render ``string`` as a figlet banner and return it as text.

    The title is first cleaned with char_swap(), wrapped to ``linelength``
    characters, and every wrapped line is passed to the external ``figlet``
    program. Rows of the figlet output that contain only whitespace are
    left out. ``alignment`` must be 'left', 'right' or 'center'.
    """
    alignment_flags = {
        'left': '-l',
        'right' : '-r',
        'center' : '-c'
    }

    # remove French characters in figlet titles (not all fonts include them...)
    title = char_swap(string)
    title = title.replace('(edition vinyle)', '') # For Javier's titles
    wrapped = insert_linebreaks(title, linelength, type='wrap', double_linebreaks=False)

    banner_rows = []
    for text_line in wrapped.split('\n'):
        command = ['figlet', text_line, '-w', str(linelength * 6), '-n', '-f', font, '-p', alignment_flags[alignment]]
        rendered = subprocess.check_output(command).decode() + '\n'

        # Do not include empty linebreaks in the figlet header
        banner_rows.extend(
            row + '\n'
            for row in rendered.split('\n')
            if re.search(r'[^\s]', row)
        )

    return ''.join(banner_rows)
|
||
|
|
||
|
def align(string, linewidth, aligment='center'):
    """Centre ``string`` within ``linewidth`` columns.

    The same number of spaces is put on both sides, so with an odd amount of
    free space the result is one character short of ``linewidth``. Strings
    longer than ``linewidth`` are returned unchanged. ``aligment`` is
    accepted but not used.
    """
    free_space = linewidth - len(string)
    padding = ' ' * int(free_space / 2)
    return ''.join((padding, string, padding))
|
||
|
|
||
|
def check_element(element):
    """Return the plain-text rendering of one parsed element.

    A horizontal rule becomes '---' plus a newline, an element without a tag
    name becomes an empty string, bold text is wrapped in '<' and '>', and
    anything else contributes its text unchanged.
    """
    tag = element.name
    if tag == 'hr':
        return '-' * 3 + '\n'
    if tag is None:
        return ''
    if tag == 'b':
        return '<{}>'.format(element.text)
    return element.text
|
||
|
|
||
|
def add_headers(section_type, element):
    """Return the text for ``element``, decorated according to the section.

    ``section_type`` is searched for the words 'stories' / 'récits', 'works'
    and 'glossary' (in that order); each selects its own header styling.
    Elements that need no special styling are rendered by check_element()
    followed by a newline.

    NOTE(review): the indentation of this function was lost in the copy under
    review; see the notes inside the stories and glossary branches.
    """
    string = ''
    # print(' ----> element:', element)

    if 'stories' in section_type or 'récits' in section_type:
        if 'h2' in element.name:
            # Second-level heading: '--- title ---' surrounded by empty lines.
            string += '\n'
            string += '--- ' + element.text + ' ---\n'
            # string += '^' * len(element.text)
            string += '\n'
        elif 'h3' in element.name:
            # Third-level heading: upper-cased, split over two rows at a fixed
            # phrase and centred within 56 columns.
            header = element.text.upper().replace('STORIES ABOUT', 'STORIES\nABOUT').replace('RÉCITS CONTEXTUALISÉS AUTOUR', ' RÉCITS CONTEXTUALISÉS\nAUTOUR').split('\n')
            for line in header:
                string += align(line, 56) + '\n'
            string += '\n\n'
        elif element.get('class'):
            # Elements carrying the 'toc' class are skipped.
            if 'toc' in element['class']:
                pass
        # NOTE(review): this ``else`` is assumed to close the h2/h3/class
        # chain (so class-less elements are emitted); it may instead belong
        # to the 'toc' check above -- confirm against the repository.
        else:
            string += check_element(element) + '\n'

    elif 'works' in section_type:
        # Width (in characters) at which lemma titles are wrapped before
        # being rendered with figlet.
        if language == 'en':
            linewidth = 11
        else:
            linewidth = 10

        if element.get('class'):
            if 'lemmaheader' in element['class']:
                # Lemma title: figlet banner framed by blank 55-column rows,
                # then decorated with a sparse background of '0' symbols.
                tmp_string = '\n'
                tmp_string += ' ' * 55 + '\n'
                tmp_string += ' ' * 55 + '\n'
                tmp_string += convert_to_figlet_font(element.text, linewidth, font='ogre', alignment='center')
                tmp_string += ' ' * 55 + '\n'
                string = insert_symbol_background(tmp_string, 55, ['0', ' ', ' ', ' ',' ', ' ', ' ',' ', ' ', ' ',' ',' ', ' ', ' ', ' ', ' '], 0)
            else:
                string += check_element(element) + '\n'
        else:
            string += check_element(element) + '\n'

    elif 'glossary' in section_type:
        if 'h2' in element.name:
            # Glossary title placed inside a hand-drawn pattern of '░' blocks.
            # NOTE(review): the spacing inside this literal was collapsed in
            # the copy under review; restore the original layout from the
            # repository before relying on it.
            string += '''\
░
░
░ ░ ░ ░
░ ░ ░ ░
░ ░
░ ░
░ {} ░
░
░ ░ ░
░ ░ ░ ░
░
░
░
'''.format(element.text.upper())
            string += '\n'
        else:
            string += check_element(element) + '\n'
    else:
        string += check_element(element) + '\n'

    return string
|
||
|
|
||
|
|
||
|
def apply_zigzag(string, pattern_width):
    """Indent the lines of ``string`` in a zigzag.

    The indentation grows by one space per line until it has passed
    ``pattern_width``, then shrinks by one per line until it drops below
    zero (a negative indentation renders as none), and so on. Every line of
    the result is terminated with a newline.
    """
    indent = 0
    moving_right = True
    shifted = []
    for row in string.split('\n'):
        # Every row is emitted with the indentation reached so far ...
        shifted.append(' ' * indent + row + '\n')
        # ... and only then is the indentation for the next row worked out.
        if moving_right:
            if indent <= pattern_width:
                indent += 1
            else:
                moving_right = False
                indent -= 1
        elif indent >= 0:
            indent -= 1
        else:
            moving_right = True
            indent += 1
    return ''.join(shifted)
|
||
|
|
||
|
# def text_to_pattern(string, template):
|
||
|
# template = template.split('\n')
|
||
|
# character_position = 0
|
||
|
# new = ''
|
||
|
# for line_number, line in enumerate(template):
|
||
|
# for character in line:
|
||
|
# if character == '░':
|
||
|
# new += string[character_position]
|
||
|
# character_position += 1
|
||
|
# else:
|
||
|
# new += ' '
|
||
|
# new += '\n'
|
||
|
# return new
|
||
|
|
||
|
def counting_pattern(string, linelength):
    """Build a growing-line pattern from the tokens of ``string``.

    Before each token is processed, the line collected so far is written to
    the pattern on a row of its own. The token is then added to that line
    (with its newlines doubled); once the running character count reaches
    ``linelength`` the line is emptied again. The line as it stands after
    the final token is not written out.
    """
    running_length = 1
    current_line = ''
    snapshots = []
    for token in tokenizer.tokenize(string):
        snapshots.append(current_line + '\n')
        # The count uses the token as tokenized, before newlines are doubled.
        running_length += len(token)
        current_line += token.replace('\n', '\n\n')
        if running_length >= linelength:
            # pattern += tmp + '\n'
            current_line = ''
            running_length = 1
    return ''.join(snapshots)
|
||
|
|
||
|
def insert_counters_page():
    """Build a page filled with repeating digit counters.

    The page is made of 69 rows of 109 characters each, containing the
    running sequence ' 123456789' (the sequence continues across rows), and
    is handed to fill_page() before being returned.

    NOTE(review): the indentation of this function was lost in the copy under
    review. Only ``count = 0`` is assumed to depend on ``count == 10``; under
    that reading ``count`` never influences the output -- confirm against the
    repository.
    """
    page = ''
    num = 0
    count = 2
    for line in range(1,70):
        for i in range(1,110):
            if num == 0:
                # Very first character: a blank before the digits start.
                page += ' '
                count += 1
                num += 1
            elif num < 10:
                page += str(num)
                num += 1
            else:
                # After 9: emit a blank and restart the digits at 1.
                num = 0
                if count == 10:
                    count = 0
                page += ' '
                num += 1
                count += 1
        page += '\n'
    return fill_page(page)
|
||
|
|
||
|
def insert_pagenumbers(pages):
    """Write page numbers onto every 70th line of ``pages``.

    The 70th, 140th, ... line is replaced by the page number, indented by
    two spaces per page. This is skipped for page 1, for pages from 56
    onwards and when that line is the very last one. Every line of the
    result is terminated with a newline.
    """
    rows = pages.split('\n')
    row_total = len(rows)
    page_no = 0
    numbered = []
    for position, text in enumerate(rows, start=1):
        if position % 70 == 0:
            page_no += 1
            if 1 < page_no < 56 and position != row_total:
                text = (' ' * page_no * 2) + str(page_no)
        numbered.append(text + '\n')
    return ''.join(numbered)
|