@ -49,6 +49,12 @@ def download_media(html, images, wiki):
if not os . path . exists ( f ' { STATIC_FOLDER_PATH } /images ' ) :
os . makedirs ( f ' { STATIC_FOLDER_PATH } /images ' )
# tmp list for filename replacements
replaced = [ ]
images . sort ( )
images . reverse ( ) # reverse to make sure that 01.png does not override Image01.png in the filename replacements later
# download media files
for filename in images :
filename = filename . replace ( ' ' , ' _ ' ) # safe filenames
@ -81,15 +87,26 @@ def download_media(html, images, wiki):
import time
time . sleep ( 3 ) # do not overload the server
# replace src link
# replace src image link (from wiki folder structure to local folder)
image_path = f ' { PUBLIC_STATIC_FOLDER_PATH } /images/ { filename } ' # here the images need to link to the / of the domain, for flask :/// confusing! this breaks the whole idea to still be able to make a local copy of the file
matches = re . findall ( rf ' src= " /images/.*?px- { filename } " ' , html ) # for debugging
if matches :
html = re . sub ( rf ' src= " /images/.*?px- { filename } " ' , f ' src= " { image_path } " ' , html )
else :
matches = re . findall ( rf ' src= " /images/.*? { filename } " ' , html ) # for debugging
html = re . sub ( rf ' src= " /images/.*? { filename } " ' , f ' src= " { image_path } " ' , html )
# print(f'{filename}: {matches}\n------') # for debugging: each image should have the correct match!
img_path_patterns = [ rf ' (?<! \ .)/images/.*?px- { filename } ' , rf ' (?<! \ .)/images/.*? { filename } ' ]
for img_path_pattern in img_path_patterns :
matches = re . findall ( img_path_pattern , html ) # for debugging
# print(f'{ filename }\n')
if matches :
for match in matches :
if match not in replaced :
# print(f' { match } --> { image_path }') # for debugging: each image should have the correct match!
html = html . replace ( match , image_path )
replaced . append ( match )
# else:
# print(' already replaced!')
# print('\n------\n')
# break
# else:
# print(' no match!')
# print('\n------\n')
return html
@ -97,37 +114,38 @@ def add_item_inventory_links(html):
"""
html = string ( HTML )
"""
# THROUGHOUT THE BOOK
# Find all references in the text to the item index
pattern = r ' Item \ d \ d \ d '
matches = re . findall ( pattern , html )
matches = re . findall ( r ' \ w.*?Item \ d \ d \ d.*? \ w \ w \ w ' , html ) # Dodgy attempt to find unique patterns for each mentioning of Item ###
index = { }
new_html = ' '
from nltk . tokenize import sent_tokenize
for line in sent_tokenize ( html ) :
for match in matches :
if match in line :
number = match . replace ( ' Item ' , ' ' ) . strip ( )
if not number in index :
index [ number ] = [ ]
count = 1
else :
count = index [ number ] [ - 1 ] + 1
index [ number ] . append ( count )
item_id = f ' ii- { number } - { index [ number ] [ - 1 ] } '
line = line . replace ( match , f ' Item <a id= " { item_id } " href= " #Item_Index " > { number } </a> ' )
# the line is pushed back to the new_html
new_html + = line + ' '
for match in matches :
item_match = re . search ( r ' Item \ d \ d \ d ' , match )
item = item_match . group ( )
number = item . replace ( ' Item ' , ' ' ) . strip ( )
text = match . replace ( f ' Item { number } ' , ' ' )
if not number in index :
index [ number ] = [ ]
count = 1
else :
count = index [ number ] [ - 1 ] + 1
index [ number ] . append ( count )
item_id = f ' ii- { number } - { index [ number ] [ - 1 ] } '
print ( f ' match: { number } --> { item_id } --> { text } ' )
html = html . replace ( match , f ' <a id= " { item_id } " href= " #Item_Index " >Item { number } </a> { text } ' )
# IN THE ITEM INDEX
# Also add a <span> around the index nr to style it
matches = re . findall ( r ' <li> \ d \ d \ d ' , new_ html)
matches = re . findall ( r ' <li> \ d \ d \ d ' , html )
for match in matches :
new_ html = new_ html. replace ( match , f ' <li><span class= " item_nr " > { match } </span> ' )
html = html . replace ( match , f ' <li><span class= " item_nr " > { match } </span> ' )
# import json
# print(json.dumps(index, indent=4))
print ( " \n ------------- \n " )
print ( " The following items ( ' ### ' ) appear [#, #, ...] many times in the book: \n " )
sorted_index = dict ( sorted ( index . items ( ) ) )
print ( sorted_index )
print ( " \n ------------- \n " )
return new_html
return html
def tweaking ( html ) :
"""
@ -157,14 +175,7 @@ def tweaking(html):
html = html . replace ( ' <h1><span class= " mw-headline " id= " Depths_and_Densities:_Accidented_and_dissonant_spacetimes " ><a href= " #Depths_and_densities " title= " Depths and densities " >Depths and Densities: Accidented and dissonant spacetimes</a></span><span class= " mw-editsection " ><span class= " mw-editsection-bracket " ></span></span></h1> ' , ' <h1><span class= " mw-headline " id= " Depths_and_Densities:_Accidented_and_dissonant_spacetimes " ><a href= " #Depths_and_densities " title= " Depths and densities " >Depths and Densities:<br>Accidented<br>and dissonant<br>spacetimes</a></span><span class= " mw-editsection " ><span class= " mw-editsection-bracket " ></span></span></h1> ' )
html = html . replace ( ' <h2><span class= " mw-headline " id= " Open_Boundary_Conditions:_a_grid_for_intensive_study " >Open Boundary Conditions: a grid for intensive study</span><span class= " mw-editsection " ><span class= " mw-editsection-bracket " ></span></span></h2> ' , ' <h2><span class= " mw-headline " id= " Open_Boundary_Conditions:_a_grid_for_intensive_study " >Open Boundary Conditions:<br>a grid for intensive study</span><span class= " mw-editsection " ><span class= " mw-editsection-bracket " ></span></span></h2> ' )
html = html . replace ( ' <h2><span class= " mw-headline " id= " Depths_and_Densities:_A_Bugged_Report " >Depths and Densities: A Bugged Report</span><span class= " mw-editsection " ><span class= " mw-editsection-bracket " ></span></span></h2> ' , ' <h2><span class= " mw-headline " id= " Depths_and_Densities:_A_Bugged_Report " >Depths and Densities:<br>A Bugged Report</span><span class= " mw-editsection " ><span class= " mw-editsection-bracket " ></span></span></h2> ' )
# html = html.replace('trans*generational, trans*media, trans*disciplinary, trans*geopolitical, trans*expertise, and trans*genealogical concerns', 'trans✶generational, trans✶media, trans✶disciplinary, trans✶geopolitical, trans✶expertise, and trans✶genealogical concerns')
# html = html.replace('trans*generational', 'trans*generational')
# html = html.replace('trans*media', 'trans✶media')
# html = html.replace('trans*disciplinary', 'trans✶disciplinary')
# html = html.replace('trans*geopolitical', 'trans✶geopolitical')
# html = html.replace('trans*activists', 'trans✶activists')
# html = html.replace('trans*expertise', 'trans✶expertise')
# html = html.replace('trans*genealogical', 'trans✶genealogical')
html = html . replace ( ' T*fRP ' , ' T✶fRP ' )
html = html . replace ( ' trans* ' , ' trans✶ ' )
html = html . replace ( ' Trans* ' , ' trans✶ ' )
html = html . replace ( ' (*) ' , ' (✶) ' )