2021-06-16 16:46:25 +02:00
import urllib . request
import os
import re
import json
import jinja2
2021-09-01 13:06:47 +02:00
2021-06-29 19:54:51 +02:00
# Folder on disk where the JSON, HTML and image files are written.
STATIC_FOLDER_PATH = './static'  # without trailing slash

# Prefix used in the rewritten image links inside the HTML.
PUBLIC_STATIC_FOLDER_PATH = '/static'  # without trailing slash

# Sub-folder of STATIC_FOLDER_PATH that save() reads the templates from.
# NOTE(review): presumably set by the importing application -- confirm;
# nothing in this module assigns a real value.
TEMPLATES_DIR = None

# This uses a low quality copy of all the images
# (using a folder with the name "images-small",
# which stores a copy of all the images generated with:
# $ mogrify -quality 5% -adaptive-resize 25% -remap pattern:gray50 * )
fast = False
2021-06-29 19:54:51 +02:00
def API_request(url, pagename):
    """
    Request an API url, save the decoded response as JSON and return it.

    url = API request url (string)
    pagename = string, used as the base name of the saved JSON file
    data = { 'query':
                'pages' :
                    pageid : {
                        'links' : {
                            '?' : '?'
                            'title' : 'pagename'
                        }
                    }
                }
            }
    """
    # The context manager closes the HTTP response once it has been read
    with urllib.request.urlopen(url) as response:
        data = json.loads(response.read())

    # Save response as JSON to be able to inspect API call
    json_file = f'{STATIC_FOLDER_PATH}/{pagename}.json'
    print('Saving JSON:', json_file)
    with open(json_file, 'w') as out:
        out.write(json.dumps(data, indent=4))

    return data
2021-06-30 19:42:14 +02:00
def download_media(html, images, wiki):
    """
    Download the images of a page and point the image links in the HTML
    to the downloaded copies.

    html = string (HTML)
    images = list of filenames (str), left untouched
    wiki = base url of the wiki (string, without trailing slash)

    Returns the html (string) with the replaced image paths.
    """
    import time
    from urllib.parse import quote

    # make sure 'images/' exists
    images_dir = f'{STATIC_FOLDER_PATH}/images'
    os.makedirs(images_dir, exist_ok=True)

    # tmp collection for filename replacements
    replaced = set()

    # Iterate over a reverse-sorted copy: reversed to make sure that 01.png
    # does not override Image01.png in the filename replacements later,
    # a copy so the list of the caller is not reordered.
    for filename in sorted(images, reverse=True):
        filename = filename.replace(' ', '_')  # safe filenames

        # check if the image is already downloaded
        # if not, then download the file
        if not os.path.isfile(f'{images_dir}/{filename}'):

            # first we search for the full filename of the image
            # (quoted, so special characters do not break the request url)
            url = f'{wiki}/api.php?action=query&list=allimages&aifrom={quote(filename)}&format=json'
            with urllib.request.urlopen(url) as response:
                data = json.loads(response.read())

            # we select the first search result
            # (assuming that this is the image we are looking for)
            results = data.get('query', {}).get('allimages', [])
            if results:
                image = results[0]

                # then we download the image
                image_url = image['url']
                image_filename = image['name']
                print('Downloading:', image_filename)
                with urllib.request.urlopen(image_url) as image_response:
                    image_data = image_response.read()

                # and we save it as a file
                # NOTE(review): the file is stored under the name returned by
                # the API, while the check above uses the requested filename;
                # if they differ the image is downloaded again on every run.
                with open(f'{images_dir}/{image_filename}', 'wb') as out:
                    out.write(image_data)
            else:
                # nothing found: skip the download instead of crashing
                print('No image found on the wiki for:', filename)

            time.sleep(3)  # do not overload the server

        # replace src image link (from wiki folder structure to local folder)
        # here the images need to link to the / of the domain, for flask
        image_path = f'{PUBLIC_STATIC_FOLDER_PATH}/images/{filename}'

        # The filename is escaped, so characters like . ( ) + are matched literally.
        # NOTE(review): an already rewritten path still contains
        # '/images/<filename>', so the second pattern can match inside it --
        # verify that no doubled prefix ends up in the output.
        escaped = re.escape(filename)
        img_path_patterns = [rf'(?<!\.)/images/.*?px-{escaped}', rf'(?<!\.)/images/.*?{escaped}']
        for img_path_pattern in img_path_patterns:
            for match in re.findall(img_path_pattern, html):
                if match not in replaced:
                    html = html.replace(match, image_path)
                    replaced.add(match)

    return html
2021-09-16 18:37:23 +02:00
def add_item_inventory_links(html):
    """
    Link every "Item ###" mention to the item index and wrap the numbers
    in the item index in a <span>.

    html = string (HTML)

    Each mention becomes <a id="ii-###-N" href="#Item_Index">Item ###</a>,
    where N counts the mentions of that item (1, 2, ...), so every link has
    a unique id. The surrounding text stays exactly where it was.

    Returns the html (string) with the links and spans added.
    """
    index = {}  # item number (str) -> [1, 2, ...], one entry per mention

    def link_item(item_match):
        # Build the link for one mention and register it in the index
        number = item_match.group(1)
        counts = index.setdefault(number, [])
        counts.append(len(counts) + 1)
        item_id = f'ii-{number}-{counts[-1]}'
        print(f'match: {number} --> {item_id}')
        return f'<a id="{item_id}" href="#Item_Index">Item {number}</a>'

    # THROUGHOUT THE BOOK
    # Find all references in the text to the item index.
    # One pass with a callback: each mention is replaced in place, also when
    # the same sentence fragment appears more than once.
    # NOTE(review): this also matches "Item ###" inside tag attributes, if
    # the wiki html contains any -- verify.
    html = re.sub(r'Item (\d\d\d)', link_item, html)

    # IN THE ITEM INDEX
    # Also add a <span> around the index nr to style it
    # (only around the number, the <li> stays outside of the span)
    html = re.sub(r'<li>(\d\d\d)', r'<li><span class="item_nr">\1</span>', html)

    print("\n-------------\n")
    print("The following items ('###') appear [#, #, ...] many times in the book:\n")
    sorted_index = dict(sorted(index.items()))
    print(sorted_index)
    print("\n-------------\n")

    return html
2021-09-16 18:37:23 +02:00
2021-10-06 17:17:10 +02:00
def tweaking ( html ) :
"""
Apply hand-made, publication-specific corrections to the HTML:
anchors in the TOC, classes on the References heading and list,
image sources, stars, line breaks in long titles, dashes,
and a wrapper <div> around each title (+ author) block.
html = string ( HTML )
Returns the tweaked html ( string ).
"""
html = html . replace ( ' <a href= " #X,_y,_z_(4_filmstills) " ' , ' <a href= " #x,_y,_z_(4_filmstills) " ' ) # change the anchor link in the TOC to lowercase
html = html . replace ( ' <a href= " #Rehearsal_as_the_ %E 2 %80% 98Other %E 2 %80% 99_to_Hypercomputation " ' , ' <a href= " #Rehearsal_as_the_‘ Other’ _to_Hypercomputation " ' ) # replace the percent-encoded quotes in the TOC anchor link with the literal ‘ ’ characters
html = html . replace ( ' <a href= " #We_hardly_encounter_anything_that_didn %E 2 %80% 99t_really_matter " ' , ' <a href= " #We_hardly_encounter_anything_that_didn’ t_really_matter " ' ) # replace the percent-encoded apostrophe in the TOC anchor link with the literal ’ character
2021-10-20 18:30:18 +02:00
html = re . sub ( r ''' <h3><span class= " mw-headline " id= " References.*? " >References</span><span class= " mw-editsection " ><span class= " mw-editsection-bracket " ></span></span></h3>
2021-10-13 19:11:14 +02:00
< ul > ''' , ''' < h3 class = " references " > < span class = " mw-headline " id = " References " > References < / span > < span class = " mw-editsection " > < span class = " mw-editsection-bracket " > < / span > < / span > < / h3 >
2021-10-20 18:30:18 +02:00
< ul class = " references " > ''' , html) # add a "references" class to the h3 and ul (and reset the headline id to "References"), so the elements can be selected with CSS
2021-10-26 21:49:31 +02:00
html = html . replace ( ' src= " ./images/Userinfo.jpg " ' , ' src= " ./images/Userinfo.svg " ' ) # This image is not on the wiki
html = html . replace ( ' src= " ./images/Topology-typography-1A.png " ' , ' src= " ./images/Topology-typography-1A.svg " ' ) # This image is not on the wiki
html = html . replace ( ' src= " ./images/Topology-typography-1B.png " ' , ' src= " ./images/Topology-typography-1B.svg " ' ) # This image is not on the wiki
html = html . replace ( ' src= " ./images/Topology-typography-2A.png " ' , ' src= " ./images/Topology-typography-2A.svg " ' ) # This image is not on the wiki
html = html . replace ( ' src= " ./images/Topology-typography-2B.png " ' , ' src= " ./images/Topology-typography-2B.svg " ' ) # This image is not on the wiki
html = html . replace ( ' trans*feminis ' , ' trans✶feminis ' ) # changing stars
html = html . replace ( ' Trans*feminis ' , ' Trans✶feminis ' ) # changing stars
html = html . replace ( ' star (*) ' , ' star (✶) ' ) # changing stars
html = html . replace ( ' Our trans*feminist lens is sharpened by queer and anti-colonial sensibilities, and oriented towards (but not limited to) trans*generational, trans*media, trans*disciplinary, trans*geopolitical, trans*expertise, and trans*genealogical forms of study. ' , ' Our trans✶feminist lens is sharpened by queer and anti-colonial sensibilities, and oriented towards (but not limited to) trans✶generational, trans✶media, trans✶disciplinary, trans✶geopolitical, trans✶expertise, and trans✶genealogical forms of study. ' ) # changing stars
# The replacements below insert <br> line breaks into specific long titles
html = html . replace ( ' <h2><span class= " mw-headline " id= " Invasive_imagination_and_its_agential_cuts " >Invasive imagination and its agential cuts</span><span class= " mw-editsection " ><span class= " mw-editsection-bracket " ></span></span></h2> ' , ' <h2><span class= " mw-headline " id= " Invasive_imagination_and_its_agential_cuts " >Invasive imagination <br>and its agential cuts</span><span class= " mw-editsection " ><span class= " mw-editsection-bracket " ></span></span></h2> ' )
html = html . replace ( ' <h2><span class= " mw-headline " id= " Volumetric_Regimes:_Material_cultures_of_quantified_presence " >Volumetric Regimes: Material cultures of quantified presence</span><span class= " mw-editsection " ><span class= " mw-editsection-bracket " ></span></span></h2> ' , ' <h2><span class= " mw-headline " id= " Volumetric_Regimes:_Material_cultures_of_quantified_presence " >Volumetric Regimes:<br>Material cultures of<br>quantified presence</span><span class= " mw-editsection " ><span class= " mw-editsection-bracket " ></span></span></h2> ' )
html = html . replace ( ' <h2><span id= " Somatopologies_(materials_for_a_movie_in_the_making) " ></span><span class= " mw-headline " id= " Somatopologies_.28materials_for_a_movie_in_the_making.29 " >Somatopologies (materials for a movie in the making)</span><span class= " mw-editsection " ><span class= " mw-editsection-bracket " ></span></span></h2> ' , ' <h2><span id= " Somatopologies_(materials_for_a_movie_in_the_making) " ></span><span class= " mw-headline " id= " Somatopologies_.28materials_for_a_movie_in_the_making.29 " >Somatopologies (materials<br> for a movie in the making)</span><span class= " mw-editsection " ><span class= " mw-editsection-bracket " ></span></span></h2> ' )
html = html . replace ( ' <h1><span class= " mw-headline " id= " Signs_of_clandestine_disorder:_The_continuous_aftermath_of_3D-computationalism " ><a href= " #Clandestine_disorder " title= " Clandestine disorder " >Signs of clandestine disorder: The continuous aftermath of 3D-computationalism</a></span><span class= " mw-editsection " ><span class= " mw-editsection-bracket " ></span></span></h1> ' , ' <h1><span class= " mw-headline " id= " Signs_of_clandestine_disorder:_The_continuous_aftermath_of_3D-computationalism " ><a href= " #Clandestine_disorder " title= " Clandestine disorder " >Signs of clandestine disorder:<br>The continuous<br>aftermath of 3D-<br>computationalism</a></span><span class= " mw-editsection " ><span class= " mw-editsection-bracket " ></span></span></h1> ' )
html = html . replace ( ' <h2><span class= " mw-headline " id= " The_Industrial_Continuum_of_3D " >The Industrial Continuum of 3D</span><span class= " mw-editsection " ><span class= " mw-editsection-bracket " ></span></span></h2> ' , ' <h2><span class= " mw-headline " id= " The_Industrial_Continuum_of_3D " >The Industrial Continuum<br>of 3D</span><span class= " mw-editsection " ><span class= " mw-editsection-bracket " ></span></span></h2> ' )
html = html . replace ( ' src= " ./images/Continuum_brighton.png " ' , ' src= " ./images/Continuum_brighton.svg " ' ) # This image is not on the wiki
html = html . replace ( ' <h1><span class= " mw-headline " id= " Depths_and_Densities:_Accidented_and_dissonant_spacetimes " ><a href= " #Depths_and_densities " title= " Depths and densities " >Depths and Densities: Accidented and dissonant spacetimes</a></span><span class= " mw-editsection " ><span class= " mw-editsection-bracket " ></span></span></h1> ' , ' <h1><span class= " mw-headline " id= " Depths_and_Densities:_Accidented_and_dissonant_spacetimes " ><a href= " #Depths_and_densities " title= " Depths and densities " >Depths and Densities:<br>Accidented<br>and dissonant<br>spacetimes</a></span><span class= " mw-editsection " ><span class= " mw-editsection-bracket " ></span></span></h1> ' )
html = html . replace ( ' <h2><span class= " mw-headline " id= " Open_Boundary_Conditions:_a_grid_for_intensive_study " >Open Boundary Conditions: a grid for intensive study</span><span class= " mw-editsection " ><span class= " mw-editsection-bracket " ></span></span></h2> ' , ' <h2><span class= " mw-headline " id= " Open_Boundary_Conditions:_a_grid_for_intensive_study " >Open Boundary Conditions:<br>a grid for intensive study</span><span class= " mw-editsection " ><span class= " mw-editsection-bracket " ></span></span></h2> ' )
html = html . replace ( ' <h2><span class= " mw-headline " id= " Depths_and_Densities:_A_Bugged_Report " >Depths and Densities: A Bugged Report</span><span class= " mw-editsection " ><span class= " mw-editsection-bracket " ></span></span></h2> ' , ' <h2><span class= " mw-headline " id= " Depths_and_Densities:_A_Bugged_Report " >Depths and Densities:<br>A Bugged Report</span><span class= " mw-editsection " ><span class= " mw-editsection-bracket " ></span></span></h2> ' )
2021-12-03 00:11:42 +01:00
# Catch the remaining stars that the specific replacements above did not cover
html = html . replace ( ' T*fRP ' , ' T✶fRP ' )
html = html . replace ( ' trans* ' , ' trans✶ ' )
# NOTE(review): the capitalised 'Trans*' is replaced with a lowercase 'trans✶' -- confirm that this is intended
html = html . replace ( ' Trans* ' , ' trans✶ ' )
html = html . replace ( ' (*) ' , ' (✶) ' )
2021-11-08 22:01:14 +01:00
# Wrap every star in a span; this runs after all star replacements above
html = html . replace ( ' ✶ ' , ' <span class= " star " >✶</span> ' )
html = html . replace ( ' <p><a href= " #File ' , ' <p class= " image " ><a href= " #File ' ) # give <p>'s that contain an non-thumb image a .image class
2021-12-03 00:11:42 +01:00
html = html . replace ( ' – ' , ' <span class= " endash " > – </span> ' ) # control the white spaces around an endash
html = html . replace ( ' — ' , ' <span class= " endash " > — </span> ' ) # same for an emdash (it gets the same "endash" class)
# Patterns that describe the title (+ author) blocks of the chapters
pattern1 = r ''' <h2><span class= " mw-headline " id= " .* " >.*</span><span class= " mw-editsection " ><span class= " mw-editsection-bracket " ></span></span></h2>
< p > < b > . * ? < / b >
< / p > ''' # title + author
pattern2 = r ''' <h2><span class= " mw-headline " id= " .*? " >.*?</span><span class= " mw-editsection " ><span class= " mw-editsection-bracket " ></span></span></h2>
< h2 style = " display:none; " > < span class = " mw-headline " id = " .*? " > . * ? < / span > < / h2 >
< p > < b > . * ? < / b >
< / p > ''' # exceptions: custom running headers
pattern3 = r ''' <h2><span class= " mw-headline " id= " .*? " >.*?</span><span class= " mw-editsection " ><span class= " mw-editsection-bracket " ></span></span></h2> ''' # only title
pattern4 = r ''' <h2><span id= " x,_y,_z_ \ (4_filmstills \ ) " ></span><span class= " mw-headline " id= " x.2C_y.2C_z_.284_filmstills.29 " >x, y, z \ (4 filmstills \ )</span><span class= " mw-editsection " ><span class= " mw-editsection-bracket " ></span></span></h2>
< p > < b > Jara Rocha , Femke Snelting < / b >
< / p > '''
pattern5 = r ''' <h2><span id= " Somatopologies_ \ (materials_for_a_movie_in_the_making \ ) " ></span><span class= " mw-headline " id= " Somatopologies_.28materials_for_a_movie_in_the_making.29 " >Somatopologies \ (materials<br> for a movie in the making \ )</span><span class= " mw-editsection " ><span class= " mw-editsection-bracket " ></span></span></h2>
< p > < b > Possible Bodies \( Jara Rocha , Femke Snelting \) < / b >
< / p > '''
pattern6 = r ''' <h2><span id= " Rehearsal_as_the_ \ ‘ Other\ ’ _to_Hypercomputation" ></span><span class= " mw-headline " id= " Rehearsal_as_the_ \ .E2 \ .80 \ .98Other \ .E2 \ .80 \ .99_to_Hypercomputation " >Rehearsal as the \ ‘ Other\ ’ to Hypercomputation</span><span class=" mw-editsection " ><span class= " mw-editsection-bracket " ></span></span></h2>
< p > < b > Maria Dada < / b >
< / p > '''
pattern7 = r ''' <h2><span id= " We_hardly_encounter_anything_that_didn’ t_really_matter " ></span><span class= " mw-headline " id= " We_hardly_encounter_anything_that_didn \ .E2 \ .80 \ .99t_really_matter " >We hardly encounter anything that didn’ t really matter</span><span class= " mw-editsection " ><span class= " mw-editsection-bracket " ></span></span></h2>
< p > < b > Phil Langley in conversation with Possible Bodies < / b >
< / p > '''
# Find every block that matches one of the patterns and wrap it in a <div class="title-wrapper">
# NOTE(review): str.replace touches every occurrence of a matched block; identical blocks that are found more than once would be wrapped repeatedly -- verify
results = re . findall ( rf ' { pattern1 } | { pattern2 } | { pattern3 } | { pattern4 } | { pattern5 } | { pattern6 } | { pattern7 } ' , html )
for match in results :
html = html . replace ( match , f ' <div class= " title-wrapper " > { match } </div> ' )
# for result in results:
# print(result)
# print('---')
# print('---')
# print('AANTAL:', len(results))
# print('SET LEN:', len(set(results)))
# print('---')
# html = html.replace('<div class="multi"></div>', f'<div class="multi"><img src="{ PUBLIC_STATIC_FOLDER_PATH }/images/multi-remix.svg"></div>') # add Multi Remix as SVG
2021-10-26 21:49:31 +02:00
# html = html.replace('', '')
2021-10-13 19:11:14 +02:00
2021-10-06 17:17:10 +02:00
return html
2021-06-16 16:46:25 +02:00
def clean_up(html):
    """
    Remove the wiki interface leftovers from the HTML.

    html = string (HTML)

    Returns the cleaned html (string).
    """
    # remove the [edit]
    # NOTE(review): greedy, so on a single line everything between the first
    # '[' and the last ']' is removed when 'edit' appears in between.
    html = re.sub(r'\[.*edit.*\]', '', html)

    # remove the internal wiki links (turn them into anchors on this page)
    html = re.sub(r'href="/index.php\?title=', 'href="#', html)

    # Remove the brackets around the footnote numbers. The wiki writes them
    # as the numeric entities &#91; and &#93; (a bare '[' is not a valid
    # pattern here, it opens a character set that is never closed).
    # Literal brackets in the text, like [1985], are left alone.
    html = re.sub(r'&#91;(?=\d)', '', html)  # remove left footnote bracket [
    html = re.sub(r'(?<=\d)&#93;', '', html)  # remove right footnote bracket ]

    return html
2021-09-22 17:36:30 +02:00
def fast_loader(html):
    """
    Point the image links to the low quality copies when FAST mode is on.

    html = string (HTML)

    Returns the html (string); unchanged when the module-level switch
    `fast` is off.
    """
    if fast:
        html = html.replace('/images/', '/images-small/')
        print('--- rendered in FAST mode ---')

    return html
2021-06-29 19:54:51 +02:00
def parse_page(pagename, wiki):
    """
    Request the parsed version of a wiki page and run it through all the
    processing steps.

    pagename = string
    wiki = string (base url of the wiki)
    html = string (HTML), or None when the response has no 'parse' entry
    """
    api_url = f'{wiki}/api.php?action=parse&page={pagename}&pst=True&format=json'
    data = API_request(api_url, pagename)
    # print(json.dumps(data, indent=4))

    # Nothing to process when the API did not return a parsed page
    if 'parse' not in data:
        return None

    parsed = data['parse']
    html = download_media(parsed['text']['*'], parsed['images'], wiki)

    # The remaining steps each take the html and return the updated html
    for step in (clean_up, add_item_inventory_links, tweaking, fast_loader):
        html = step(html)

    return html
2021-10-13 14:26:23 +02:00
def save(html, pagename):
    """
    Save the HTML to file(s) in the static folder.

    html = string (HTML)
    pagename = string

    Command-line: renders the html into template.html and
    template.inspect.html and saves <pagename>.html and
    <pagename>.inspect.html.
    Imported (Flask application): saves the bare html as Unfolded.html.
    """
    if __name__ == "__main__":
        # command-line
        outputs = (
            ('template.html', 'html'),  # final page that will be used with PagedJS
            ('template.inspect.html', 'inspect.html'),  # extra html page for debugging (CLI only)
        )
        for template_name, extension in outputs:
            # NOTE(review): TEMPLATES_DIR is None at module level, which
            # makes this path contain "None" -- confirm where it is set.
            template_path = f'{STATIC_FOLDER_PATH}/{TEMPLATES_DIR}/{template_name}'
            with open(template_path, encoding='utf-8') as template_file:
                template = jinja2.Template(template_file.read())
            doc = template.render(publication_unfolded=html, title=pagename)

            html_file = f'{STATIC_FOLDER_PATH}/{pagename}.{extension}'
            print('Saving HTML:', html_file)
            # explicit utf-8: the html contains non-ASCII characters
            with open(html_file, 'w', encoding='utf-8') as out:
                out.write(doc)
    else:
        # Flask application
        with open(f'{STATIC_FOLDER_PATH}/Unfolded.html', 'w', encoding='utf-8') as out:
            out.write(html)  # save the html to a file (without <head>)
2021-06-29 19:54:51 +02:00
def update_material_now(pagename, wiki):
    """
    Download and process the latest version of a page.

    pagename = string
    wiki = string (base url of the wiki)
    returns the result of parse_page(pagename, wiki)
    """
    return parse_page(pagename, wiki)
# ---
2021-06-29 19:54:51 +02:00
if __name__ == "__main__":

    # Settings for a command-line run
    wiki = 'https://volumetricregimes.xyz'  # remove tail slash '/'
    pagename = 'Unfolded'

    # download the latest version of the page
    publication_unfolded = update_material_now(pagename, wiki)

    # save the page to file
    save(publication_unfolded, pagename)
2021-06-16 16:46:25 +02:00