volumetric-regimes-book/web-interface/update.py


								import json
								import os
								import re
								import time
								import urllib.parse
								import urllib.request

								import jinja2


								# Folder that receives the generated files: the JSON dumps of the API calls,
								# the rendered HTML pages and the downloaded images (in an 'images' subfolder).
								STATIC_FOLDER_PATH = './static' # without trailing slash

								# Folder with the Jinja2 wrapping templates; save() looks it up inside STATIC_FOLDER_PATH.
								WRAPPING_TEMPLATES_DIR = './wrapping-templates'


								def API_request(url, pagename):
									"""
										Fetch an API url, decode the JSON answer and keep a copy of it on disk.

										url = API request url (string)
										pagename = string, used as the basename of the saved JSON file

										Returns data = the decoded JSON response.
										For the action=parse request sent by parse_page(), the entries
										that the rest of this file reads are:

										data =  { 'parse': {
													'text' : { '*' : 'html of the page' },
													'images' : [ 'filename', ... ]
												}
											}
									"""
									# the with-block closes the HTTP connection once the body has been read
									with urllib.request.urlopen(url) as response:
										data = json.loads(response.read())

									# Save response as JSON to be able to inspect API call
									json_file = f'{ STATIC_FOLDER_PATH }/{ pagename }.json'
									print('Saving JSON:', json_file)
									with open(json_file, 'w') as out:
										out.write(json.dumps(data, indent=4))

									return data


								def download_media(html, images, wiki):
									"""
										Download the images of a page that are not stored locally yet
										and point the HTML to the local copies.

										html = string (HTML)
										images = list of filenames (str)
										wiki = base url of the wiki (string), without trailing slash

										Returns html (string) in which every src="/book/images/..." attribute
										that ends with one of the filenames is replaced by the local image path.
									"""
									images_dir = f'{ STATIC_FOLDER_PATH }/images'

									# create 'images/' when it does not exist yet
									os.makedirs(images_dir, exist_ok=True)

									# download media files
									for filename in images:
										filename = filename.replace(' ', '_') # safe filenames

										# check if the image is already downloaded
										# if not, then download the file
										if not os.path.isfile(f'{ images_dir }/{ filename }'):

											# first we search for the full filename of the image
											# (percent-encoded, so that '&', '+' or non-ASCII characters in the name cannot break the url)
											search = urllib.parse.quote(filename, safe='')
											url = f'{ wiki }/api.php?action=query&list=allimages&aifrom={ search }&format=json'
											with urllib.request.urlopen(url) as response:
												data = json.loads(response.read())

											results = data.get('query', {}).get('allimages', [])
											if results:
												# we select the first search result
												# (assuming that this is the image we are looking for)
												# NOTE(review): the file is stored under the name the API returns, while the
												# "already downloaded" check above uses `filename` — confirm these always agree
												image = results[0]

												# then we download the image
												image_url = image['url']
												image_filename = image['name']
												print('Downloading:', image_filename)
												with urllib.request.urlopen(image_url) as image_response:
													image_data = image_response.read()

												# and we save it as a file
												with open(f'{ images_dir }/{ image_filename }', 'wb') as out:
													out.write(image_data)

												time.sleep(3) # do not overload the server
											else:
												# an empty search result used to crash the whole update with an IndexError
												print('Image not found:', filename)

										# replace src link
										image_path = f'/{ STATIC_FOLDER_PATH }/images/{ filename }' # here the images need to link to the / of the domain, for flask :/// confusing! this breaks the whole idea to still be able to make a local copy of the file

										# [^"]* keeps the match inside one src attribute (a greedy .* could run on
										# into a later <img> on the same line) and re.escape() makes the filename
										# match literally; the function replacement inserts image_path verbatim
										pattern = rf'src="/book/images/[^"]*{ re.escape(filename) }"'
										html = re.sub(pattern, lambda match: f'src="{ image_path }"', html)

									return html


								def clean_up(html):
									"""
										Remove the wiki-only decoration from the HTML of a parsed page.

										html = string (HTML)

										Returns html (string) without the [edit] links, with the internal
										wiki links (href="/book/index.php?title=X") turned into anchors (href="#X")
										and without the &#91; &#93; brackets around footnote numbers.
									"""
									# remove the [edit]
									# brackets and line breaks are excluded between [ and ], so the match
									# cannot swallow the text between two unrelated [...] pairs on one line
									html = re.sub(r'\[[^\[\]\n]*edit[^\[\]\n]*\]', '', html)
									html = re.sub(r'href="/book/index\.php\?title=', 'href="#', html) # remove the internal wiki links
									html = re.sub(r'&#91;(?=\d)', '', html) # remove left footnote bracket [
									html = re.sub(r'(?<=\d)&#93;', '', html) # remove right footnote bracket ]
									return html


								def parse_page(pagename, wiki):
									"""
										Ask the wiki API to parse a page, then download its images
										and clean up the resulting HTML.

										pagename = string
										wiki = base url of the wiki (string), without trailing slash

										Returns html = string (HTML), or None when the API response
										has no 'parse' entry.
									"""
									# percent-encode the page name so that spaces, '&' or non-ASCII characters
									# cannot break the request url; '%' is kept as is, so a name that is
									# already percent-encoded is not encoded a second time
									page = urllib.parse.quote(pagename, safe='%')
									parse = f'{ wiki }/api.php?action=parse&page={ page }&pst=True&format=json'
									data = API_request(parse, pagename)

									if 'parse' not in data:
										return None

									html = data['parse']['text']['*']
									images = data['parse']['images']
									html = download_media(html, images, wiki)
									return clean_up(html)


								def save(html, pagename, publication_unfolded):
									"""
										Render the wrapping templates and write the resulting pages to disk.

										html = string (HTML), only checked for being non-empty: nothing is
											rendered or written when it is empty or None
										pagename = string, passed to the templates as title and used as
											basename of the written files
										publication_unfolded = string (HTML), passed to the templates

										Writes two files into STATIC_FOLDER_PATH:
										<pagename>.html (final page that will be used with PagedJS) and
										<pagename>.inspect.html (extra html page for debugging).
									"""
									if not html:
										return

									templates_dir = f'{ STATIC_FOLDER_PATH }/{ WRAPPING_TEMPLATES_DIR }'

									# (template file, output file) pairs: the final page first, the debug page second
									outputs = [
										('template.html', f'{ pagename }.html'),
										('template.inspect.html', f'{ pagename }.inspect.html'),
									]

									for template_name, output_name in outputs:
										# the encoding is fixed to UTF-8 so that reading and writing
										# do not depend on the locale of the machine
										with open(f'{ templates_dir }/{ template_name }', encoding='utf-8') as template_file:
											template = jinja2.Template(template_file.read())
										rendered = template.render(publication_unfolded=publication_unfolded, title=pagename)

										html_file = f'{ STATIC_FOLDER_PATH }/{ output_name }'
										print('Saving HTML:', html_file)
										with open(html_file, 'w', encoding='utf-8') as out:
											out.write(rendered)


								def update_material_now(pagename, wiki):
									"""
										Get the current version of a page from the wiki.

										pagename = string
										wiki = base url of the wiki (string)

										Returns publication_unfolded = the result of parse_page()
										for this page, handed back unchanged.
									"""
									return parse_page(pagename, wiki)


								# ---


								if __name__ == "__main__":

									# which page to export, and from which wiki
									pagename = 'Unfolded'
									wiki = 'https://possiblebodies.constantvzw.org/book' # remove tail slash '/'

									# download the latest version of the page ...
									publication_unfolded = update_material_now(pagename=pagename, wiki=wiki)

									# ... and save the page to file
									save(html=publication_unfolded, pagename=pagename, publication_unfolded=publication_unfolded)