# volumetric-regimes-book/command-line/update.py

import urllib.request
import os
import re
import json
import jinja2

def API_request(url):
	"""
		Fetch a MediaWiki API url and return the decoded JSON response.

		url = API request url (string)
		data = decoded JSON response (dict), for example for action=parse:
				{ 'parse': {
						'text' : { '*' : '<html>' },
						'images' : [ 'filename', ... ]
					}
				}

		Side effect: the response is also written to '<pagename>.json' in the
		current directory, where pagename is the module-level variable (not an
		argument of this function), so that the API call can be inspected.
	"""
	# the 'with' closes the HTTP connection as soon as the body has been read
	with urllib.request.urlopen(url) as response:
		data = json.loads(response.read())

	# Save response as JSON to be able to inspect API call
	json_file = f'{ pagename }.json'
	print('Saving JSON:', json_file)
	with open(json_file, 'w') as out:
		out.write(json.dumps(data, indent=4))

	return data

def download_media(html, images):
	"""
		Download the images used on a page and point the HTML to the local copies.

		html = string (HTML)
		images = list of filenames (str)

		Returns the HTML (string) in which every src="/book/images/..." that
		ends with one of the filenames is replaced by src="images/<filename>".

		Images that are not yet present in the local 'images/' folder are
		looked up through the wiki API (module-level variable 'wiki') and
		saved under the name the API reports for them.
	"""
	import time
	from urllib.parse import quote

	# make sure 'images/' exists
	os.makedirs('images', exist_ok=True)

	for filename in images:
		filename = filename.replace(' ', '_') # safe filenames
		local_path = f'images/{ filename }'

		# only download the image when it is not there yet
		if not os.path.isfile(local_path):

			# first we search for the full filename of the image
			# (quoted, so that special characters cannot break the request url)
			url = f'{ wiki }/api.php?action=query&list=allimages&aifrom={ quote(filename) }&format=json'
			with urllib.request.urlopen(url) as response:
				data = json.loads(response.read())

			results = data.get('query', {}).get('allimages', [])
			if results:
				# we select the first search result
				# (assuming that this is the image we are looking for)
				image = results[0]

				# then we download the image
				image_filename = image['name']
				print('Downloading:', image_filename)
				with urllib.request.urlopen(image['url']) as image_response:
					image_data = image_response.read()

				# and we save it as a file
				with open(f'images/{ image_filename }', 'wb') as out:
					out.write(image_data)
			else:
				# nothing to download, but keep going with the other images
				print('Not found:', filename)

			time.sleep(3) # do not overload the server

		# replace src link
		# - the filename is escaped, it is a literal name and not a pattern
		# - [^"]* keeps the match inside one src="..." attribute
		# - a function is used as replacement so the path is inserted as-is
		pattern = rf'src="/book/images/[^"]*{ re.escape(filename) }"'
		html = re.sub(pattern, lambda match: f'src="{ local_path }"', html)

	return html

def clean_up(html):
	"""
		Strip the wiki-only interface elements from the parsed HTML.

		html = string (HTML)

		Returns the HTML (string) without the [edit] section links and with
		the internal wiki links (href="/book/index.php?title=...") replaced
		by href="#".
	"""
	# remove the [edit]
	# the match is not allowed to run past another bracket or a line end,
	# so other bracketed text on the same line (like [1]) is kept
	html = re.sub(r'\[[^\[\]\n]*edit[^\[\]\n]*\]', '', html)

	# remove the internal links
	# '.' and '?' are escaped: an unescaped '?' makes the 'p' before it
	# optional and never matches the literal '?' of the url
	html = re.sub(r'href="/book/index\.php\?title=.*?"', 'href="#"', html)
	return html

def parse_page(pagename):
	"""
		Ask the wiki to render a page and prepare the resulting HTML.

		pagename = string
		returns string (HTML), with the media downloaded and the HTML
		cleaned up, or None when the API response has no 'parse' entry
	"""
	request_url = f'{ wiki }/api.php?action=parse&page={ pagename }&pst=True&format=json'
	response = API_request(request_url)

	# no 'parse' entry in the response: there is nothing to work with
	if 'parse' not in response:
		return None

	parsed = response['parse']
	page_html = parsed['text']['*']
	page_images = parsed['images']

	# first fetch the media and relink it, then strip the wiki interface
	return clean_up(download_media(page_html, page_images))

def save(html, pagename):
	"""
		Render the page through the templates and write the results to file.

		html = string (HTML), handed to the templates as 'publication_unfolded'
		pagename = string, used as the title and as the base of the filenames

		Writes '<pagename>.html' (from template.html, the final page that
		will be used with PagedJS) and '<pagename>.debug.html' (from
		template.debug.html, an extra page for debugging).
		Nothing is written when html is empty or None.
	"""
	if not html:
		return

	outputs = [
		('template.html', f'{ pagename }.html'), # final page that will be used with PagedJS
		('template.debug.html', f'{ pagename }.debug.html'), # extra html page for debugging
	]

	for template_name, html_file in outputs:
		with open(template_name, encoding='utf-8') as template_file:
			template = jinja2.Template(template_file.read())

		# render the html that was passed in as argument; it is kept in its
		# own variable so the second template gets the same page content
		rendered = template.render(publication_unfolded=html, title=pagename)

		print('Saving HTML:', html_file)
		with open(html_file, 'w', encoding='utf-8') as out:
			out.write(rendered)

def update_material_now(pagename):
	"""
		Get the current version of a wiki page.

		pagename = string
		returns the result of parse_page(pagename): string (HTML) or None
	"""
	return parse_page(pagename)

# ---

# Base url of the wiki. download_media() and parse_page() read this
# module-level variable to build their '<wiki>/api.php?...' request urls.
wiki = 'https://possiblebodies.constantvzw.org/book' # without a trailing slash '/'
# Name of the wiki page to fetch. API_request() also reads this module-level
# variable to name the JSON file it saves ('<pagename>.json').
pagename = 'Unfolded'

# HTML of the page, or None when the API response had no 'parse' entry
publication_unfolded = update_material_now(pagename) # download the latest version of the page
# writes '<pagename>.html' and '<pagename>.debug.html'; does nothing when the html is None
save(publication_unfolded, pagename) # save the page to file