wiki-to-print/command-line/update.py


								import urllib.request

								import os

								import re

								import json

								import jinja2


								# Folder this script reads from and writes to: the saved API responses (.json),
								# the 'images' folder, the templates and the rendered .html files.
								STATIC_FOLDER_PATH = '.' # without trailing slash

								# Prefix used for the rewritten image src attributes in the HTML.
								PUBLIC_STATIC_FOLDER_PATH = '.' # without trailing slash

								# Location of the templates, appended to STATIC_FOLDER_PATH when a template is opened.
								TEMPLATES_DIR = './templates'


								# FAST mode: when enabled, fast_loader() points the HTML to a low quality copy of all the images
								# (using a folder with the name "images-small",
								# which stores a copy of all the images generated with:
								# $ mogrify -quality 5% -adaptive-resize 25% -remap pattern:gray50 * )
								fast = False


								def API_request(url, pagename):
									"""
										Request a URL of the wiki API and keep a copy of the response on disk.

										url = API request url (string)
										pagename = string, used for the name of the saved JSON file

										The decoded response is saved as
										<STATIC_FOLDER_PATH>/<pagename>.json
										to be able to inspect the API call afterwards.

										Returns the decoded JSON response (data).
										The structure of data depends on the API action in the url.
									"""
									# the response is closed as soon as its body has been read
									with urllib.request.urlopen(url) as response:
										data = json.loads(response.read())

									# Save response as JSON to be able to inspect API call
									json_file = f'{ STATIC_FOLDER_PATH }/{ pagename }.json'
									print('Saving JSON:', json_file)
									with open(json_file, 'w') as out:
										# the with-statement closes the file, no explicit close() needed
										out.write(json.dumps(data, indent=4))

									return data


								def download_media(html, images, wiki):
									"""
										Download the images of a page and point the HTML to the local copies.

										html = string (HTML)
										images = list of filenames (str)
										wiki = URL of the wiki (string), without trailing slash

										Images that are not stored in <STATIC_FOLDER_PATH>/images yet
										are looked up through the wiki API and downloaded.
										For every image, the matching src attributes in the HTML are
										rewritten to <PUBLIC_STATIC_FOLDER_PATH>/images/<filename>.

										Returns the HTML (string) with the rewritten src attributes.
									"""
									import time

									# make sure 'images/' exists
									os.makedirs(f'{ STATIC_FOLDER_PATH }/images', exist_ok=True)

									# download media files
									for filename in images:
										filename = filename.replace(' ', '_') # safe filenames

										# check if the image is already downloaded
										# if not, then download the file
										if not os.path.isfile(f'{ STATIC_FOLDER_PATH }/images/{ filename }'):
											_download_image(filename, wiki)
											time.sleep(3) # do not overload the server

										# replace src link
										# (here the images need to link to the / of the domain, for flask;
										# this makes it harder to keep a working local copy of the file)
										image_path = f'{ PUBLIC_STATIC_FOLDER_PATH }/images/{ filename }'
										html = _rewrite_image_src(html, filename, image_path)

									return html


								def _download_image(filename, wiki):
									"""
										Look up filename through the wiki API and save the image in 'images/'.

										Nothing is saved when the API does not return any search result.
									"""
									from urllib.parse import quote

									# first we search for the full filename of the image
									# (the filename is percent-encoded, otherwise characters
									# such as '&', '#' or non-ASCII letters break the request)
									url = f'{ wiki }/api.php?action=query&list=allimages&aifrom={ quote(filename) }&format=json'
									with urllib.request.urlopen(url) as response:
										data = json.loads(response.read())

									results = data.get('query', {}).get('allimages', [])
									if not results:
										print('No image found for:', filename)
										return

									# we select the first search result
									# (assuming that this is the image we are looking for)
									image = results[0]

									# then we download the image
									image_url = image['url']
									image_filename = image['name']
									print('Downloading:', image_filename)
									with urllib.request.urlopen(image_url) as image_response:
										image_data = image_response.read()

									# and we save it as a file
									image_path = f'{ STATIC_FOLDER_PATH }/images/{ image_filename }'
									with open(image_path, 'wb') as out:
										out.write(image_data)


								def _rewrite_image_src(html, filename, image_path):
									"""
										Replace the src attributes that point to filename with image_path.
									"""
									# the filename is escaped, it is text and not a regex ('.', '(', '+' ...)
									name = re.escape(filename)
									# a function is used as replacement, so image_path is inserted literally
									new_src = lambda match: f'src="{ image_path }"'

									# [^"] keeps a match inside one src attribute, so it can never
									# run over another image that is written on the same line

									# thumbnails ("...px-<filename>") first
									html, count = re.subn(rf'src="/images/[^"]*?px-{ name }"', new_src, html)
									if not count:
										# otherwise the links to the full size image
										html = re.sub(rf'src="/images/[^"]*?{ name }"', new_src, html)

									return html


								def add_item_inventory_links(html):
									"""
										Turn every "Item NNN" reference in the text into a link to the item index.

										html = string (HTML)

										The HTML is split into sentences (with nltk). Every sentence that
										mentions "Item NNN" gets an anchor with the id "ii-NNN-<count>",
										where <count> is the number of sentences so far that mention this item.
										The sentences are joined again, each followed by a single space.

										After that, each three digit number that directly follows a <li>
										is wrapped in <span class="item_nr"> to be able to style it.

										Returns the new HTML (string).
									"""
									from nltk.tokenize import sent_tokenize

									# Find all references in the text to the item index
									# (every reference is only needed once, in the order of first appearance)
									references = list(dict.fromkeys(re.findall(r'Item \d\d\d', html)))

									index = {} # item number -> number of sentences that mentioned it so far
									lines = []
									for line in sent_tokenize(html):
										for reference in references:
											if reference in line:
												number = reference.replace('Item ', '').strip()
												index[number] = index.get(number, 0) + 1
												item_id = f'ii-{ number }-{ index[number] }'
												line = line.replace(reference, f'Item <a id="{ item_id }" href="#Item_Index">{ number }</a>')

										# the line is pushed back to the new_html
										lines.append(line + ' ')

									new_html = ''.join(lines)

									# Also add a <span> around the index nr to style it
									# (only the number goes inside the span, the <li> is not repeated,
									# and every <li> is wrapped exactly once)
									new_html = re.sub(r'<li>(\d\d\d)', r'<li><span class="item_nr">\1</span>', new_html)

									return new_html


								def clean_up(html):
									"""
										Remove wiki specific leftovers from the HTML of a page.

										html = string (HTML)

										Returns the cleaned HTML (string).
									"""
									# (pattern, replacement) pairs, applied one after the other
									substitutions = (
										(r'\[.*edit.*\]', ''), # remove the [edit]
										(r'href="/index.php\?title=', 'href="#'), # turn the internal wiki links into anchors
										(r'&#91;(?=\d)', ''), # remove left footnote bracket [
										(r'(?<=\d)&#93;', ''), # remove right footnote bracket ]
									)
									for pattern, replacement in substitutions:
										html = re.sub(pattern, replacement, html)
									return html


								def fast_loader(html):
									"""
										Point the HTML to the low quality images when FAST mode is switched on.

										html = string (HTML)

										When the module-level "fast" flag is set, every '/images/' in the HTML
										is replaced with '/images-small/'. Otherwise the HTML is returned as it is.

										Returns the HTML (string).
									"""
									if not fast:
										return html

									html = html.replace('/images/', '/images-small/')
									print('--- rendered in FAST mode ---')

									return html


								def parse_page(pagename, wiki):
									"""
										Request the HTML of a wiki page and prepare it for the publication.

										pagename = string
										wiki = URL of the wiki (string), without trailing slash

										The page is requested with action=parse, after which the images
										are downloaded and the HTML is cleaned up and linked to the item index.

										Returns the HTML (string),
										or None when the API response does not have a 'parse' result.
									"""
									from urllib.parse import quote

									# percent-encode the page name: spaces or non-ASCII characters
									# make the request fail, '&' or '#' would cut the name short
									# ('%' is left alone, so a name that is encoded already is not encoded twice)
									page = quote(pagename, safe='/:%')
									parse = f'{ wiki }/api.php?action=parse&page={ page }&pst=True&format=json'

									# the JSON copy of the response is saved under the plain page name
									data = API_request(parse, pagename)
									# print(json.dumps(data, indent=4))
									if 'parse' not in data:
										return None

									html = data['parse']['text']['*']
									images = data['parse']['images']
									html = download_media(html, images, wiki)
									html = clean_up(html)
									html = add_item_inventory_links(html)
									html = fast_loader(html)

									return html


								def save(html, pagename):
									"""
										Save the page to file.

										html = string (HTML)
										pagename = string

										Command-line (this file is run as a script):
										the HTML is rendered with template.html and template.inspect.html
										and saved as <pagename>.html and <pagename>.inspect.html

										Flask application (this file is imported):
										the HTML is saved as it is (without <head>) to Unfolded.html

										Nothing is written when html is None.
									"""
									if html is None:
										# parse_page() returns None when the page could not be parsed,
										# in that case the files that were saved before are left untouched
										print('Nothing to save for:', pagename)
										return

									if __name__ == "__main__":
										# command-line

										# 'html': the final page that will be used with PagedJS
										# 'inspect.html': extra html page for debugging (CLI only)
										for suffix in ('html', 'inspect.html'):
											with open(f'{ STATIC_FOLDER_PATH }/{ TEMPLATES_DIR }/template.{ suffix }') as template_file:
												template = jinja2.Template(template_file.read())
											doc = template.render(publication_unfolded=html, title=pagename)

											html_file = f'{ STATIC_FOLDER_PATH }/{ pagename }.{ suffix }'
											print('Saving HTML:', html_file)
											with open(html_file, 'w') as out:
												out.write(doc)

									else:
										# Flask application

										with open(f'{ STATIC_FOLDER_PATH }/Unfolded.html', 'w') as out:
											out.write(html) # save the html to a file (without <head>)


								def update_material_now(pagename, wiki):
									"""
										Get the current version of a page from the wiki.

										pagename = string
										wiki = URL of the wiki (string)

										Returns the result of parse_page() for this page.
									"""
									return parse_page(pagename, wiki)


								# ---


								if __name__ == "__main__":
									# Command-line use: update one page and save it to file.
									pagename = 'Unfolded'
									wiki = 'https://example.com/wiki' # no tail slash '/'

									# download the latest version of the page and save the page to file
									save(update_material_now(pagename, wiki), pagename)