volumetric-regimes-book/command-line/update.py

230 lines
7.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import urllib.request
import os
import re
import json
import jinja2
# Folder in which the generated files are written: the saved API responses
# (<pagename>.json), the rendered pages (<pagename>.html) and the images/ folder.
STATIC_FOLDER_PATH = '.' # without trailing slash
# Prefix used for the rewritten <img src="..."> links in the HTML.
PUBLIC_STATIC_FOLDER_PATH = '.' # without trailing slash
# Folder with the Jinja2 wrapping templates; it is joined onto STATIC_FOLDER_PATH.
WRAPPING_TEMPLATES_DIR = './templates'
# When fast is True, fast_loader() rewrites '/images/' into '/images-small/',
# so that a low quality copy of all the images is used.
# The folder with the name "images-small" stores a copy of all the images,
# generated with:
# $ mogrify -quality 5% -adaptive-resize 25% -remap pattern:gray50 *
fast = False
def API_request(url, pagename):
    """
    Request a wiki API url, save the decoded response as a JSON file and return it.

    url = API request url (string)
    pagename = string, used as the name of the saved JSON file
    returns data = the decoded JSON response, for example:
        data = { 'query':
                    'pages' :
                        pageid : {
                            'links' : {
                                '?' : '?'
                                'title' : 'pagename'
                            }
                        }
                    }
                }
    """
    # The with-block closes the HTTP response as soon as it has been read,
    # instead of leaving the connection open until garbage collection.
    with urllib.request.urlopen(url) as response:
        data = json.loads(response.read())

    # Save response as JSON to be able to inspect API call
    json_file = f'{ STATIC_FOLDER_PATH }/{ pagename }.json'
    print('Saving JSON:', json_file)
    with open(json_file, 'w') as out:
        # no explicit close() needed: the with-block closes the file
        out.write(json.dumps(data, indent=4))

    return data
def download_media(html, images, wiki):
    """
    Download the images of a page and point their src links to the local copies.

    html = string (HTML)
    images = list of filenames (str)
    wiki = url of the wiki, without trailing slash (string)
    returns html = string (HTML) with the rewritten src links

    An image that cannot be found on the wiki is reported and skipped,
    its src link is still rewritten like all the others.
    """
    # Function-scope imports, so this function does not depend on changes
    # to the import block at the top of the file.
    import time
    from urllib.parse import quote

    # make sure that 'images/' exists
    images_dir = f'{ STATIC_FOLDER_PATH }/images'
    os.makedirs(images_dir, exist_ok=True)

    # download media files
    for filename in images:
        filename = filename.replace(' ', '_') # safe filenames

        # check if the image is already downloaded
        # if not, then download the file
        if not os.path.isfile(f'{ images_dir }/{ filename }'):

            # first we search for the full filename of the image
            # (the filename is percent-encoded, otherwise characters such as
            # '&', '#' or non-ASCII letters break the request)
            url = f'{ wiki }/api.php?action=query&list=allimages&aifrom={ quote(filename) }&format=json'
            with urllib.request.urlopen(url) as response:
                data = json.loads(response.read())

            # we select the first search result
            # (assuming that this is the image we are looking for)
            results = data.get('query', {}).get('allimages', [])
            if results:
                image = results[0]

                # then we download the image
                image_url = image['url']
                image_filename = image['name']
                print('Downloading:', image_filename)
                with urllib.request.urlopen(image_url) as image_response:
                    image_data = image_response.read()

                # and we save it as a file
                image_path = f'{ images_dir }/{ image_filename }'
                with open(image_path, 'wb') as out:
                    out.write(image_data)
            else:
                # an empty search result used to stop the whole run with an IndexError
                print('Image not found on the wiki:', filename)

            time.sleep(3) # do not overload the server

        # replace src link
        # here the images need to link to the / of the domain, for flask :/// confusing!
        # this breaks the whole idea to still be able to make a local copy of the file
        image_path = f'{ PUBLIC_STATIC_FOLDER_PATH }/images/{ filename }'
        new_src = f'src="{ image_path }"'

        # The filename is escaped, so that characters like '.', '(' and ')' are
        # matched literally. [^"] keeps a match inside one src attribute,
        # where .*? could run on into the next <img> on the same line.
        escaped = re.escape(filename)
        thumbnail_src = rf'src="/book/images/[^"]*?px-{ escaped }"'
        original_src = rf'src="/book/images/[^"]*?{ escaped }"'

        # The replacement is given as a function, so that a backslash in the
        # filename is not read as a regex escape sequence.
        html, replaced = re.subn(thumbnail_src, lambda _match: new_src, html)
        if not replaced:
            # no thumbnail link found, the page links to the original image
            html = re.sub(original_src, lambda _match: new_src, html)

    return html
def add_item_inventory_links(html):
    """
    Turn the references to the item index ("Item 001") into links to #Item_Index.

    html = string (HTML)
    returns new_html = string (HTML)

    The HTML is split into sentences with nltk. In every sentence that refers
    to an item, the reference is replaced with a link that gets the id
    ii-<number>-<count>, where <count> counts the sentences in which this item
    was referenced so far. The sentences are joined again with a single space.
    """
    from nltk.tokenize import sent_tokenize

    # Find all references in the text to the item index.
    # Every reference is kept once (in order of appearance): a repeated
    # reference does not need to be checked again for every sentence.
    matches = list(dict.fromkeys(re.findall(r'Item \d\d\d', html)))

    index = {} # item number -> number of sentences that referred to it so far
    lines = []
    for line in sent_tokenize(html):
        for match in matches:
            if match in line:
                number = match.replace('Item ', '').strip()
                index[number] = index.get(number, 0) + 1
                item_id = f'ii-{ number }-{ index[number] }'
                line = line.replace(match, f'Item <a id="{ item_id }" href="#Item_Index">{ number }</a>')

        # the line is pushed back to the new_html
        lines.append(line + ' ')
    new_html = ''.join(lines)

    # Also add a <span> around the index nr to style it.
    # Only the number goes into the span: wrapping the whole match would nest
    # a second <li> inside the span, and would wrap it again for every
    # repeated number.
    new_html = re.sub(r'<li>(\d\d\d)', r'<li><span class="item_nr">\1</span>', new_html)

    return new_html
def tweaking(html):
    """
    Patch a few specific anchor links (according to the original notes these
    are the anchor links in the TOC).

    html = string (HTML)
    returns html = string (HTML)
    """
    anchor_fixes = (
        # the first letter of the anchor is changed to lowercase
        (
            '<a href="#X,_y,_z_(4_filmstills)"',
            '<a href="#x,_y,_z_(4_filmstills)"',
        ),
        # the percent-encoded quotation marks (%E2%80%98 and %E2%80%99) are removed
        (
            '<a href="#Rehearsal_as_the_%E2%80%98Other%E2%80%99_to_Hypercomputation"',
            '<a href="#Rehearsal_as_the_Other_to_Hypercomputation"',
        ),
        # the percent-encoded apostrophe (%E2%80%99) is removed
        (
            '<a href="#We_hardly_encounter_anything_that_didn%E2%80%99t_really_matter"',
            '<a href="#We_hardly_encounter_anything_that_didnt_really_matter"',
        ),
    )
    for broken_anchor, fixed_anchor in anchor_fixes:
        html = html.replace(broken_anchor, fixed_anchor)
    return html
def clean_up(html):
    """
    Strip wiki interface leftovers from the HTML.

    html = string (HTML)
    returns html = string (HTML)
    """
    # The substitutions are applied one after the other, in this order.
    # NOTE(review): the first pattern is greedy, on a single line it removes
    # everything from the first '[' up to the last ']' as soon as "edit"
    # appears in between. Check that this never hits brackets in the text.
    substitutions = (
        (r'\[.*edit.*\]', ''), # remove the [edit]
        (r'href="/book/index.php\?title=', 'href="#'), # internal wiki links become anchor links
        (r'&#91;(?=\d)', ''), # remove left footnote bracket [
        (r'(?<=\d)&#93;', ''), # remove right footnote bracket ]
    )
    for pattern, replacement in substitutions:
        html = re.sub(pattern, replacement, html)
    return html
def fast_loader(html):
    """
    Switch the image links to the low quality copies when the fast mode is on.

    html = string (HTML)
    returns html = string (HTML), untouched when the module-level flag fast is not True
    """
    # this stays a comparison with True: only a value that equals True
    # switches the fast mode on
    if fast != True:
        return html

    html = html.replace('/images/', '/images-small/')
    print('--- rendered in FAST mode ---')
    return html
def parse_page(pagename, wiki):
    """
    Request the rendered HTML of a wiki page and prepare it for the publication.

    pagename = string
    wiki = url of the wiki, without trailing slash (string)
    returns html = string (HTML), or None when the API response has no 'parse' key
    """
    request_url = f'{ wiki }/api.php?action=parse&page={ pagename }&pst=True&format=json'
    response = API_request(request_url, pagename)

    # nothing to work with when the API did not return a parsed page
    if 'parse' not in response:
        return None

    parsed = response['parse']

    # first the images are downloaded and their src links rewritten ...
    html = download_media(parsed['text']['*'], parsed['images'], wiki)

    # ... then the HTML goes through the other steps, in this order
    for step in (clean_up, add_item_inventory_links, tweaking, fast_loader):
        html = step(html)

    return html
def save(html, pagename, publication_unfolded):
    """
    Render the publication into the wrapping templates and save the results.

    html = string (HTML), only used as a switch: nothing is saved when it is empty or None
    pagename = string, used as the title and as the name of the saved files
    publication_unfolded = string (HTML), handed to the templates

    Two files are written into STATIC_FOLDER_PATH:
    <pagename>.html and <pagename>.inspect.html
    """
    if not html:
        return

    outputs = (
        # final page that will be used with PagedJS
        ('template.html', f'{ pagename }.html'),
        # extra html page for debugging
        ('template.inspect.html', f'{ pagename }.inspect.html'),
    )

    for template_name, output_name in outputs:
        # the with-block closes the template file after reading
        # (the file object of a bare open(...).read() is never closed explicitly)
        template_path = f'{ STATIC_FOLDER_PATH }/{ WRAPPING_TEMPLATES_DIR }/{ template_name }'
        with open(template_path) as template_file:
            template = jinja2.Template(template_file.read())

        rendered = template.render(publication_unfolded=publication_unfolded, title=pagename)

        html_file = f'{ STATIC_FOLDER_PATH }/{ output_name }'
        print('Saving HTML:', html_file)
        with open(html_file, 'w') as out:
            # no explicit close() needed: the with-block closes the file
            out.write(rendered)
def update_material_now(pagename, wiki):
    """
    Get the current version of a page through parse_page().

    pagename = string
    wiki = url of the wiki, without trailing slash (string)
    returns publication_unfolded = string (HTML), or None when parse_page() returns None
    """
    return parse_page(pagename, wiki)
# ---
if __name__ == "__main__":
    wiki = 'https://possiblebodies.constantvzw.org/book' # remove tail slash '/'
    pagename = 'Unfolded'

    # download the latest version of the page
    publication_unfolded = update_material_now(pagename=pagename, wiki=wiki)

    # save the page to file
    # (the same HTML is passed twice: as the html argument it decides whether
    # anything is saved, as publication_unfolded it is rendered into the templates)
    save(
        html=publication_unfolded,
        pagename=pagename,
        publication_unfolded=publication_unfolded,
    )