volumetric-regimes-book/command-line/update.py

152 lines
3.8 KiB
Python

import urllib.request
import os
import re
import json
import jinja2
def API_request(url, json_file=None):
    """
    Fetch an API url, save the JSON response to disk and return it decoded.

    url = API request url (string)
    json_file = filename to save the response to (string, optional);
                when left out, '<pagename>.json' is used, where `pagename`
                is the module-level variable
    data = decoded JSON response (dict), for example:
        { 'query':
            'pages' :
                pageid : {
                    'links' : {
                        '?' : '?'
                        'title' : 'pagename'
                    }
                }
            }
        }
    """
    # the context manager closes the connection as soon as the body is read
    with urllib.request.urlopen(url) as response:
        data = json.loads(response.read())

    # Save response as JSON to be able to inspect API call
    if json_file is None:
        json_file = f'{ pagename }.json'
    print('Saving JSON:', json_file)
    with open(json_file, 'w') as out:
        out.write(json.dumps(data, indent=4))

    return data
def download_media(html, images):
    """
    Download the images of a page into 'images/' and point the HTML to
    these local copies.

    Images that are already present in 'images/' are not downloaded again.
    The module-level variable `wiki` is used as the base url of the API.

    html = string (HTML)
    images = list of filenames (str)
    returns html = string (HTML) with the src attributes rewritten
    """
    import time
    import urllib.parse

    # make sure 'images/' exists
    os.makedirs('images', exist_ok=True)

    # download media files
    for filename in images:
        filename = filename.replace(' ', '_')  # safe filenames
        local_path = f'images/{ filename }'

        # check if the image is already downloaded
        # if not, then download the file
        if not os.path.isfile(local_path):

            # first we search for the full filename of the image
            # (quoted, so that non-ASCII or reserved characters do not break the request)
            query = urllib.parse.quote(filename)
            url = f'{ wiki }/api.php?action=query&list=allimages&aifrom={ query }&format=json'
            with urllib.request.urlopen(url) as response:
                data = json.loads(response.read())

            # we select the first search result
            # (assuming that this is the image we are looking for)
            results = data['query']['allimages']
            if not results:
                # nothing to download: leave the src link of this image untouched
                print('Not found:', filename)
                continue
            image = results[0]

            # then we download the image
            image_url = image['url']
            image_filename = image['name']
            print('Downloading:', image_filename)
            with urllib.request.urlopen(image_url) as image_response:
                image_data = image_response.read()

            # and we save it as a file
            with open(f'images/{ image_filename }', 'wb') as out:
                out.write(image_data)

            time.sleep(3)  # do not overload the server

        # replace src link
        # - re.escape: the filename is matched literally ('.', '(', ')' ... are regex syntax)
        # - [^"]*: stay inside one src="..." attribute, so other images on the same line are kept
        # - function replacement: the local path is inserted as-is, without backslash processing
        pattern = rf'src="/book/images/[^"]*{ re.escape(filename) }"'
        replacement = f'src="{ local_path }"'
        html = re.sub(pattern, lambda match: replacement, html)

    return html
def clean_up(html):
    """
    Remove wiki interface leftovers from the HTML of a page: the [edit]
    links and the links to other wiki pages.

    html = string (HTML)
    returns html = string (HTML)
    """
    # remove the [edit]
    # NOTE: '.*' is greedy, this removes everything from the first '[' to the
    # last ']' on a line that contains 'edit'
    html = re.sub(r'\[.*edit.*\]', '', html)

    # remove the internal links
    # '.' and '?' are escaped to match them literally: an unescaped 'php?'
    # means "ph + optional p", which never matches 'index.php?title='
    html = re.sub(r'href="/book/index\.php\?title=.*?"', 'href="#"', html)

    return html
def parse_page(pagename):
    """
    Ask the wiki API to parse a page and return its HTML, with the images
    downloaded and the wiki leftovers cleaned up.

    The module-level variable `wiki` is used as the base url of the API.

    pagename = string
    html = string (HTML), or None when the response has no 'parse' entry
    """
    parse = f'{ wiki }/api.php?action=parse&page={ pagename }&pst=True&format=json'
    data = API_request(parse)
    # print(json.dumps(data, indent=4))

    # no 'parse' entry in the response: there is nothing to return
    if 'parse' not in data:
        return None

    parsed = data['parse']
    page_html = download_media(parsed['text']['*'], parsed['images'])
    return clean_up(page_html)
def save(html, pagename):
    """
    Render the page through the templates and save the results as HTML files.

    Two files are written: '<pagename>.html' from 'template.html' (the final
    page that will be used with PagedJS) and '<pagename>.debug.html' from
    'template.debug.html' (extra page for debugging). Both templates receive
    the variables `publication_unfolded` and `title`.

    html = string (HTML); nothing is saved when it is empty or None
    pagename = string
    """
    if not html:
        return

    # (template, output file)
    outputs = [
        ('template.html', f'{ pagename }.html'),
        ('template.debug.html', f'{ pagename }.debug.html'),
    ]

    for template_name, html_file in outputs:
        with open(template_name, encoding='utf-8') as template_file:
            template = jinja2.Template(template_file.read())

        # render the html that was passed in, not a module-level variable
        rendered = template.render(publication_unfolded=html, title=pagename)

        print('Saving HTML:', html_file)
        # explicit utf-8: the page can contain characters outside the locale encoding
        with open(html_file, 'w', encoding='utf-8') as out:
            out.write(rendered)
def update_material_now(pagename):
    """
    Get the current version of a page, by handing the request over to
    parse_page().

    pagename = string
    returns publication_unfolded = whatever parse_page() returns for this page
    """
    return parse_page(pagename)
# ---
# Script part: these statements run as soon as the file is executed.
# `wiki` is read as a global by parse_page() and download_media(),
# `pagename` is read as a global by API_request() to name the saved JSON file.
wiki = 'https://possiblebodies.constantvzw.org/book' # base url of the wiki, without a trailing slash '/'
pagename = 'Unfolded' # name of the wiki page to export
publication_unfolded = update_material_now(pagename) # download the latest version of the page (None when the API response has no 'parse' entry)
save(publication_unfolded, pagename) # save the page to file (nothing is written when the page is empty or None)