import json
import os
import re
import time
import urllib.parse
import urllib.request

import jinja2

STATIC_FOLDER_PATH = './static'  # without trailing slash


def API_request(url, pagename):
    """Fetch *url*, decode the JSON response and keep a copy on disk.

    url = API request url (string)
    pagename = string, used as the base name of the saved JSON file

    Returns the decoded JSON. Its shape depends on the API call; parse_page()
    reads data['parse']['text']['*'] and data['parse']['images'] from it.
    """
    with urllib.request.urlopen(url) as response:
        data = json.loads(response.read())

    # Save response as JSON to be able to inspect the API call
    json_file = f'{STATIC_FOLDER_PATH}/{pagename}.json'
    print('Saving JSON:', json_file)
    with open(json_file, 'w', encoding='utf-8') as out:
        out.write(json.dumps(data, indent=4))

    return data


def _download_image(filename, wiki):
    """Look *filename* up on the wiki and store it in the local images folder.

    Returns True when an image was saved, False when the lookup had no result.
    """
    # the filename goes into a query string, so it has to be URL-encoded
    query = urllib.parse.quote(filename, safe='')
    url = f'{wiki}/api.php?action=query&list=allimages&aifrom={query}&format=json'
    with urllib.request.urlopen(url) as response:
        data = json.loads(response.read())

    results = data.get('query', {}).get('allimages', [])
    if not results:
        print('No image found for:', filename)
        return False

    # we select the first search result
    # (assuming that this is the image we are looking for)
    image = results[0]
    image_url = image['url']
    image_filename = image['name']
    print('Downloading:', image_filename)
    with urllib.request.urlopen(image_url) as response:
        image_data = response.read()

    # NOTE(review): the file is stored under the name the API reports, while
    # download_media() checks for the requested name -- confirm they always match.
    image_path = f'{STATIC_FOLDER_PATH}/images/{image_filename}'
    with open(image_path, 'wb') as out:
        out.write(image_data)

    return True


def download_media(html, images, wiki):
    """Download the images of a page and point their src attributes to the local copies.

    html = string (HTML)
    images = list of filenames (str)
    wiki = base url of the wiki (string, without trailing slash)

    Returns the HTML with the matching src="/book/images/..." attributes rewritten.
    """
    images_dir = f'{STATIC_FOLDER_PATH}/images'
    os.makedirs(images_dir, exist_ok=True)

    for filename in images:
        filename = filename.replace(' ', '_')  # safe filenames

        # only download the file when there is no local copy yet
        if not os.path.isfile(f'{images_dir}/{filename}'):
            downloaded = _download_image(filename, wiki)
            time.sleep(3)  # do not overload the server
            if not downloaded:
                # nothing was saved, so keep the original (remote) src
                continue

        # replace src link
        # here the images need to link to the / of the domain, for flask;
        # this breaks the idea of still being able to make a local copy of the file
        image_path = f'/{STATIC_FOLDER_PATH}/images/{filename}'
        replacement = f'src="{image_path}"'
        # [^"]* keeps the match inside one attribute value and re.escape()
        # makes sure the filename is matched literally
        pattern = rf'src="/book/images/[^"]*{re.escape(filename)}"'
        # a callable replacement is inserted as-is (no backslash processing)
        html = re.sub(pattern, lambda _match: replacement, html)

    return html


def clean_up(html):
    """Strip wiki-specific leftovers from the HTML.

    html = string (HTML)
    """
    # remove the [edit]
    # NOTE: the pattern is greedy, it removes everything between the first '['
    # and the last ']' of a line that has 'edit' in between
    html = re.sub(r'\[.*edit.*\]', '', html)
    # turn the internal wiki links into in-page anchors
    html = re.sub(r'href="/book/index.php\?title=', 'href="#', html)
    return html


def parse_page(pagename, wiki):
    """Request the parsed HTML of a wiki page and prepare it for local use.

    pagename = string
    wiki = base url of the wiki (string, without trailing slash)

    Returns a string (HTML), or None when the response has no 'parse' key.
    """
    page = urllib.parse.quote(pagename, safe='')
    parse = f'{wiki}/api.php?action=parse&page={page}&pst=True&format=json'
    data = API_request(parse, pagename)
    if 'parse' not in data:
        return None

    html = data['parse']['text']['*']
    images = data['parse']['images']
    html = download_media(html, images, wiki)
    return clean_up(html)


def _render_to_file(template_name, html_file, publication_unfolded, pagename):
    """Render one template from the 'local' folder and write the result to html_file."""
    template_path = f'{STATIC_FOLDER_PATH}/local/{template_name}'
    with open(template_path, encoding='utf-8') as template_file:
        template = jinja2.Template(template_file.read())
    rendered = template.render(publication_unfolded=publication_unfolded, title=pagename)

    print('Saving HTML:', html_file)
    with open(html_file, 'w', encoding='utf-8') as out:
        out.write(rendered)


def save(html, pagename, publication_unfolded):
    """Render the publication into the HTML files of the static folder.

    html = string (HTML); nothing is written when it is empty or None
    pagename = string
    publication_unfolded = string (HTML) that is handed to the templates
    """
    if not html:
        return

    # save final page that will be used with PagedJS
    _render_to_file(
        'template.html',
        f'{STATIC_FOLDER_PATH}/{pagename}.html',
        publication_unfolded,
        pagename,
    )

    # save extra html page for debugging
    _render_to_file(
        'template.inspect.html',
        f'{STATIC_FOLDER_PATH}/{pagename}.inspect.html',
        publication_unfolded,
        pagename,
    )


def update_material_now(pagename, wiki):
    """Download the latest version of a page.

    pagename = string
    publication_unfolded = string (HTML), or None when the page could not be parsed
    """
    publication_unfolded = parse_page(pagename, wiki)
    return publication_unfolded


# ---

if __name__ == "__main__":
    wiki = 'https://possiblebodies.constantvzw.org/book'  # remove tail slash '/'
    pagename = 'Unfolded'

    publication_unfolded = update_material_now(pagename, wiki)  # download the latest version of the page
    save(publication_unfolded, pagename, publication_unfolded)  # save the page to file