import urllib.request
import os
import re
import json
import jinja2

STATIC_FOLDER_PATH = './static' # without trailing slash
PUBLIC_STATIC_FOLDER_PATH = '/static' # without trailing slash
TEMPLATES_DIR = None

# This uses a low quality copy of all the images
# (using a folder with the name "images-small",
# which stores a copy of all the images generated with:
# $ mogrify -quality 5% -adaptive-resize 25% -remap pattern:gray50 * )
fast = False


def API_request(url, pagename):
    """Fetch a JSON API response and keep a pretty-printed copy on disk.

    The decoded response is written to
    ``<STATIC_FOLDER_PATH>/<pagename>.json`` so that the API call can be
    inspected afterwards.

    Args:
        url: API request URL (string).
        pagename: Name used for the saved ``.json`` file.

    Returns:
        The decoded JSON response. The original notes sketch the expected
        shape as follows (not validated by this function)::

            data = { 'query':
                'pages' :
                    pageid : {
                        'links' : {
                            '?' : '?'
                            'title' : 'pagename'
                        }
                    }
                }
            }

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    # Close the HTTP response deterministically instead of leaving it to
    # the garbage collector.
    with urllib.request.urlopen(url) as response:
        data = json.loads(response.read())

    # Save response as JSON to be able to inspect API call
    json_file = f'{ STATIC_FOLDER_PATH }/{ pagename }.json'
    print('Saving JSON:', json_file)

    # Make sure the target folder exists; this also covers page names that
    # contain a '/' and therefore map to a sub-folder.
    os.makedirs(os.path.dirname(json_file), exist_ok=True)

    # The context manager closes the file, so no explicit close() is needed.
    with open(json_file, 'w', encoding='utf-8') as out:
        out.write(json.dumps(data, indent=4))

    return data


def download_media(html, images, wiki):
    """
        html = string (HTML)
        images = list of filenames (str)
    """
    # check if 'images/' already exists
    if not os.path.exists(f'{ STATIC_FOLDER_PATH }/images'):
        os.makedirs(f'{ STATIC_FOLDER_PATH }/images')

    # tmp list for filename replacements
    replaced = []

    images.sort()
    images.reverse() # reverse to make sure that 01.png does not override Image01.png in the filename replacements later

    # download media files
    for filename in images:
        filename = filename.replace(' ', '_') # safe filenames

        # check if the image is already downloaded
        # if not, then download the file
        if not os.path.isfile(f'{ STATIC_FOLDER_PATH }/images/{ filename }'):

            # first we search for the full filename of the image
            url = f'{ wiki }/api.php?action=query&list=allimages&aifrom={ filename }&format=json'
            response = urllib.request.urlopen(url).read()
            data = json.loads(response)

            # we select the first search result
            # (assuming that this is the image we are looking for)
            image = data['query']['allimages'][0]

            # then we download the image
            image_url = image['url']
image_filename = image['name'] print('Downloading:', image_filename) image_response = urllib.request.urlopen(image_url).read() # and we save it as a file image_path = f'{ STATIC_FOLDER_PATH }/images/{ image_filename }' out = open(image_path, 'wb') out.write(image_response) out.close() import time time.sleep(3) # do not overload the server # replace src image link (from wiki folder structure to local folder) image_path = f'{ PUBLIC_STATIC_FOLDER_PATH }/images/{ filename }' # here the images need to link to the / of the domain, for flask :/// confusing! this breaks the whole idea to still be able to make a local copy of the file img_path_patterns = [rf'(? { image_path }') # for debugging: each image should have the correct match! html = html.replace(match, image_path) replaced.append(match) # else: # print(' already replaced!') # print('\n------\n') # break # else: # print(' no match!') # print('\n------\n') return html def add_item_inventory_links(html): """ html = string (HTML) """ # THROUGHOUT THE BOOK # Find all references in the text to the item index matches = re.findall(r'\w.*?Item \d\d\d.*?\w\w\w', html) # Dodgy attempt to find unique patterns for each mentioning of Item ### index = {} for match in matches: item_match = re.search(r'Item \d\d\d', match) item = item_match.group() number = item.replace('Item ', '').strip() text = match.replace(f'Item { number }', '') if not number in index: index[number] = [] count = 1 else: count = index[number][-1] + 1 index[number].append(count) item_id = f'ii-{ number }-{ index[number][-1] }' print(f'match: { number } --> { item_id } --> { text }') html = html.replace(match, f'Item { number }{ text }') # IN THE ITEM INDEX # Also add a around the index nr to style it matches = re.findall(r'
  • \d\d\d', html) for match in matches: html = html.replace(match, f'
  • { match }') print("\n-------------\n") print("The following items ('###') appear [#, #, ...] many times in the book:\n") sorted_index = dict(sorted(index.items())) print(sorted_index) print("\n-------------\n") return html def tweaking(html): """ html = string (HTML) """ html = html.replace('References