You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
226 lines
6.6 KiB
226 lines
6.6 KiB
import urllib.request
|
|
import os
|
|
import re
|
|
import json
|
|
import jinja2
|
|
|
|
# Filesystem locations used by this module (all without trailing slash).
STATIC_FOLDER_PATH = '.'  # JSON dumps, downloaded images and rendered HTML are written below this folder
PUBLIC_STATIC_FOLDER_PATH = '.'  # prefix of the image `src` attributes written into the HTML
TEMPLATES_DIR = './templates'  # resolved relative to STATIC_FOLDER_PATH

# When `fast` is True the HTML is pointed at a low quality copy of all the
# images (using a folder with the name "images-small", which stores a copy
# of all the images generated with:
#   $ mogrify -quality 5% -adaptive-resize 25% -remap pattern:gray50 * )
fast = False
|
|
|
|
def API_request(url, pagename):
    """
    Fetch `url`, decode the JSON response and keep a copy of it on disk.

    url = API request url (string)
    pagename = string, used as the basename of the saved JSON file

    The decoded response is written (indented) to
    `STATIC_FOLDER_PATH/<pagename>.json` so the API call can be inspected
    afterwards, and is then returned to the caller.

    Sketch of the response of a `query` request, kept from the original
    notes (other API actions return a different structure):

    data = { 'query':
                'pages' :
                    pageid : {
                        'links' : {
                            '?' : '?'
                            'title' : 'pagename'
                        }
                    }
                }
            }
    """
    # The context manager closes the HTTP connection once the body is read
    with urllib.request.urlopen(url) as response:
        data = json.loads(response.read())

    # Save response as JSON to be able to inspect API call
    json_file = f'{ STATIC_FOLDER_PATH }/{ pagename }.json'
    print('Saving JSON:', json_file)
    with open(json_file, 'w') as out:
        out.write(json.dumps(data, indent=4))

    return data
|
|
|
|
def download_media(html, images, wiki):
    """
    Download the images of a page and point the HTML at the local copies.

    html = string (HTML)
    images = list of filenames (str)
    wiki = base url of the wiki (string), without trailing slash

    Images that are not yet present in `STATIC_FOLDER_PATH/images` are
    looked up through the wiki API and downloaded. For every image the
    matching `src="/images/..."` attribute in the HTML is rewritten to
    `PUBLIC_STATIC_FOLDER_PATH/images/<filename>`.

    Returns the updated HTML string.
    """
    import time
    from urllib.parse import quote

    images_dir = f'{ STATIC_FOLDER_PATH }/images'
    os.makedirs(images_dir, exist_ok=True)

    # download media files
    for filename in images:
        filename = filename.replace(' ', '_')  # safe filenames

        # check if the image is already downloaded
        # if not, then download the file
        if not os.path.isfile(f'{ images_dir }/{ filename }'):

            # first we search for the full filename of the image
            # (the name is quoted so that special characters survive in the URL)
            url = f'{ wiki }/api.php?action=query&list=allimages&aifrom={ quote(filename, safe="") }&format=json'
            with urllib.request.urlopen(url) as response:
                data = json.loads(response.read())

            # we select the first search result
            # (assuming that this is the image we are looking for)
            results = data['query']['allimages']
            if not results:
                # nothing to download: leave the HTML for this image untouched
                print('No image found for:', filename)
                continue
            image = results[0]

            # then we download the image
            image_url = image['url']
            image_filename = image['name']
            print('Downloading:', image_filename)
            with urllib.request.urlopen(image_url) as image_response:
                image_data = image_response.read()

            # and we save it as a file
            # NOTE(review): the file is stored under the name the API returned,
            # while the check above uses `filename`; if the two ever differ
            # the image is downloaded again on every run.
            with open(f'{ images_dir }/{ image_filename }', 'wb') as out:
                out.write(image_data)

            time.sleep(3)  # do not overload the server

        # replace src link
        # NOTE: for the Flask application the images need to link to the root
        # of the domain, which conflicts with keeping a working local copy.
        image_path = f'{ PUBLIC_STATIC_FOLDER_PATH }/images/{ filename }'
        new_src = f'src="{ image_path }"'

        # The filename is escaped so characters such as '.', '(' or '+' are
        # matched literally, and [^"] keeps a match inside one attribute value.
        # A callable replacement is used so backslashes in the path are not
        # interpreted as regex escapes.
        escaped = re.escape(filename)
        html, replaced = re.subn(rf'src="/images/[^"]*?px-{ escaped }"', lambda _match: new_src, html)
        if not replaced:
            # no thumbnail ("NNNpx-") variant in the page: fall back to the plain name
            html = re.sub(rf'src="/images/[^"]*?{ escaped }"', lambda _match: new_src, html)

    return html
|
|
|
|
def add_item_inventory_links(html):
    """
    Turn the references to the item index into numbered anchor links.

    html = string (HTML)

    Every "Item NNN" (three digits) becomes
    `Item <a id="ii-NNN-K" href="#Item_Index">NNN</a>`, where K counts the
    references to item NNN from the top of the document, so every anchor
    gets its own id. The rest of the document is left untouched.

    A three digit number directly after an `<li>` is wrapped in
    `<span class="item_nr">` so it can be styled.

    Returns the new HTML string.
    """
    seen = {}  # item number -> amount of references linked so far

    def link_reference(match):
        number = match.group(1)
        seen[number] = seen.get(number, 0) + 1
        item_id = f'ii-{ number }-{ seen[number] }'
        return f'Item <a id="{ item_id }" href="#Item_Index">{ number }</a>'

    # Find all references in the text to the item index
    new_html = re.sub(r'Item (\d\d\d)', link_reference, html)

    # Also add a <span> around the index nr to style it
    # (only the number is wrapped, the <li> itself is written once)
    new_html = re.sub(r'<li>(\d\d\d)', r'<li><span class="item_nr">\1</span>', new_html)

    return new_html
|
|
|
|
def clean_up(html):
    """
    Strip wiki interface leftovers from the rendered page.

    html = string (HTML)

    Removes the [edit] markers, turns internal wiki links
    (`/index.php?title=Page`) into in-document anchors (`#Page`) and drops
    the square brackets around footnote numbers.

    Returns the cleaned HTML string.
    """
    # remove the [edit]
    # (greedy on purpose as before: on one line it spans from the first '['
    # to the last ']' around "edit")
    html = re.sub(r'\[.*edit.*\]', '', html)
    html = re.sub(r'href="/index\.php\?title=', 'href="#', html)  # remove the internal wiki links
    # The brackets are escaped: a bare '[' opens a character class and made
    # the pattern fail to compile.
    html = re.sub(r'\[(?=\d)', '', html)  # remove left footnote bracket [
    html = re.sub(r'(?<=\d)\]', '', html)  # remove right footnote bracket ]
    return html
|
|
|
|
def fast_loader(html):
    """
    Point the page at the low quality images when FAST mode is on.

    html = string (HTML)

    When the module level `fast` switch is set, every '/images/' in the
    HTML is replaced with '/images-small/'; otherwise the HTML is returned
    unchanged.
    """
    if fast:
        html = html.replace('/images/', '/images-small/')
        print('--- rendered in FAST mode ---')

    return html
|
|
|
|
def parse_page(pagename, wiki):
    """
    Request the rendered HTML of a wiki page and post-process it.

    pagename = string
    wiki = base url of the wiki (string), without trailing slash

    Returns html = string (HTML), or None when the API response has no
    'parse' entry.
    """
    # NOTE(review): pagename is inserted into the URL as-is, so it has to be
    # URL-safe already (no spaces or non-ASCII characters).
    parse = f'{ wiki }/api.php?action=parse&page={ pagename }&pst=True&format=json'
    data = API_request(parse, pagename)
    # print(json.dumps(data, indent=4))

    if 'parse' not in data:
        return None

    html = data['parse']['text']['*']
    images = data['parse']['images']

    # The order matters: the image links are rewritten first, FAST mode is
    # applied last on the final '/images/' paths.
    html = download_media(html, images, wiki)
    html = clean_up(html)
    html = add_item_inventory_links(html)
    html = fast_loader(html)

    return html
|
|
|
|
def save(html, pagename):
    """
    Write the page to disk.

    html = string (HTML)
    pagename = string

    When this module is run as a script, the HTML is rendered into
    `template.html` (the final page that will be used with PagedJS) and
    into `template.inspect.html` (an extra page for debugging), saved as
    `<pagename>.html` and `<pagename>.inspect.html`.

    When the module is imported (the Flask application), the bare HTML
    (without <head>) is written to `Unfolded.html`.
    """
    if __name__ != "__main__":
        # Flask application
        with open(f'{ STATIC_FOLDER_PATH }/Unfolded.html', 'w') as out:
            out.write(html)  # save the html to a file (without <head>)
        return

    # command-line
    outputs = (
        ('template.html', f'{ pagename }.html'),  # final page that will be used with PagedJS
        ('template.inspect.html', f'{ pagename }.inspect.html'),  # extra html page for debugging (CLI only)
    )
    for template_name, output_name in outputs:
        # read the template inside a context manager so the file is closed again
        with open(f'{ STATIC_FOLDER_PATH }/{ TEMPLATES_DIR }/{ template_name }') as template_file:
            template = jinja2.Template(template_file.read())
        doc = template.render(publication_unfolded=html, title=pagename)

        html_file = f'{ STATIC_FOLDER_PATH }/{ output_name }'
        print('Saving HTML:', html_file)
        with open(html_file, 'w') as out:
            out.write(doc)
|
|
|
|
def update_material_now(pagename, wiki):
    """
    Fetch and process the current version of a wiki page.

    pagename = string
    wiki = base url of the wiki (string), without trailing slash

    Returns whatever parse_page() returns for this page, i.e.
    publication_unfolded = string (HTML).
    """
    return parse_page(pagename, wiki)
|
|
|
|
# ---
|
|
|
|
if __name__ == "__main__":
    # Command-line use: fetch one page from the wiki and write it to disk.
    wiki = 'https://example.com/wiki'  # no tail slash '/'
    pagename = 'Unfolded'

    publication_unfolded = update_material_now(pagename, wiki)  # download the latest version of the page
    save(publication_unfolded, pagename)  # save the page to file
|
|
|
|
|