@ -4,7 +4,10 @@ import re
import json
import jinja2
def API_request ( url ) :
STATIC_FOLDER_PATH = ' . ' # without trailing slash
WRAPPING_TEMPLATES_DIR = ' ./templates '
def API_request ( url , pagename ) :
"""
url = API request url ( string )
data = { ' query ' :
@ -22,7 +25,7 @@ def API_request(url):
data = json . loads ( response )
# Save response as JSON to be able to inspect API call
json_file = f ' { pagename } .json '
json_file = f ' { STATIC_FOLDER_PATH } / { pagename } .json '
print ( ' Saving JSON: ' , json_file )
with open ( json_file , ' w ' ) as out :
out . write ( json . dumps ( data , indent = 4 ) )
@ -30,14 +33,14 @@ def API_request(url):
return data
def download_media ( html , images ) :
def download_media ( html , images , wiki ) :
"""
html = string ( HTML )
images = list of filenames ( str )
"""
# check if 'images/' already exists
if not os . path . exists ( ' images' ) :
os . makedirs ( ' images' )
if not os . path . exists ( f ' { STATIC_FOLDER_PATH } / images' ) :
os . makedirs ( f ' { STATIC_FOLDER_PATH } / images' )
# download media files
for filename in images :
@ -45,7 +48,7 @@ def download_media(html, images):
# check if the image is already downloaded
# if not, then download the file
if not os . path . isfile ( f ' images/{ filename } ' ) :
if not os . path . isfile ( f ' { STATIC_FOLDER_PATH } / images/{ filename } ' ) :
# first we search for the full filename of the image
url = f ' { wiki } /api.php?action=query&list=allimages&aifrom= { filename } &format=json '
@ -63,7 +66,7 @@ def download_media(html, images):
image_response = urllib . request . urlopen ( image_url ) . read ( )
# and we save it as a file
image_path = f ' images/{ image_filename } '
image_path = f ' { STATIC_FOLDER_PATH } / images/{ image_filename } '
out = open ( image_path , ' wb ' )
out . write ( image_response )
out . close ( )
@ -72,7 +75,7 @@ def download_media(html, images):
time . sleep ( 3 ) # do not overload the server
# replace src link
image_path = f ' images/ { filename } '
image_path = f ' / { STATIC_FOLDER_PATH } / images/{ filename } ' # here the images need to link to the / of the domain, for flask :/// confusing! this breaks the whole idea to still be able to make a local copy of the file
html = re . sub ( rf ' src= " /book/images/.* { filename } " ' , f ' src= " { image_path } " ' , html )
return html
@ -82,28 +85,28 @@ def clean_up(html):
html = string ( HTML )
"""
html = re . sub ( r ' \ [.*edit.* \ ] ' , ' ' , html ) # remove the [edit]
html = re . sub ( r ' href= " /book/index.php?title=.*? " ' , ' href= " # " ' , html ) # remove the internal links
html = re . sub ( r ' href= " /book/index.php \ ?title= ' , ' href= " # ' , html ) # remove the internal wiki links
return html
def parse_page(pagename, wiki):
    """
    Request the parsed HTML of a wiki page and prepare it for publication.

    pagename = string (title of the wiki page)
    wiki = string (base URL of the wiki, without trailing slash)

    returns html = string (HTML), or None when the API response
    has no 'parse' entry
    """
    parse = f'{wiki}/api.php?action=parse&page={pagename}&pst=True&format=json'
    data = API_request(parse, pagename)
    # print(json.dumps(data, indent=4))

    # no 'parse' key in the response: there is nothing to process
    if 'parse' not in data:
        return None

    html = data['parse']['text']['*']
    images = data['parse']['images']
    html = download_media(html, images, wiki)
    html = clean_up(html)
    return html
def save(html, pagename, publication_unfolded):
    """
    Render the publication into the wrapping templates and save the results.

    html = string (HTML) or None; only checked to decide whether to save at all
    pagename = string (used as the template title and in the output filenames)
    publication_unfolded = string (HTML) handed to the templates

    Two files are written into STATIC_FOLDER_PATH:
    - <pagename>.html, rendered from template.html (the page used with PagedJS)
    - <pagename>.inspect.html, rendered from template.inspect.html (for debugging)

    The templates are read from STATIC_FOLDER_PATH/WRAPPING_TEMPLATES_DIR.
    Nothing is written when html is empty or None.
    """
    if not html:
        return

    templates_dir = f'{STATIC_FOLDER_PATH}/{WRAPPING_TEMPLATES_DIR}'

    # (template filename, output file) pairs, in the order they are written
    outputs = (
        ('template.html', f'{STATIC_FOLDER_PATH}/{pagename}.html'),
        ('template.inspect.html', f'{STATIC_FOLDER_PATH}/{pagename}.inspect.html'),
    )

    for template_name, html_file in outputs:
        # read the template inside a context manager so the file gets closed
        with open(f'{templates_dir}/{template_name}') as template_in:
            template = jinja2.Template(template_in.read())

        rendered = template.render(publication_unfolded=publication_unfolded, title=pagename)

        print('Saving HTML:', html_file)
        with open(html_file, 'w') as out:
            out.write(rendered)
def update_material_now(pagename, wiki):
    """
    Download the latest version of a wiki page.

    pagename = string (title of the wiki page)
    wiki = string (base URL of the wiki, without trailing slash)

    returns publication_unfolded = string (HTML), or None when
    parse_page() returns None
    """
    publication_unfolded = parse_page(pagename, wiki)
    return publication_unfolded
# ---

if __name__ == "__main__":

    # settings used when this file is run directly as a script
    wiki = 'https://possiblebodies.constantvzw.org/book'  # remove tail slash '/'
    pagename = 'Unfolded'

    publication_unfolded = update_material_now(pagename, wiki)  # download the latest version of the page
    # the page HTML is passed twice: once as the "is there anything to save" check (html)
    # and once as the content handed to the templates (publication_unfolded)
    save(publication_unfolded, pagename, publication_unfolded)  # save the page to file