from pprint import pprint
import sys
import urllib.request
import urllib.error
import os
import re
import json
import jinja2
import datetime
from bs4 import BeautifulSoup
STATIC_FOLDER_PATH = './static' # without trailing slash
PUBLIC_STATIC_FOLDER_PATH = '/static' # without trailing slash
TEMPLATES_DIR = None
# When fast is True, the renderer uses a low-quality copy of all the images
# (taken from a folder named "images-small",
# which stores a copy of all the images generated with:
# $ mogrify -quality 5% -adaptive-resize 25% -remap pattern:gray50 * )
fast = False
# gets or creates index of publications in namespace
def get_index(wiki, subject_ns):
"""
wiki = string
subject_ns = object
"""
return load_file('index', 'json') or create_index(
wiki,
subject_ns
)
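# Example (hypothetical wiki URL and namespace values, for illustration only;
# the real ones come from the calling application):
#   index = get_index('https://wiki.example.org', {'id': 100, 'name': 'Publication'})
#   print(index['updated'], len(index['pages']))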
# gets publication's HTML and CSS
def get_publication(wiki, subject_ns, styles_ns, pagename):
"""
wiki = string
subject_ns = object
styles_ns = object
pagename = string
"""
return {
'html': get_html(wiki, subject_ns, pagename),
'css': get_css(wiki, styles_ns, pagename)
}
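# Example (hypothetical wiki URL, namespace dicts and pagename):
#   publication = get_publication(
#       'https://wiki.example.org',
#       {'id': 100, 'name': 'Publication'},
#       {'id': 102, 'name': 'PublicationCSS'},
#       'My_Publication'
#   )
#   publication['html'], publication['css']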
# gets or creates HTML file for a publication
def get_html(wiki, subject_ns, pagename):
"""
wiki = string
subject_ns = object
pagename = string
"""
return load_file(pagename, 'html') or create_html(
wiki,
subject_ns,
pagename,
)
# gets or creates CSS file for a publication
def get_css(wiki, styles_ns, pagename):
"""
wiki = string
styles_ns = object
pagename = string
"""
return load_file(pagename, 'css') or create_css(
wiki,
styles_ns,
pagename
)
# makes API call to create/update index of publications
def create_index(wiki, subject_ns):
"""
wiki = string
subject_ns = object
"""
url = f'{ wiki }/api.php?action=query&format=json&list=allpages&apnamespace={ subject_ns["id"] }'
data = do_API_request(url)
pages = data['query']['allpages']
# exclude subpages
pages = [page for page in pages if '/' not in page['title']]
for page in pages:
# removing the namespace from title
page['title'] = page['title'].replace(subject_ns['name'] + ':', '')
page['slug'] = page['title'].replace(' ', '_') # slugifying title
pageJSON = load_file(page['slug'], 'json')
page['updated'] = pageJSON.get('updated', '--') if pageJSON else '--'
now = str(datetime.datetime.now())
index = {
'pages': pages,
'updated': now
}
save_file('index', 'json', index)
return index
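# The saved index.json looks roughly like this (field values are illustrative):
#   {
#     "pages": [
#       {"pageid": 1, "ns": 100, "title": "My Publication", "slug": "My_Publication", "updated": "--"},
#       ...
#     ],
#     "updated": "2023-01-01 12:00:00.000000"
#   }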
# Creates/updates a publication object
def create_publication(wiki, subject_ns, styles_ns, pagename, full_update):
"""
wiki = string
subject_ns = object
styles_ns = object
pagename = string
"""
return {
'html': create_html(wiki, subject_ns, pagename, full_update),
'css': create_css(wiki, styles_ns, pagename)
}
# makes API call to create/update a publication's HTML
def create_html(wiki, subject_ns, pagename, full_update=None):
"""
wiki = string
subject_ns = object
pagename = string
full_update = None (default) or string; when not None, media files are re-downloaded
"""
url = f'{ wiki }/api.php?action=parse&page={ subject_ns["name"] }:{ pagename }&pst=True&format=json'
data = do_API_request(url, subject_ns["name"]+":"+pagename, wiki)
# pprint(data)
now = str(datetime.datetime.now())
data['updated'] = now
save_file(pagename, 'json', data)
update_publication_date( # we add the last updated of the publication to our index
wiki,
subject_ns,
pagename,
now
)
if data and 'parse' in data:
html = data['parse']['text']['*']
# pprint(html)
imgs = data['parse']['images']
html = remove_comments(html)
html = download_media(html, imgs, wiki, full_update)
html = clean_up(html)
# html = add_item_inventory_links(html)
if fast:
html = fast_loader(html)
soup = BeautifulSoup(html, 'html.parser')
soup = remove_edit(soup)
soup = inlineCiteRefs(soup)
html = str(soup)
# html = inlineCiteRefs(html)
# html = add_author_names_toc(html)
else:
html = None
if html is not None:
save_file(pagename, 'html', html)
return html
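# Example (hypothetical wiki URL and namespace dict): passing any non-None
# value as full_update, e.g. 'full', forces all media files to be downloaded again:
#   html = create_html('https://wiki.example.org', {'id': 100, 'name': 'Publication'}, 'My_Publication', 'full')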
# makes API call to create/update a publication's CSS
def create_css(wiki, styles_ns, pagename):
"""
wiki = string
styles_ns = object
pagename = string
"""
css_url = f'{ wiki }/api.php?action=parse&page={ styles_ns["name"] }:{ pagename }&prop=wikitext&pst=True&format=json'
css_data = do_API_request(css_url)
if css_data and 'parse' in css_data:
css = css_data['parse']['wikitext']['*']
save_file(pagename, 'css', css)
return css
return None
# Load file from disk
def load_file(pagename, ext):
"""
pagename = string
ext = string
"""
path = f'{ STATIC_FOLDER_PATH }/{ pagename }.{ ext }'
if os.path.exists(path):
print(f'Loading { ext }:', path)
with open(path, 'r') as out:
if ext == 'json':
data = json.load(out)
else:
data = out.read()
out.close()
return data
# Save file to disk
def save_file(pagename, ext, data):
"""
pagename = string
ext = string
data = object
"""
path = f'{ STATIC_FOLDER_PATH }/{ pagename }.{ ext }'
print(f'Saving { ext }:', path)
try:
out = open(path, 'w')
except OSError:
print("Could not open/write file:", path)
sys.exit()
with out: #open(path, 'w') as out:
if ext == 'json':
out.write( json.dumps(data, indent = 2) )
else:
out.write( data )
out.close()
return data
# do API request and return JSON
def do_API_request(url, filename="", wiki=""):
"""
url = API request url (string)
data = { 'query':
'pages' :
pageid : {
'links' : {
'?' : '?'
'title' : 'pagename'
}
}
}
}
"""
purge(filename, wiki)
print('Loading from wiki: ', url)
try:
response = urllib.request.urlopen(url)
except urllib.error.URLError as error:
print('Could not reach the wiki:', error)
return None
response_type = response.getheader('Content-Type')
if response.status == 200 and "json" in response_type:
contents = response.read()
data = json.loads(contents)
return data
return None
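# Example (hypothetical wiki URL): a plain query, without purging first:
#   data = do_API_request('https://wiki.example.org/api.php?action=query&list=allpages&apnamespace=100&format=json')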
# api calls seem to be cached even when called with maxage
# So call purge before doing the api call.
# https://www.mediawiki.org/wiki/API:Purge
def purge(filename, wiki):
if filename == "" or wiki == "":
return
print("purge " + filename)
import requests
S = requests.Session()
URL = f'{ wiki }/api.php'
# url = f'{ wiki }/api.php?action=query&list=allimages&aifrom={ filename }&format=json'
PARAMS = {
"action": "purge",
"titles": filename,
"format": "json",
"generator": "alltransclusions",
}
R = S.post(url=URL, params=PARAMS)
# DATA = R.text
# updates a publication's last updated field in the index
def update_publication_date(wiki, subject_ns, pagename, updated):
"""
wiki = string
subject_ns = object
pagename = string
updated = string
"""
index = get_index(wiki, subject_ns)
for page in index['pages']:
if page['slug'] == pagename:
page['updated'] = updated
save_file('index', 'json', index)
def customTemplate(name):
path = "custom/%s.html" % name
if os.path.isfile(os.path.join(os.path.dirname(__file__), "templates/", path)):
return path
else:
return None
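# Example: customTemplate('My_Publication') returns 'custom/My_Publication.html'
# if templates/custom/My_Publication.html exists next to this script, otherwise None.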
# BeautifulSoup seems to have a problem with some comments,
# so let's remove them before parsing.
def remove_comments( html ):
"""
html = string (HTML)
"""
pattern = r'(<!--.*?-->)|(<!--[\S\s]+?-->)|(<!--[\S\s]*?$)'
return re.sub(pattern, "", html)
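# Example: remove_comments('<p>text</p><!-- a comment -->') returns '<p>text</p>'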
# Downloading images referenced in the html
def download_media(html, images, wiki, full_update):
"""
html = string (HTML)
images = list of filenames (str)
"""
# check if 'images/' already exists
if not os.path.exists(f'{ STATIC_FOLDER_PATH }/images'):
os.makedirs(f'{ STATIC_FOLDER_PATH }/images')
# download media files
for filename in images:
filename = filename.replace(' ', '_') # safe filenames
# check if the image is already downloaded
# if not, then download the file
if (not os.path.isfile(f'{ STATIC_FOLDER_PATH }/images/{ filename }')) or full_update:
# first we search for the full filename of the image
url = f'{ wiki }/api.php?action=query&list=allimages&aifrom={ filename }&format=json'
# url = f'{ wiki }/api.php?action=query&titles=File:{ filename }&format=json'
data = do_API_request(url)
# timestamp = data.query.pages.
# print(json.dumps(data, indent=2))
if data and data['query']['allimages']:
# we select the first search result
# (assuming that this is the image we are looking for)
image = data['query']['allimages'][0]
if image:
# then we download the image
image_url = image['url']
image_filename = image['name']
print('Downloading:', image_filename)
image_response = urllib.request.urlopen(image_url).read()
# and we save it as a file
image_path = f'{ STATIC_FOLDER_PATH }/images/{ image_filename }'
out = open(image_path, 'wb')
out.write(image_response)
out.close()
print(image_path)
import time
time.sleep(3) # do not overload the server
# replace src links
e_filename = re.escape( filename ) # needed for filename with certain characters
image_path = f'{ PUBLIC_STATIC_FOLDER_PATH }/images/{ filename }' # the src needs to point to the root of the domain so Flask can serve it; unfortunately this breaks making a fully self-contained local copy of the file
matches = re.findall(rf'src=\"/wiki/mediawiki/images/.*?px-{ e_filename }\"', html) # for debugging
# pprint(matches)
if matches:
html = re.sub(rf'src=\"/wiki/mediawiki/images/.*?px-{ e_filename }\"', f'src=\"{ image_path }\"', html)
else:
matches = re.findall(rf'src=\"/wiki/mediawiki/images/.*?{ e_filename }\"', html) # for debugging
# print(matches, e_filename, html)
html = re.sub(rf'src=\"/wiki/mediawiki/images/.*?{ e_filename }\"', f'src=\"{ image_path }\"', html)
print(f'{filename}: {matches}\n------') # for debugging: each image should have the correct match!
return html
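# Example of the src rewriting above (illustrative thumbnail path):
#   src="/wiki/mediawiki/images/thumb/a/ab/Example.jpg/300px-Example.jpg"
#   becomes
#   src="/static/images/Example.jpg"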
def clean_up(html):
"""
html = string (HTML)
"""
# html = re.sub(r'\[.*edit.*\]', '', html) # remove the [edit] # Heerko: this somehow caused problems; removing it solves them, seemingly without side effects...
html = re.sub(r'href="/index.php\?title=', 'href="#', html) # remove the internal wiki links
html = re.sub(r'&#91;(?=\d)', '', html) # remove left footnote bracket [
html = re.sub(r'(?<=\d)&#93;', '', html) # remove right footnote bracket ]
return html
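# Example: clean_up('<a href="/index.php?title=Page">&#91;1&#93;</a>')
# returns '<a href="#Page">1</a>'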
def remove_edit(soup):
"""
soup = BeautifulSoup object (parsed HTML)
"""
es = soup.find_all(class_="mw-editsection")
for s in es:
s.decompose()
return soup
# inline citation references in the html for pagedjs
# Turns: <sup class="reference" id="cite_ref-1"><a href="#cite_note-1">[1]</a></sup>
# into: <span class="footnote">The cite text</span>
def inlineCiteRefs(soup):
"""
soup = BeautifulSoup object (parsed HTML)
"""
refs = soup.find_all("sup", class_="reference")
for ref in refs:
href = ref.a['href']
res = re.findall('[0-9]+', href)
if(res):
cite = soup.find_all(id="cite_note-"+res[0])
text = cite[0].find(class_="reference-text")
text['class'] = 'footnote'
ref.replace_with(text)
# remove the reference from the bottom of the document
for item in soup.find_all(class_="references"):
item.decompose()
return soup
def fast_loader(html):
"""
html = string (HTML)
"""
html = html.replace('/images/', '/images-small/')
print('--- rendered in FAST mode ---')
return html
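# Example: fast_loader('<img src="/static/images/Example.jpg">')
# returns '<img src="/static/images-small/Example.jpg">'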