from pprint import pprint
import sys
import urllib.request
import urllib.error
import os
import re
import json
import time
import datetime
import jinja2
import requests
from bs4 import BeautifulSoup

STATIC_FOLDER_PATH = './static'  # without trailing slash
PUBLIC_STATIC_FOLDER_PATH = '/static'  # without trailing slash
TEMPLATES_DIR = None

# When fast is True, a low quality copy of all the images is used
# (from a folder with the name "images-small", which stores a copy
# of all the images generated with:
# $ mogrify -quality 5% -adaptive-resize 25% -remap pattern:gray50 * )
fast = False
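
# A minimal sketch of the argument shapes the functions below expect
# (the wiki URL and namespace ids are hypothetical examples, not
# values from a real configuration):
#
#   wiki = 'https://wiki.example.org'            # base URL, no trailing slash
#   subject_ns = {'id': 100, 'name': 'Subject'}  # namespace holding the publications
#   styles_ns = {'id': 102, 'name': 'Styles'}    # namespace holding the CSS pages
#
#   publication = get_publication(wiki, subject_ns, styles_ns, 'My_Publication')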


# gets or creates the index of publications in a namespace
def get_index(wiki, subject_ns):
    """
    wiki = string
    subject_ns = object
    """
    return load_file('index', 'json') or create_index(
        wiki,
        subject_ns
    )


# gets a publication's HTML and CSS
def get_publication(wiki, subject_ns, styles_ns, pagename):
    """
    wiki = string
    subject_ns = object
    styles_ns = object
    pagename = string
    """
    return {
        'html': get_html(wiki, subject_ns, pagename),
        'css': get_css(wiki, styles_ns, pagename)
    }


# gets or creates the HTML file for a publication
def get_html(wiki, subject_ns, pagename):
    """
    wiki = string
    subject_ns = object
    pagename = string
    """
    return load_file(pagename, 'html') or create_html(
        wiki,
        subject_ns,
        pagename
    )


# gets or creates the CSS file for a publication
def get_css(wiki, styles_ns, pagename):
    """
    wiki = string
    styles_ns = object
    pagename = string
    """
    return load_file(pagename, 'css') or create_css(
        wiki,
        styles_ns,
        pagename
    )


# makes an API call to create/update the index of publications
def create_index(wiki, subject_ns):
    """
    wiki = string
    subject_ns = object
    """
    url = f'{ wiki }/api.php?action=query&format=json&list=allpages&apnamespace={ subject_ns["id"] }'
    data = do_API_request(url)
    pages = data['query']['allpages']
    # exclude subpages
    pages = [page for page in pages if '/' not in page['title']]
    for page in pages:
        # remove the namespace from the title
        page['title'] = page['title'].replace(subject_ns['name'] + ':', '')
        page['slug'] = page['title'].replace(' ', '_')  # slugify the title
        pageJSON = load_file(page['slug'], 'json')
        page['updated'] = pageJSON['updated'] if pageJSON else '--'
    now = str(datetime.datetime.now())
    index = {
        'pages': pages,
        'updated': now
    }
    save_file('index', 'json', index)
    return index
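
# The saved index.json then looks roughly like this (the pageid,
# title and dates below are illustrative, not real data):
#
#   {
#     "pages": [
#       {"pageid": 12, "ns": 100, "title": "My Publication",
#        "slug": "My_Publication", "updated": "--"}
#     ],
#     "updated": "2024-01-01 12:00:00.000000"
#   }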


# creates/updates a publication object
def create_publication(wiki, subject_ns, styles_ns, pagename, full_update):
    """
    wiki = string
    subject_ns = object
    styles_ns = object
    pagename = string
    full_update = None or string. Full update when not None
    """
    return {
        'html': create_html(wiki, subject_ns, pagename, full_update),
        'css': create_css(wiki, styles_ns, pagename)
    }


# makes an API call to create/update a publication's HTML
def create_html(wiki, subject_ns, pagename, full_update=None):
    """
    wiki = string
    subject_ns = object
    pagename = string
    full_update = None or string. Full update when not None
    """
    url = f'{ wiki }/api.php?action=parse&page={ subject_ns["name"] }:{ pagename }&pst=True&format=json'
    data = do_API_request(url, subject_ns["name"] + ":" + pagename, wiki)
    # pprint(data)
    now = str(datetime.datetime.now())
    data['updated'] = now

    save_file(pagename, 'json', data)

    update_publication_date(  # add the publication's last updated date to our index
        wiki,
        subject_ns,
        pagename,
        now
    )

    if 'parse' in data:
        html = data['parse']['text']['*']
        # pprint(html)
        imgs = data['parse']['images']

        html = remove_comments(html)
        html = download_media(html, imgs, wiki, full_update)
        html = clean_up(html)
        # html = add_item_inventory_links(html)

        if fast:
            html = fast_loader(html)

        soup = BeautifulSoup(html, 'html.parser')
        soup = remove_edit(soup)
        soup = inlineCiteRefs(soup)
        html = str(soup)
        # html = inlineCiteRefs(html)
        # html = add_author_names_toc(html)

    else:
        html = None

    save_file(pagename, 'html', html)

    return html
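
# For reference, the parse API response consumed above has roughly
# this shape (title and filenames are illustrative):
#
#   {
#     "parse": {
#       "title": "Subject:My_Publication",
#       "text": {"*": "<div class=\"mw-parser-output\">...</div>"},
#       "images": ["Cover_image.jpg", "Another_image.png"]
#     }
#   }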


# makes an API call to create/update a publication's CSS
def create_css(wiki, styles_ns, pagename):
    """
    wiki = string
    styles_ns = object
    pagename = string
    """
    css_url = f'{ wiki }/api.php?action=parse&page={ styles_ns["name"] }:{ pagename }&prop=wikitext&pst=True&format=json'
    css_data = do_API_request(css_url)
    if css_data and 'parse' in css_data:
        css = css_data['parse']['wikitext']['*']
        save_file(pagename, 'css', css)
        return css


# loads a file from disk
def load_file(pagename, ext):
    """
    pagename = string
    ext = string
    """
    path = f'{ STATIC_FOLDER_PATH }/{ pagename }.{ ext }'
    if os.path.exists(path):
        print(f'Loading { ext }:', path)
        with open(path, 'r') as out:
            if ext == 'json':
                data = json.load(out)
            else:
                data = out.read()
        return data


# saves a file to disk
def save_file(pagename, ext, data):
    """
    pagename = string
    ext = string
    data = object
    """
    path = f'{ STATIC_FOLDER_PATH }/{ pagename }.{ ext }'
    print(f'Saving { ext }:', path)
    try:
        out = open(path, 'w')
    except OSError:
        print("Could not open/write file:", path)
        sys.exit()

    with out:
        if ext == 'json':
            out.write(json.dumps(data, indent=2))
        else:
            out.write(data)
    return data


# does an API request and returns the JSON response
def do_API_request(url, filename="", wiki=""):
    """
    url = API request url (string)
    returns the decoded JSON response, roughly:
    data = { 'query': {
        'pages': {
            pageid: {
                'links': {
                    '?': '?',
                    'title': 'pagename'
                }
            }
        }
    } }
    """
    purge(filename, wiki)
    print('Loading from wiki: ', url)
    response = urllib.request.urlopen(url)
    response_type = response.getheader('Content-Type')

    if response.status == 200 and "json" in response_type:
        contents = response.read()
        data = json.loads(contents)
        return data


# API calls seem to be cached even when called with maxage,
# so call purge before doing the API call.
# https://www.mediawiki.org/wiki/API:Purge
def purge(filename, wiki):
    if filename == "" or wiki == "":
        return
    print("purge " + filename)

    S = requests.Session()
    URL = f'{ wiki }/api.php'
    # url = f'{ wiki }/api.php?action=query&list=allimages&aifrom={ filename }&format=json'
    PARAMS = {
        "action": "purge",
        "titles": filename,
        "format": "json",
        "generator": "alltransclusions",
    }
    R = S.post(url=URL, params=PARAMS)
    # DATA = R.text
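
# e.g. purge('Subject:My_Publication', 'https://wiki.example.org')
# posts an action=purge request for that page, so the following
# api.php call returns fresh, uncached content (page name hypothetical).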


# updates a publication's last updated field in the index
def update_publication_date(wiki, subject_ns, pagename, updated):
    """
    wiki = string
    subject_ns = object
    pagename = string
    updated = string
    """
    index = get_index(wiki, subject_ns)
    for page in index['pages']:
        if page['slug'] == pagename:
            page['updated'] = updated
    save_file('index', 'json', index)


# returns the path of a custom template for this page, if one exists
def customTemplate(name):
    path = "custom/%s.html" % name
    if os.path.isfile(os.path.join(os.path.dirname(__file__), "templates/", path)):
        return path
    else:
        return None


# Beautiful Soup seems to have a problem with some comments,
# so let's remove them before parsing.
def remove_comments(html):
    """
    html = string (HTML)
    """
    pattern = r'(<!--.*?-->)|(<!--[\S\s]+?-->)|(<!--[\S\s]*?$)'
    return re.sub(pattern, "", html)
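
# e.g. remove_comments('a<!-- note -->b') returns 'ab'; the last
# alternative also drops a comment left unclosed at the end of the html.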


# downloads the images referenced in the HTML
def download_media(html, images, wiki, full_update):
    """
    html = string (HTML)
    images = list of filenames (str)
    """
    # check if 'images/' already exists
    if not os.path.exists(f'{ STATIC_FOLDER_PATH }/images'):
        os.makedirs(f'{ STATIC_FOLDER_PATH }/images')

    # download media files
    for filename in images:
        filename = filename.replace(' ', '_')  # safe filenames
        # check if the image is already downloaded;
        # if not, then download the file
        if (not os.path.isfile(f'{ STATIC_FOLDER_PATH }/images/{ filename }')) or full_update:
            # first we search for the full filename of the image
            url = f'{ wiki }/api.php?action=query&list=allimages&aifrom={ filename }&format=json'
            # url = f'{ wiki }/api.php?action=query&titles=File:{ filename }&format=json'
            data = do_API_request(url)
            # timestamp = data.query.pages.
            # print(json.dumps(data, indent=2))

            if data and data['query']['allimages']:
                # we select the first search result
                # (assuming that this is the image we are looking for)
                image = data['query']['allimages'][0]

                if image:
                    # then we download the image
                    image_url = image['url']
                    image_filename = image['name']
                    print('Downloading:', image_filename)
                    image_response = urllib.request.urlopen(image_url).read()

                    # and we save it as a file
                    image_path = f'{ STATIC_FOLDER_PATH }/images/{ image_filename }'
                    out = open(image_path, 'wb')
                    out.write(image_response)
                    out.close()
                    print(image_path)

                    time.sleep(3)  # do not overload the server

        # replace src links
        e_filename = re.escape(filename)  # needed for filenames with certain characters
        image_path = f'{ PUBLIC_STATIC_FOLDER_PATH }/images/{ filename }'  # the images need to link to the root of the domain, for flask; confusingly, this breaks the idea of still being able to make a local copy of the file
        matches = re.findall(rf'src=\"/wiki/mediawiki/images/.*?px-{ e_filename }\"', html)  # for debugging
        # pprint(matches)
        if matches:
            html = re.sub(rf'src=\"/wiki/mediawiki/images/.*?px-{ e_filename }\"', f'src=\"{ image_path }\"', html)
        else:
            matches = re.findall(rf'src=\"/wiki/mediawiki/images/.*?{ e_filename }\"', html)  # for debugging
            # print(matches, e_filename, html)
            html = re.sub(rf'src=\"/wiki/mediawiki/images/.*?{ e_filename }\"', f'src=\"{ image_path }\"', html)
        print(f'{filename}: {matches}\n------')  # for debugging: each image should have the correct match!

    return html
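
# The rewrite above turns a thumbnail src like
#   src="/wiki/mediawiki/images/thumb/a/ab/Name.jpg/300px-Name.jpg"
# into a local, static one (shown here with the default settings):
#   src="/static/images/Name.jpg"
# the fallback pattern handles non-thumbnail srcs the same way.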


def clean_up(html):
    """
    html = string (HTML)
    """
    # html = re.sub(r'\[.*edit.*\]', '', html)  # remove the [edit] # Heerko: this somehow caused problems. Removing it solves it, seemingly without side effects...
    html = re.sub(r'href="/index.php\?title=', 'href="#', html)  # remove the internal wiki links
    html = re.sub(r'\[(?=\d)', '', html)  # remove left footnote bracket [
    html = re.sub(r'(?<=\d)\]', '', html)  # remove right footnote bracket ]
    return html
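
# e.g. clean_up('see <sup>[1]</sup>') returns 'see <sup>1</sup>':
# the lookarounds strip footnote brackets only when they touch a digit.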


def remove_edit(soup):
    """
    soup = BeautifulSoup (HTML)
    """
    es = soup.find_all(class_="mw-editsection")
    for s in es:
        s.decompose()
    return soup


# inline citation references in the html for paged.js
# Turns: <sup class="reference" id="cite_ref-1"><a href="#cite_note-1">[1]</a></sup>
# into: <span class="footnote">The cite text</span>
def inlineCiteRefs(soup):
    """
    soup = BeautifulSoup (HTML)
    """
    refs = soup.find_all("sup", class_="reference")
    for ref in refs:
        href = ref.a['href']
        res = re.findall('[0-9]+', href)
        if res:
            cite = soup.find_all(id="cite_note-" + res[0])
            text = cite[0].find(class_="reference-text")
            text['class'] = 'footnote'
            ref.replace_with(text)
    # remove the references from the bottom of the document
    for item in soup.find_all(class_="references"):
        item.decompose()
    return soup


def fast_loader(html):
    """
    html = string (HTML)
    """
    html = html.replace('/images/', '/images-small/')
    print('--- rendered in FAST mode ---')

    return html
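

# A minimal usage sketch (hypothetical wiki URL and namespaces,
# not part of the original configuration):
if __name__ == '__main__':
    wiki = 'https://wiki.example.org'
    subject_ns = {'id': 100, 'name': 'Subject'}
    styles_ns = {'id': 102, 'name': 'Styles'}
    index = get_index(wiki, subject_ns)
    for page in index['pages']:
        publication = get_publication(wiki, subject_ns, styles_ns, page['slug'])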