from pprint import pprint
import sys
import urllib.request
import urllib.error
import os
import re
import json
import jinja2
import datetime
from bs4 import BeautifulSoup
STATIC_FOLDER_PATH = './static' # without trailing slash
PUBLIC_STATIC_FOLDER_PATH = '/static' # without trailing slash
TEMPLATES_DIR = None
# When fast is True, the renderer uses a low-quality copy of all the images
# (taken from a folder named "images-small",
# which stores a copy of all the images generated with:
# $ mogrify -quality 5% -adaptive-resize 25% -remap pattern:gray50 * )
fast = False
# gets or creates index of publications in namespace
def get_index(wiki, subject_ns):
"""
wiki = string
subject_ns = object
"""
return load_file('index', 'json') or create_index(
wiki,
subject_ns
)
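# Example (hypothetical wiki URL and namespace values, for illustration only;
# the real ones come from the calling application):
#   index = get_index('https://wiki.example.org', {'id': 100, 'name': 'Publication'})
#   print(index['updated'], len(index['pages']))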
# gets publication's HTML and CSS
def get_publication(wiki, subject_ns, styles_ns, pagename):
"""
wiki = string
subject_ns = object
styles_ns = object
pagename = string
"""
return {
'html': get_html(wiki, subject_ns, pagename),
'css': get_css(wiki, styles_ns, pagename)
}
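# Example (hypothetical wiki URL, namespace dicts and pagename):
#   publication = get_publication(
#       'https://wiki.example.org',
#       {'id': 100, 'name': 'Publication'},
#       {'id': 102, 'name': 'PublicationCSS'},
#       'My_Publication'
#   )
#   publication['html'], publication['css']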
# gets or creates HTML file for a publication
def get_html(wiki, subject_ns, pagename):
"""
wiki = string
subject_ns = object
pagename = string
"""
return load_file(pagename, 'html') or create_html(
wiki,
subject_ns,
pagename,
)
# gets or creates CSS file for a publication
def get_css(wiki, styles_ns, pagename):
"""
wiki = string
styles_ns = object
pagename = string
"""
return load_file(pagename, 'css') or create_css(
wiki,
styles_ns,
pagename
)
# makes API call to create/update index of publications
def create_index(wiki, subject_ns):
"""
wiki = string
subject_ns = object
"""
url = f'{ wiki }/api.php?action=query&format=json&list=allpages&apnamespace={ subject_ns["id"] }'
data = do_API_request(url)
pages = data['query']['allpages']
# exclude subpages
pages = [page for page in pages if '/' not in page['title']]
for page in pages:
# removing the namespace from title
page['title'] = page['title'].replace(subject_ns['name'] + ':', '')
page['slug'] = page['title'].replace(' ', '_') # slugifying title
pageJSON = load_file(page['slug'], 'json')
page['updated'] = pageJSON.get('updated', '--') if pageJSON else '--'
now = str(datetime.datetime.now())
index = {
'pages': pages,
'updated': now
}
save_file('index', 'json', index)
return index
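# The saved index.json looks roughly like this (field values are illustrative):
#   {
#     "pages": [
#       {"pageid": 1, "ns": 100, "title": "My Publication", "slug": "My_Publication", "updated": "--"},
#       ...
#     ],
#     "updated": "2023-01-01 12:00:00.000000"
#   }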
# Creates/updates a publication object
def create_publication(wiki, subject_ns, styles_ns, pagename, full_update):
"""
wiki = string
subject_ns = object
styles_ns = object
pagename = string
"""
return {
'html': create_html(wiki, subject_ns, pagename, full_update),
'css': create_css(wiki, styles_ns, pagename)
}
# makes API call to create/update a publication's HTML
def create_html(wiki, subject_ns, pagename, full_update=None):
"""
wiki = string
subject_ns = object
pagename = string
full_update = None (default) or string; when not None, media files are re-downloaded
"""
url = f'{ wiki }/api.php?action=parse&page={ subject_ns["name"] }:{ pagename }&pst=True&format=json'
data = do_API_request(url, subject_ns["name"]+":"+pagename, wiki)
# pprint(data)
now = str(datetime.datetime.now())
data['updated'] = now
save_file(pagename, 'json', data)
update_publication_date( # we add the last updated of the publication to our index
wiki,
subject_ns,
pagename,
now
)
if data and 'parse' in data:
html = data['parse']['text']['*']
# pprint(html)
imgs = data['parse']['images']
html = remove_comments(html)
html = download_media(html, imgs, wiki, full_update)
html = clean_up(html)
# html = add_item_inventory_links(html)
if fast:
html = fast_loader(html)
soup = BeautifulSoup(html, 'html.parser')
soup = remove_edit(soup)
soup = inlineCiteRefs(soup)
html = str(soup)
# html = inlineCiteRefs(html)
# html = add_author_names_toc(html)
else:
html = None
if html is not None:
save_file(pagename, 'html', html)
return html
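# Example (hypothetical wiki URL and namespace dict): passing any non-None
# value as full_update, e.g. 'full', forces all media files to be downloaded again:
#   html = create_html('https://wiki.example.org', {'id': 100, 'name': 'Publication'}, 'My_Publication', 'full')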
# makes API call to create/update a publication's CSS
def create_css(wiki, styles_ns, pagename):
"""
wiki = string
styles_ns = object
pagename = string
"""
css_url = f'{ wiki }/api.php?action=parse&page={ styles_ns["name"] }:{ pagename }&prop=wikitext&pst=True&format=json'
css_data = do_API_request(css_url)
if css_data and 'parse' in css_data:
css = css_data['parse']['wikitext']['*']
save_file(pagename, 'css', css)
return css
return None
# Load file from disk
def load_file(pagename, ext):
"""
pagename = string
ext = string
"""
path = f'{ STATIC_FOLDER_PATH }/{ pagename }.{ ext }'
if os.path.exists(path):
print(f'Loading { ext }:', path)
with open(path, 'r') as out:
if ext == 'json':
data = json.load(out)
else:
data = out.read()
out.close()
return data
# Save file to disk
def save_file(pagename, ext, data):
"""
pagename = string
ext = string
data = object
"""
path = f'{ STATIC_FOLDER_PATH }/{ pagename }.{ ext }'
print(f'Saving { ext }:', path)
try:
out = open(path, 'w')
except OSError:
print("Could not open/write file:", path)
sys.exit()
with out: #open(path, 'w') as out:
if ext == 'json':
out.write( json.dumps(data, indent = 2) )
else:
out.write( data )
out.close()
return data
# do API request and return JSON
def do_API_request(url, filename="", wiki=""):
"""
url = API request url (string)
data = { 'query':
'pages' :
pageid : {
'links' : {
'?' : '?'
'title' : 'pagename'
}
}
}
}
"""
purge(filename, wiki)
print('Loading from wiki: ', url)
try:
response = urllib.request.urlopen(url)
except urllib.error.URLError as error:
print('Could not reach the wiki:', error)
return None
response_type = response.getheader('Content-Type')
if response.status == 200 and "json" in response_type:
contents = response.read()
data = json.loads(contents)
return data
return None
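# Example (hypothetical wiki URL): a plain query, without purging first:
#   data = do_API_request('https://wiki.example.org/api.php?action=query&list=allpages&apnamespace=100&format=json')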
# api calls seem to be cached even when called with maxage
# So call purge before doing the api call.
# https://www.mediawiki.org/wiki/API:Purge
def purge(filename, wiki):
if filename == "" or wiki == "":
return
print("purge " + filename)
import requests
S = requests.Session()
URL = f'{ wiki }/api.php'
# url = f'{ wiki }/api.php?action=query&list=allimages&aifrom={ filename }&format=json'
PARAMS = {
"action": "purge",
"titles": filename,
"format": "json",
"generator": "alltransclusions",
}
R = S.post(url=URL, params=PARAMS)
# DATA = R.text
# updates a publication's last updated field in the index
def update_publication_date(wiki, subject_ns, pagename, updated):
"""
wiki = string
subject_ns = object
pagename = string
updated = string
"""
index = get_index(wiki, subject_ns)
for page in index['pages']:
if page['slug'] == pagename:
page['updated'] = updated
save_file('index', 'json', index)
def customTemplate(name):
path = "custom/%s.html" % name
if os.path.isfile(os.path.join(os.path.dirname(__file__), "templates/", path)):
return path
else:
return None
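# Example: customTemplate('My_Publication') returns 'custom/My_Publication.html'
# if templates/custom/My_Publication.html exists next to this script, otherwise None.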
# BeautifulSoup seems to have a problem with some comments,
# so let's remove them before parsing.
def remove_comments( html ):
"""
html = string (HTML)
"""
pattern = r'(<!--.*?-->)|(<!--[\S\s]+?-->)|(<!--[\S\s]*?$)'
return re.sub(pattern, "", html)
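# Example: remove_comments('<p>text</p><!-- a comment -->') returns '<p>text</p>'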
# Downloading images referenced in the html
def download_media(html, images, wiki, full_update):
"""
html = string (HTML)
images = list of filenames (str)
"""
# check if 'images/' already exists
if not os.path.exists(f'{ STATIC_FOLDER_PATH }/images'):
os.makedirs(f'{ STATIC_FOLDER_PATH }/images')
# download media files
for filename in images:
filename = filename.replace(' ', '_') # safe filenames
# check if the image is already downloaded
# if not, then download the file
if (not os.path.isfile(f'{ STATIC_FOLDER_PATH }/images/{ filename }')) or full_update:
# first we search for the full filename of the image
url = f'{ wiki }/api.php?action=query&list=allimages&aifrom={ filename }&format=json'
# url = f'{ wiki }/api.php?action=query&titles=File:{ filename }&format=json'
data = do_API_request(url)
# timestamp = data.query.pages.
# print(json.dumps(data, indent=2))
if data and data['query']['allimages']:
# we select the first search result
# (assuming that this is the image we are looking for)
image = data['query']['allimages'][0]
if image:
# then we download the image
image_url = image['url']
image_filename = image['name']
print('Downloading:', image_filename)
image_response = urllib.request.urlopen(image_url).read()
# and we save it as a file
image_path = f'{ STATIC_FOLDER_PATH }/images/{ image_filename }'
out = open(image_path, 'wb')
out.write(image_response)
out.close()
print(image_path)
import time
time.sleep(3) # do not overload the server
# replace src links
e_filename = re.escape( filename ) # needed for filename with certain characters
image_path = f'{ PUBLIC_STATIC_FOLDER_PATH }/images/{ filename }' # the src needs to point to the root of the domain so Flask can serve it; unfortunately this breaks making a fully self-contained local copy of the file
matches = re.findall(rf'src=\"/wiki/mediawiki/images/.*?px-{ e_filename }\"', html) # for debugging
# pprint(matches)
if matches:
html = re.sub(rf'src=\"/wiki/mediawiki/images/.*?px-{ e_filename }\"', f'src=\"{ image_path }\"', html)
else:
matches = re.findall(rf'src=\"/wiki/mediawiki/images/.*?{ e_filename }\"', html) # for debugging
# print(matches, e_filename, html)
html = re.sub(rf'src=\"/wiki/mediawiki/images/.*?{ e_filename }\"', f'src=\"{ image_path }\"', html)
print(f'{filename}: {matches}\n------') # for debugging: each image should have the correct match!
return html
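# Example of the src rewriting above (illustrative thumbnail path):
#   src="/wiki/mediawiki/images/thumb/a/ab/Example.jpg/300px-Example.jpg"
#   becomes
#   src="/static/images/Example.jpg"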
def clean_up(html):
"""
html = string (HTML)
"""
# html = re.sub(r'\[.*edit.*\]', '', html) # remove the [edit] # Heerko: this somehow caused problems; removing it solves them, seemingly without side effects...
html = re.sub(r'href="/index.php\?title=', 'href="#', html) # remove the internal wiki links
html = re.sub(r'&#91;(?=\d)', '', html) # remove left footnote bracket [
html = re.sub(r'(?<=\d)&#93;', '', html) # remove right footnote bracket ]
return html
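# Example: clean_up('<a href="/index.php?title=Page">&#91;1&#93;</a>')
# returns '<a href="#Page">1</a>'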
def remove_edit(soup):
"""
soup = BeautifulSoup object (parsed HTML)
"""
es = soup.find_all(class_="mw-editsection")
for s in es:
s.decompose()
return soup
# inline citation references in the html for pagedjs
# Turns: <sup class="reference" id="cite_ref-1"><a href="#cite_note-1">[1]</a></sup>
# into: <span class="footnote">The cite text</span>
def inlineCiteRefs(soup):
"""
soup = BeautifulSoup object (parsed HTML)
"""
refs = soup.find_all("sup", class_="reference")
for ref in refs:
href = ref.a['href']
res = re.findall('[0-9]+', href)
if(res):
cite = soup.find_all(id="cite_note-"+res[0])
text = cite[0].find(class_="reference-text")
text['class'] = 'footnote'
ref.replace_with(text)
# remove the reference from the bottom of the document
for item in soup.find_all(class_="references"):
item.decompose()
return soup
def fast_loader(html):
"""
html = string (HTML)
"""
html = html.replace('/images/', '/images-small/')
print('--- rendered in FAST mode ---')
return html
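# Example: fast_loader('<img src="/static/images/Example.jpg">')
# returns '<img src="/static/images-small/Example.jpg">'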