from pprint import pprint
import sys
import urllib.request
import urllib.error
import os
import re
import json
import jinja2
import datetime
from bs4 import BeautifulSoup

STATIC_FOLDER_PATH = './static'        # without trailing slash
PUBLIC_STATIC_FOLDER_PATH = '/static'  # without trailing slash
TEMPLATES_DIR = None

# This uses a low quality copy of all the images
# (using a folder with the name "images-small",
# which stores a copy of all the images generated with:
# $ mogrify -quality 5% -adaptive-resize 25% -remap pattern:gray50 * )
fast = False


# gets or creates index of publications in namespace
def get_index(wiki, subject_ns):
    """
    wiki = string
    subject_ns = object
    """
    return load_file('index', 'json') or create_index(wiki, subject_ns)


# gets publication's HTML and CSS
def get_publication(wiki, subject_ns, styles_ns, pagename):
    """
    wiki = string
    subject_ns = object
    styles_ns = object
    pagename = string
    """
    return {
        'html': get_html(wiki, subject_ns, pagename),
        'css': get_css(wiki, styles_ns, pagename)
    }


# gets or creates HTML file for a publication
def get_html(wiki, subject_ns, pagename):
    """
    wiki = string
    subject_ns = object
    pagename = string
    """
    return load_file(pagename, 'html') or create_html(wiki, subject_ns, pagename)


# gets or creates CSS file for a publication
def get_css(wiki, styles_ns, pagename):
    """
    wiki = string
    styles_ns = object
    pagename = string
    """
    return load_file(pagename, 'css') or create_css(wiki, styles_ns, pagename)


# makes API call to create/update index of publications
def create_index(wiki, subject_ns):
    """
    wiki = string
    subject_ns = object
    """
    url = f'{ wiki }/api.php?action=query&format=json&list=allpages&apnamespace={ subject_ns["id"] }'
    data = do_API_request(url)
    pages = data['query']['allpages']
    # exclude subpages
    pages = [page for page in pages if '/' not in page['title']]
    for page in pages:
        # removing the namespace from title
        page['title'] = page['title'].replace(subject_ns['name'] + ':', '')
        page['slug'] = page['title'].replace(' ', '_')  # slugifying title
        pageJSON = load_file(page['slug'], 'json')
        page['updated'] = pageJSON and pageJSON['updated'] or '--'
    now = str(datetime.datetime.now())
    index = {
        'pages': pages,
        'updated': now
    }
    save_file('index', 'json', index)
    return index
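
# For reference, create_index() writes an index with roughly the shape
# sketched below to ./static/index.json. The concrete values are
# illustrative placeholders, not real data: pageid and ns come straight
# from the MediaWiki allpages response, while title, slug and updated
# are set in the loop above.
#
#   {
#     "pages": [
#       {
#         "pageid": 1,
#         "ns": 100,
#         "title": "Example Publication",
#         "slug": "Example_Publication",
#         "updated": "--"
#       }
#     ],
#     "updated": "2024-01-01 12:00:00.000000"
#   }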

# Creates/updates a publication object
def create_publication(wiki, subject_ns, styles_ns, pagename, full_update):
    """
    wiki = string
    subject_ns = object
    styles_ns = object
    pagename = string
    full_update = None or string. Full update when not None
    """
    return {
        'html': create_html(wiki, subject_ns, pagename, full_update),
        'css': create_css(wiki, styles_ns, pagename)
    }


# makes API call to create/update a publication's HTML
def create_html(wiki, subject_ns, pagename, full_update=None):
    """
    wiki = string
    subject_ns = object
    pagename = string
    full_update = None or string. Full update when not None
    """
    url = f'{ wiki }/api.php?action=parse&page={ subject_ns["name"] }:{ pagename }&pst=True&format=json'
    data = do_API_request(url, subject_ns["name"] + ":" + pagename, wiki)
    # pprint(data)
    now = str(datetime.datetime.now())
    data['updated'] = now
    save_file(pagename, 'json', data)
    # we add the last updated of the publication to our index
    update_publication_date(wiki, subject_ns, pagename, now)
    if 'parse' in data:
        html = data['parse']['text']['*']
        # pprint(html)
        imgs = data['parse']['images']
        html = remove_comments(html)
        html = download_media(html, imgs, wiki, full_update)
        html = clean_up(html)
        # html = add_item_inventory_links(html)
        if fast == True:
            html = fast_loader(html)
        soup = BeautifulSoup(html, 'html.parser')
        soup = remove_edit(soup)
        soup = inlineCiteRefs(soup)
        html = str(soup)
        # html = inlineCiteRefs(html)
        # html = add_author_names_toc(html)
    else:
        html = None
    save_file(pagename, 'html', html)
    return html


# makes API call to create/update a publication's CSS
def create_css(wiki, styles_ns, pagename):
    """
    wiki = string
    styles_ns = object
    pagename = string
    """
    css_url = f'{ wiki }/api.php?action=parse&page={ styles_ns["name"] }:{ pagename }&prop=wikitext&pst=True&format=json'
    css_data = do_API_request(css_url)
    if css_data and 'parse' in css_data:
        css = css_data['parse']['wikitext']['*']
        save_file(pagename, 'css', css)
        return css


# Load file from disk
def load_file(pagename, ext):
    """
    pagename = string
    ext = string
    """
    path = f'{ STATIC_FOLDER_PATH }/{ pagename }.{ ext }'
    if os.path.exists(path):
        print(f'Loading { ext }:', path)
        with open(path, 'r') as out:
            if ext == 'json':
                data = json.load(out)
            else:
                data = out.read()
        return data


# Save file to disk
def save_file(pagename, ext, data):
    """
    pagename = string
    ext = string
    data = object
    """
    path = f'{ STATIC_FOLDER_PATH }/{ pagename }.{ ext }'
    print(f'Saving { ext }:', path)
    try:
        out = open(path, 'w')
    except OSError:
        print("Could not open/write file:", path)
        sys.exit()
    with out:
        if ext == 'json':
            out.write(json.dumps(data, indent=2))
        else:
            out.write(data)
    return data


# do API request and return JSON
def do_API_request(url, filename="", wiki=""):
    """
    url = API request url (string)
    filename = page title to purge before the request (string, optional)
    wiki = wiki base URL, used for the purge request (string, optional)

    Returns the decoded JSON response, for example:

    data = {
        'query': {
            'pages': {
                pageid: {
                    'links': {
                        '?': '?',
                        'title': 'pagename'
                    }
                }
            }
        }
    }
    """
    purge(filename, wiki)
    print('Loading from wiki: ', url)
    response = urllib.request.urlopen(url)
    response_type = response.getheader('Content-Type')
    if response.status == 200 and "json" in response_type:
        contents = response.read()
        data = json.loads(contents)
        return data
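
# Minimal usage sketch for do_API_request(), mirroring the parse call made
# in create_html() above. The wiki URL and page title are placeholder
# assumptions, not values defined in this module:
#
#   wiki = 'https://wiki.example.org'
#   title = 'Publishing:Some_Page'
#   url = f'{ wiki }/api.php?action=parse&page={ title }&pst=True&format=json'
#   data = do_API_request(url, title, wiki)
#   if data and 'parse' in data:
#       html = data['parse']['text']['*']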

# API calls seem to be cached even when called with maxage,
# so call purge before doing the API call.
# https://www.mediawiki.org/wiki/API:Purge
def purge(filename, wiki):
    if filename == "" or wiki == "":
        return
    print("purge " + filename)
    import requests
    S = requests.Session()
    URL = f'{ wiki }/api.php'
    # url = f'{ wiki }/api.php?action=query&list=allimages&aifrom={ filename }&format=json'
    PARAMS = {
        "action": "purge",
        "titles": filename,
        "format": "json",
        "generator": "alltransclusions",
    }
    R = S.post(url=URL, params=PARAMS)
    # DATA = R.text


# updates a publication's last updated field in the index
def update_publication_date(wiki, subject_ns, pagename, updated):
    """
    wiki = string
    subject_ns = object
    pagename = string
    updated = string
    """
    index = get_index(wiki, subject_ns)
    for page in index['pages']:
        if page['slug'] == pagename:
            page['updated'] = updated
    save_file('index', 'json', index)


def customTemplate(name):
    path = "custom/%s.html" % name
    if os.path.isfile(os.path.join(os.path.dirname(__file__), "templates/", path)):
        return path
    else:
        return None


# BeautifulSoup seems to have a problem with some comments,
# so let's remove them before parsing.
def remove_comments(html):
    """
    html = string (HTML)
    """
    pattern = r'()|()|(