data-workers-publication/get_html_from_wiki.py
#! /usr/bin/env python3
# -*- coding: utf-8 -*-
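"""Scrape the Data Workers publication from the Algolit wiki and compile it,
per language, into a single standalone HTML file (data-workers.<lang>.html).

The script fetches a wiki overview page, follows the lemma links it lists
('works' and 'stories') and nests each lemma as a <section> in a minimal
HTML template.
"""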
from bs4 import BeautifulSoup as bs
from bs4 import Tag
from urllib.request import urlopen
from urllib.parse import urlparse, urljoin
import codecs
import copy
import re

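# Empty shell that get_html_from_wiki() fills with one language edition.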
template = """<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Data Workers</title>
<!-- <link rel="stylesheet" href="stylesheet.css"> -->
</head>
<body>
</body>
</html>"""


def makeBranch(level, tags, soup):
    """Recursively fold the flat list of tags into nested <ul> branches."""
    branch = soup.new_tag('ul')
    leaf = None
    while len(tags) > 0:
        t = tags[0]
        if t['level'] > level and leaf:
            # Deeper heading: nest a sub-branch inside the current leaf.
            leaf.append(makeBranch(t['level'], tags, soup))
        elif t['level'] < level:
            # Shallower heading: close this branch and return to the parent.
            if leaf:
                branch.append(leaf)
            return branch
        else:
            # Same level: flush the previous leaf and start a new one.
            if leaf:
                branch.append(leaf)
            leaf = soup.new_tag('li')
            leaf.append(tagContent(tags[0]['tag']))
            tags.pop(0)
    if leaf:
        branch.append(leaf)
    return branch


def makeIndex(soup):
    """Build a nested table of contents from the page's headings and list items.

    Used by the commented-out index section in get_html_from_wiki().
    """
    eligible_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'li']
    tags = [{'tag': copy.copy(tag), 'level': eligible_tags.index(tag.name)}
            for tag in soup.find_all(eligible_tags)]
    return makeBranch(tags[0]['level'], tags, soup)
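# For instance, headings h1 'A', h2 'B', h2 'C' come out as
# <ul><li>A<ul><li>B</li><li>C</li></ul></li></ul>.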


def tagContent(tag):
    """Return the plain-text content of a tag."""
    if tag.string:
        return tag.string
    else:
        return ''.join(tag.strings)


def classSafeContent(string):
    """Slugify text so it is safe to use as an HTML class or id."""
    return re.sub(r'[^\w\-]+', '', re.sub(r'\s+', '-', string.lower()))
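# e.g. classSafeContent('Data Workers!') -> 'data-workers'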


def makeLemma(title, lemma_type, url, baseurl, bigSoup):
    """Fetch a single wiki lemma and wrap it in a classed <section>."""
    print(url)
    lemma = ''
    try:
        lemmaSoup = bs(urlopen('{}?action=render'.format(url)), 'html.parser')
        lemma = bigSoup.new_tag('section')
        lemma['class'] = 'lemma {} {}'.format(
            classSafeContent(title), classSafeContent(lemma_type))
        header = bigSoup.new_tag('h3')
        header['class'] = 'lemmaheader'
        header.append(title)
        lemma.append(header)
        for t in lemmaSoup.contents:
            if isinstance(t, Tag):
                lemma.append(copy.copy(t))
        # Make relative image sources absolute so they resolve outside the wiki.
        for img in lemma.find_all('img'):
            img.attrs['src'] = urljoin(baseurl, img.attrs['src'])
    except Exception:
        print('└──> This page does not exist (yet).')
    return lemma


def pageBreaker(soup):
    """Return an empty section that the stylesheet can use to force a page break."""
    breaker = soup.new_tag('section')
    breaker.attrs['class'] = 'page-breaker'
    breaker.string = ' '
    return breaker


def get_html_from_wiki(lang, url, baseurl):
    """Compile one language edition of the publication into data-workers.<lang>.html."""
    print('---\n', lang, url, '\n---')
    soup = bs(template, 'html.parser')
    pageSoup = bs(urlopen(url), 'html.parser')
    container = soup.new_tag('section')
    container['class'] = 'language {}'.format(lang)

    # Add a cover
    # cover = soup.new_tag('section')
    # cover.attrs['class'] = 'cover'
    # cover_img = soup.new_tag('img')
    # cover_img.attrs['src'] = 'img/dw.bw.no-info-text.png'
    # cover.append(cover_img)
    # soup.append(cover)

    # Add an index
    # index = soup.new_tag('section')
    # index.attrs['class'] = 'index'
    # title = soup.new_tag('div')
    # title.attrs['class'] = 'title'
    # index.append('Data Workers')
    # index.append(makeIndex(pageSoup))
    # soup.append(index)

    for child in pageSoup.contents:
        if child.name == 'ul':
            # Each top-level <ul> on the overview page lists the lemmas of one chapter.
            chapter = soup.new_tag('section')
            chapter['class'] = 'group'
            for li in child.find_all('li'):
                links = li.find_all('a')
                if links:
                    lemma_url = urljoin(baseurl, links[-1].attrs['href'])
                    netloc = urlparse(lemma_url).netloc
                    if re.match(r'.*algolit\.net$', netloc) or re.match(r'.*algolit\.constantvzw\.org$', netloc):
                        title = tagContent(links[-1])
                        if 'stories' in title.lower():
                            lemma_type = 'stories'
                        else:
                            lemma_type = 'works'
                        chapter.append(makeLemma(title, lemma_type, lemma_url, baseurl, soup))
            container.append(chapter)
        else:
            container.append(copy.copy(child))

    # Give every header an id so in-document links can target it.
    for header in container.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        header.attrs['id'] = classSafeContent(tagContent(header))

    soup.body.append(container)

    with codecs.open('data-workers.{}.html'.format(lang), 'w+', encoding='utf-8') as out:
        out.write(str(soup))

# baseurl = 'http://www.algolit.net'
# language = 'en'
# url = 'http://www.algolit.net/index.php/Data_Workers?action=render'
# language = 'fr'
# url = 'http://www.algolit.net/index.php/Data_Workers_FR?action=render'
# get_html_from_wiki(language, url, baseurl)
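
# A minimal runnable sketch based on the commented-out values above; it
# assumes these Algolit wiki pages are still reachable. Running it writes
# data-workers.en.html and data-workers.fr.html next to this script.
if __name__ == '__main__':
    baseurl = 'http://www.algolit.net'
    editions = {
        'en': 'http://www.algolit.net/index.php/Data_Workers?action=render',
        'fr': 'http://www.algolit.net/index.php/Data_Workers_FR?action=render',
    }
    for language, page_url in editions.items():
        get_html_from_wiki(language, page_url, baseurl)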