data-workers-publication/get_html_from_wiki.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Scrapes the Data Workers publication pages from the Algolit wiki and
# compiles them into one standalone HTML file per language.
from bs4 import BeautifulSoup as bs
from bs4 import Tag
from urllib.request import urlopen
from urllib.parse import urlparse, urljoin
import codecs
import copy
import re

template = """<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Data Workers</title>
<!-- <link rel="stylesheet" href="stylesheet.css"> -->
</head>
<body>
</body>
</html>"""
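

# makeBranch() and makeIndex() build a nested <ul> table of contents from the
# flat sequence of headings and list items on a page. makeBranch() consumes
# the `tags` list recursively: a deeper level opens a sub-list, a shallower
# one closes the current branch and returns to the caller.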
def makeBranch(level, tags, soup):
    branch = soup.new_tag('ul')
    leaf = None
    while len(tags) > 0:
        t = tags[0]
        if t['level'] > level and leaf:
            leaf.append(makeBranch(t['level'], tags, soup))
        elif t['level'] < level:
            if leaf:
                branch.append(leaf)
                leaf = None
            return branch
        else:
            if leaf:
                branch.append(leaf)
                leaf = None
            leaf = soup.new_tag('li')
            leaf.append(tagContent(tags[0]['tag']))
            tags.pop(0)
    if leaf:
        branch.append(leaf)
        leaf = None
    return branch


def makeIndex(soup):
    # Only used by the commented-out index block in get_html_from_wiki().
    eligible_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'li']
    tags = [{'tag': copy.copy(tag), 'level': eligible_tags.index(tag.name)}
            for tag in soup.find_all(eligible_tags)]
    return makeBranch(tags[0]['level'], tags, soup)


def tagContent(tag):
    # Return the text of a tag, joining text spread across child tags.
    if tag.string:
        return tag.string
    else:
        return ''.join(tag.strings)


def classSafeContent(string):
    # Make a string safe for use as a CSS class or id:
    # e.g. 'Data Workers FR' becomes 'data-workers-fr'.
    return re.sub(r'[^\w\-]+', '', re.sub(r'\s+', '-', string.lower()))
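

# makeLemma() fetches one wiki page through MediaWiki's ?action=render and
# wraps its contents in a <section class="lemma ...">; it returns an empty
# string when the page cannot be fetched.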
def makeLemma(title, lemma_type, url, bigSoup, baseurl):
    # baseurl is now passed in explicitly; the original read it from a
    # module-level global that only exists when the driver lines below
    # are uncommented.
    print(url)
    lemma = ''
    try:
        lemmaSoup = bs(urlopen('{}?action=render'.format(url)), 'html.parser')
        lemma = bigSoup.new_tag('section')
        lemma['class'] = 'lemma {} {}'.format(
            classSafeContent(title), classSafeContent(lemma_type))
        header = bigSoup.new_tag('h3')
        header['class'] = 'lemmaheader'
        header.append(title)
        lemma.append(header)
        for t in lemmaSoup.contents:
            if isinstance(t, Tag):
                lemma.append(copy.copy(t))
        # Rewrite relative image sources to absolute wiki URLs.
        for img in lemma.find_all('img'):
            img.attrs['src'] = urljoin(baseurl, img.attrs['src'])
    except Exception:
        print('└──> This page does not exist (yet).')
    return lemma
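

# pageBreaker() builds an empty section that a print stylesheet can use to
# force a page break; it is defined but never called in this script.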
def pageBreaker(soup):
    breaker = soup.new_tag('section')
    breaker.attrs['class'] = 'page-breaker'
    breaker.string = ' '
    return breaker
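

# get_html_from_wiki() fetches the overview page for one language, turns the
# Algolit links in its <ul> lists into lemma sections, and writes the
# assembled document to data-workers.<lang>.html.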
def get_html_from_wiki(lang, url, baseurl):
    print('---\n', lang, url, '\n---')
    soup = bs(template, 'html.parser')
    pageSoup = bs(urlopen(url), 'html.parser')
    container = soup.new_tag('section')
    container['class'] = 'language {}'.format(lang)
    # Add a cover
    # cover = soup.new_tag('section')
    # cover.attrs['class'] = 'cover'
    # cover_img = soup.new_tag('img')
    # cover_img.attrs['src'] = 'img/dw.bw.no-info-text.png'
    # cover.append(cover_img)
    # soup.append(cover)
    # Add an index
    # index = soup.new_tag('section')
    # index.attrs['class'] = 'index'
    # title = soup.new_tag('div')
    # title.attrs['class'] = 'title'
    # index.append('Data Workers')
    # index.append(makeIndex(pageSoup))
    # soup.append(index)
    for child in pageSoup.contents:
        # print(child.name)
        if child.name == 'ul':
            # Every <ul> on the overview page becomes a chapter of lemmas.
            chapter = soup.new_tag('section')
            chapter['class'] = 'group'
            for li in child.find_all('li'):
                links = li.find_all('a')
                if links:
                    # The last link in the item points to the lemma's page.
                    lemma_url = urljoin(baseurl, links[-1].attrs['href'])
                    netloc = urlparse(lemma_url).netloc
                    if re.match(r'.*algolit\.net$', netloc) or re.match(r'.*algolit\.constantvzw\.org$', netloc):
                        title = tagContent(links[-1])
                        if 'stories' in title.lower():
                            lemma_type = 'stories'
                        else:
                            lemma_type = 'works'
                        chapter.append(makeLemma(title, lemma_type, lemma_url, soup, baseurl))
            container.append(chapter)
        else:
            container.append(copy.copy(child))
    # Give every header an id derived from its text, so it can be linked to.
    for header in container.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        header.attrs['id'] = classSafeContent(tagContent(header))
    soup.body.append(container)
    with codecs.open('data-workers.{}.html'.format(lang), 'w+', encoding='utf-8') as out:
        out.write(str(soup))


# baseurl = 'http://www.algolit.net'
# language = 'en'
# url = 'http://www.algolit.net/index.php/Data_Workers?action=render'
# language = 'fr'
# url = 'http://www.algolit.net/index.php/Data_Workers_FR?action=render'
# get_html_from_wiki(language, url, baseurl)
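

# A minimal driver sketch, assuming the commented-out values above are the
# intended defaults (the original was presumably run by uncommenting them):
if __name__ == '__main__':
    baseurl = 'http://www.algolit.net'
    for language, url in [
        ('en', 'http://www.algolit.net/index.php/Data_Workers?action=render'),
        ('fr', 'http://www.algolit.net/index.php/Data_Workers_FR?action=render'),
    ]:
        get_html_from_wiki(language, url, baseurl)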