#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup as bs
from bs4 import NavigableString, Tag
from urllib.request import urlopen
from urllib.parse import urlparse, urljoin
import codecs
import copy
import re


template = """<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Data Workers</title>
<!-- <link rel="stylesheet" href="stylesheet.css"> -->
</head>
<body>
</body>
</html>"""


def makeBranch(level, tags, soup):
    # Recursively turn a flat list of tagged elements (each with a nesting
    # 'level') into nested <ul>/<li> structures.
    branch = soup.new_tag('ul')
    leaf = None
    while len(tags) > 0:
        t = tags[0]
        if t['level'] > level and leaf:
            # Deeper level: nest a sub-branch inside the current leaf.
            leaf.append(makeBranch(t['level'], tags, soup))
        elif t['level'] < level:
            # Shallower level: close this branch and return to the caller.
            if leaf:
                branch.append(leaf)
                leaf = None
            return branch
        else:
            # Same level: flush the previous leaf and start a new one.
            if leaf:
                branch.append(leaf)
                leaf = None

            leaf = soup.new_tag('li')
            leaf.append(tagContent(tags[0]['tag']))
            tags.pop(0)
    if leaf:
        branch.append(leaf)
        leaf = None
    return branch


def makeIndex(soup):
    # Build a nested index from all headers and list items on the page.
    eligible_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'li']
    buffer = soup.new_tag('ul')
    tags = [{'tag': copy.copy(tag), 'level': eligible_tags.index(tag.name)}
            for tag in soup.find_all(eligible_tags)]
    return makeBranch(tags[0]['level'], tags, soup)
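

# A sketch of what makeIndex produces (the input fragment is hypothetical):
#   makeIndex(bs('<h1>A</h1><h2>B</h2><h1>C</h1>', 'html.parser'))
# returns roughly <ul><li>A<ul><li>B</li></ul></li><li>C</li></ul>.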


def tagContent(tag):
    # Return the plain text of a tag, joining nested strings when needed.
    if tag.string:
        return tag.string
    else:
        return ''.join(tag.strings)


def classSafeContent(string):
    # Lowercase, turn whitespace into dashes and drop anything that is not
    # a word character or a dash, so the result is safe as a class or id.
    return re.sub(r'[^\w\-]+', '', re.sub(r'\s+', '-', string.lower()))
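

# For instance (hypothetical input, not taken from the wiki):
#   classSafeContent('Data Workers (FR)') == 'data-workers-fr'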


def makeLemma(title, lemma_type, url, bigSoup, baseurl):
    # Fetch a single wiki page (a "lemma") and wrap its rendered content
    # in a <section> carrying the title and type as class names.
    print(url)
    try:
        lemma = ''
        lemmaSoup = bs(urlopen('{}?action=render'.format(url)), 'html.parser')
        lemma = bigSoup.new_tag("section")
        lemma['class'] = 'lemma {} {}'.format(classSafeContent(title), classSafeContent(lemma_type))

        header = bigSoup.new_tag('h3')
        header['class'] = 'lemmaheader'
        header.append(title)

        lemma.append(header)

        for t in lemmaSoup.contents:
            if isinstance(t, Tag):
                lemma.append(copy.copy(t))

        # Make image paths absolute so they resolve outside the wiki.
        for img in lemma.find_all('img'):
            img.attrs['src'] = urljoin(baseurl, img.attrs['src'])
    except Exception:
        print('└──> This page does not exist (yet).')
    return lemma


def pageBreaker(soup):
    # An empty section that the stylesheet can use to force a page break.
    breaker = soup.new_tag('section')
    breaker.attrs['class'] = 'page-breaker'
    breaker.string = ' '
    return breaker


def get_html_from_wiki(lang, url, baseurl):
    print('---\n', lang, url, '\n---')
    soup = bs(template, 'html.parser')
    pageSoup = bs(urlopen(url), 'html.parser')
    container = soup.new_tag('section')
    container['class'] = 'language {}'.format(lang)

    # Add a cover
    # cover = soup.new_tag('section')
    # cover.attrs['class'] = 'cover'
    # cover_img = soup.new_tag('img')
    # cover_img.attrs['src'] = 'img/dw.bw.no-info-text.png'
    # cover.append(cover_img)
    # soup.append(cover)

    # Add an index
    # index = soup.new_tag('section')
    # index.attrs['class'] = 'index'
    # title = soup.new_tag('div')
    # title.attrs['class'] = 'title'
    # index.append('Data Workers')
    # index.append(makeIndex(pageSoup))
    # soup.append(index)

    for child in pageSoup.contents:
        # print(child.name)
        if child.name == 'ul':
            # Top-level lists on the overview page group the lemmas.
            chapter = soup.new_tag('section')
            chapter['class'] = 'group'

            for li in child.find_all('li'):
                links = li.find_all('a')
                if links:
                    url = urljoin(baseurl, links[-1].attrs['href'])
                    # Only follow links that stay on the algolit wikis.
                    if re.match('.*algolit.net$', urlparse(url).netloc) or re.match('.*algolit.constantvzw.org$', urlparse(url).netloc):
                        title = tagContent(links[-1])
                        if 'stories' in title.lower():
                            lemma_type = 'stories'
                        else:
                            lemma_type = 'works'
                        chapter.append(makeLemma(title, lemma_type, url, soup, baseurl))

            container.append(chapter)

        else:
            container.append(copy.copy(child))

    # Give every header an id derived from its text, so it can be linked to.
    for header in container.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        header.attrs['id'] = classSafeContent(tagContent(header))
        header.replace_with(header)

    soup.body.append(container)

    with codecs.open('data-workers.{}.html'.format(lang), 'w+', encoding='utf-8') as out:
        out.write(str(soup))
        out.close()


# baseurl = 'http://www.algolit.net'
# language = 'en'
# url = 'http://www.algolit.net/index.php/Data_Workers?action=render'
# language = 'fr'
# url = 'http://www.algolit.net/index.php/Data_Workers_FR?action=render'
# get_html_from_wiki(language, url, baseurl)
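

# A minimal usage sketch based on the commented-out example above: build the
# English edition from the Data Workers overview page. The URL and language
# code are taken from those comments and are not verified here.
if __name__ == '__main__':
    baseurl = 'http://www.algolit.net'
    language = 'en'
    url = 'http://www.algolit.net/index.php/Data_Workers?action=render'
    get_html_from_wiki(language, url, baseurl)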