#! /usr/bin/env python3
# -*- coding: utf-8 -*-
# Scrape the Data Workers pages from the Algolit wiki and assemble them
# into one printable HTML file per language (data-workers.<lang>.html).

from bs4 import BeautifulSoup as bs
from bs4 import NavigableString, Tag
from urllib.request import urlopen
from urllib.parse import urlparse, urljoin
import codecs
import copy
import re


template = """<!DOCTYPE html>
<html>
  <head>
    <meta charset="utf-8">
    <title>Data Workers</title>
    <!-- <link rel="stylesheet" href="stylesheet.css"> -->
  </head>
  <body>
  </body>
</html>"""


def makeBranch(level, tags, soup):
    # Recursively turn a flat list of {'tag': ..., 'level': ...} entries into
    # nested <ul>/<li> elements: descend when the level increases, return to
    # the caller when it decreases.
    branch = soup.new_tag('ul')
    leaf = None
    while len(tags) > 0:
        t = tags[0]
        if t['level'] > level and leaf:
            leaf.append(makeBranch(t['level'], tags, soup))
        elif t['level'] < level:
            if leaf:
                branch.append(leaf)
                leaf = None
            return branch
        else:
            if leaf:
                branch.append(leaf)
                leaf = None

            leaf = soup.new_tag('li')
            leaf.append(tagContent(tags[0]['tag']))
            tags.pop(0)

    if leaf:
        branch.append(leaf)
        leaf = None

    return branch


def makeIndex(soup):
    eligible_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'li']
    buffer = soup.new_tag('ul')
    tags = [{'tag': copy.copy(tag), 'level': eligible_tags.index(tag.name)}
            for tag in soup.find_all(eligible_tags)]
    return makeBranch(tags[0]['level'], tags, soup)


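# Sketch of the expected shape (not produced by the original script): for a
# page whose headings are h1 "A", h2 "B", h2 "C", makeIndex would return
# roughly <ul><li>A<ul><li>B</li><li>C</li></ul></li></ul>.

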
def tagContent(tag):
    # Return the tag's text: its single string if it has one, otherwise all
    # nested strings concatenated.
    if tag.string:
        return tag.string
    else:
        return ''.join(tag.strings)


def classSafeContent(string):
    # Lowercase, replace runs of whitespace with '-', then strip anything that
    # is not a word character or a hyphen.
    return re.sub(r'[^\w\-]+', '', re.sub(r'\s+', '-', string.lower()))


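# For example (a sketch, not part of the original script):
# classSafeContent('Data Workers (EN)') -> 'data-workers-en'

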
def makeLemma(title, lemma_type, url, bigSoup, baseurl):
    # Fetch a single wiki page ("lemma") and wrap its rendered content in a
    # <section> whose class names are derived from its title and type.
    # baseurl is used to resolve relative image URLs.
    print(url)
    try:
        lemma = ''
        lemmaSoup = bs(urlopen('{}?action=render'.format(url)), 'html.parser')
        lemma = bigSoup.new_tag("section")
        lemma['class'] = 'lemma {} {}'.format(classSafeContent(title), classSafeContent(lemma_type))

        header = bigSoup.new_tag('h3')
        header['class'] = 'lemmaheader'
        header.append(title)

        lemma.append(header)

        for t in lemmaSoup.contents:
            if isinstance(t, Tag):
                lemma.append(copy.copy(t))

        for img in lemma.find_all('img'):
            img.attrs['src'] = urljoin(baseurl, img.attrs['src'])
    except Exception:
        print('└──> This page does not exist (yet).')
    return lemma


def pageBreaker(soup):
    # Create an empty <section class="page-breaker"> element (not called
    # anywhere in this script).
    breaker = soup.new_tag('section')
    breaker.attrs['class'] = 'page-breaker'
    breaker.string = ' '
    return breaker


def get_html_from_wiki(lang, url, baseurl):
    # Fetch the rendered overview page, turn every linked lemma into a
    # <section class="group"> chapter and write the result, wrapped in the
    # HTML template above, to data-workers.<lang>.html.
    print('---\n', lang, url, '\n---')
    soup = bs(template, 'html.parser')
    pageSoup = bs(urlopen(url), 'html.parser')
    container = soup.new_tag('section')
    container['class'] = 'language {}'.format(lang)

    # Add a cover
    # cover = soup.new_tag('section')
    # cover.attrs['class'] = 'cover'
    # cover_img = soup.new_tag('img')
    # cover_img.attrs['src'] = 'img/dw.bw.no-info-text.png'
    # cover.append(cover_img)
    # soup.append(cover)

    # Add an index
    # index = soup.new_tag('section')
    # index.attrs['class'] = 'index'
    # title = soup.new_tag('div')
    # title.attrs['class'] = 'title'
    # index.append('Data Workers')
    # index.append(makeIndex(pageSoup))
    # soup.append(index)

    for child in pageSoup.contents:
        # print(child.name)
        if child.name == 'ul':
            chapter = soup.new_tag('section')
            chapter['class'] = 'group'

            for li in child.find_all('li'):
                links = li.find_all('a')
                if links:
                    url = urljoin(baseurl, links[-1].attrs['href'])
                    # Only follow links that stay on the Algolit wikis.
                    if re.match(r'.*algolit\.net$', urlparse(url).netloc) or re.match(r'.*algolit\.constantvzw\.org$', urlparse(url).netloc):
                        title = tagContent(links[-1])
                        if 'stories' in title.lower():
                            lemma_type = 'stories'
                        else:
                            lemma_type = 'works'
                        chapter.append(makeLemma(title, lemma_type, url, soup, baseurl))

            container.append(chapter)

        else:
            container.append(copy.copy(child))

    # Give every header an id derived from its text so it can be linked to.
    for header in container.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        header.attrs['id'] = classSafeContent(tagContent(header))

    soup.body.append(container)

    with codecs.open('data-workers.{}.html'.format(lang), 'w+', encoding='utf-8') as out:
        out.write(str(soup))


# baseurl = 'http://www.algolit.net'
# language = 'en'
# url = 'http://www.algolit.net/index.php/Data_Workers?action=render'
# language = 'fr'
# url = 'http://www.algolit.net/index.php/Data_Workers_FR?action=render'
# get_html_from_wiki(language, url, baseurl)
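

# A minimal sketch of how the script could be run directly; the values simply
# repeat the commented-out English example above, and this runner block is an
# assumption, not part of the original script.
if __name__ == '__main__':
    baseurl = 'http://www.algolit.net'
    language = 'en'
    url = 'http://www.algolit.net/index.php/Data_Workers?action=render'
    get_html_from_wiki(language, url, baseurl)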