started with a splitter for h2 headers

This commit is contained in:
manetta 2021-09-08 17:42:03 +02:00
parent 1a0c35dc03
commit a97430278d

View File

@ -80,6 +80,16 @@ def download_media(html, images, wiki):
return html
def split_h2_header(html):
    """
    Work-in-progress splitter for h2 headers whose title contains a ":".

    html = string (HTML)

    NOT FUNCTIONAL YET: the regex split below is evaluated, but its
    result is discarded and the input is handed back unchanged.
    """
    # Matches an <h2> whose mw-headline span contains a ":" -- the goal is
    # to split such an h2 in two on the ":".
    h2_regex = re.compile('<h2><span class="mw-headline" id=".*?">.*?:.*?</h2>')
    _fragments = h2_regex.split(html)  # computed for experimentation only
    return html
def clean_up(html):
"""
html = string (HTML)
@ -103,6 +113,7 @@ def parse_page(pagename, wiki):
images = data['parse']['images']
html = download_media(html, images, wiki)
html = clean_up(html)
html = split_h2_header(html)
else:
html = None