started with a splitter for h2 headers
This commit is contained in:
parent
1a0c35dc03
commit
a97430278d
@ -80,6 +80,16 @@ def download_media(html, images, wiki):
|
|||||||
|
|
||||||
return html
|
return html
|
||||||
|
|
||||||
|
def split_h2_header(html):
    """
    Work-in-progress splitter for h2 headers.

    html = string (HTML)

    DOES NOT WORK YET!
    The h2 headers are located with a regular expression and the
    HTML is split around them, but the pieces are not used for
    anything so far: the HTML is returned unchanged.
    """
    # Matches an h2 that opens with a "mw-headline" span and has a ":"
    # somewhere before its closing tag.
    pattern = r'<h2><span class="mw-headline" id=".*?">.*?:.*?</h2>'  # split the h2 in two on the ":"

    # TODO: rebuild the headers from the split pieces. Until then the
    # result is discarded on purpose; the call is kept so the function
    # still exercises the pattern against the incoming HTML.
    re.split(pattern, html)

    return html
|
||||||
|
|
||||||
def clean_up(html):
|
def clean_up(html):
|
||||||
"""
|
"""
|
||||||
html = string (HTML)
|
html = string (HTML)
|
||||||
@ -103,6 +113,7 @@ def parse_page(pagename, wiki):
|
|||||||
images = data['parse']['images']
|
images = data['parse']['images']
|
||||||
html = download_media(html, images, wiki)
|
html = download_media(html, images, wiki)
|
||||||
html = clean_up(html)
|
html = clean_up(html)
|
||||||
|
html = split_h2_header(html)
|
||||||
else:
|
else:
|
||||||
html = None
|
html = None
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user