started with a splitter for h2 headers
This commit is contained in:
parent
1a0c35dc03
commit
a97430278d
@ -80,6 +80,16 @@ def download_media(html, images, wiki):
|
||||
|
||||
return html
|
||||
|
||||
def split_h2_header(html):
    """
    Work-in-progress splitter for h2 headers that contain a ":".

    html = string (HTML)

    Not functional yet: the markup is split around every matching
    h2 header, but the resulting pieces are discarded and the input
    is handed back unchanged.
    """
    # an <h2> wrapping a "mw-headline" span, with a ":" somewhere before
    # the closing </h2> -- the ":" is where the header should be cut in two
    h2_with_colon = '<h2><span class="mw-headline" id=".*?">.*?:.*?</h2>'

    # the fragments are not used yet; only the pattern is exercised
    _fragments = re.split(h2_with_colon, html)

    return html
|
||||
|
||||
def clean_up(html):
|
||||
"""
|
||||
html = string (HTML)
|
||||
@ -103,6 +113,7 @@ def parse_page(pagename, wiki):
|
||||
images = data['parse']['images']
|
||||
html = download_media(html, images, wiki)
|
||||
html = clean_up(html)
|
||||
html = split_h2_header(html)
|
||||
else:
|
||||
html = None
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user