|
@ -80,6 +80,16 @@ def download_media(html, images, wiki): |
|
|
|
|
|
|
|
|
return html |
|
|
return html |
|
|
|
|
|
|
|
|
|
|
|
def split_h2_header(html):
    """
    Placeholder for splitting <h2> headers in two on the ":".

    html = string (HTML)

    The split itself is not implemented yet, so the HTML is handed back
    exactly as it was received. Callers can already chain this step
    without any effect on their output.
    """
    # TODO: split every header matching the pattern below into two parts
    # at the ":" and write the result back into the HTML:
    #   <h2><span class="mw-headline" id=".*?">.*?:.*?</h2>
    # An earlier draft ran re.split() with this pattern and then threw the
    # result away; that dead work is removed until the real transformation
    # is written.
    return html
|
|
|
|
|
|
|
|
def clean_up(html): |
|
|
def clean_up(html): |
|
|
""" |
|
|
""" |
|
|
html = string (HTML) |
|
|
html = string (HTML) |
|
@ -103,6 +113,7 @@ def parse_page(pagename, wiki): |
|
|
images = data['parse']['images'] |
|
|
images = data['parse']['images'] |
|
|
html = download_media(html, images, wiki) |
|
|
html = download_media(html, images, wiki) |
|
|
html = clean_up(html) |
|
|
html = clean_up(html) |
|
|
|
|
|
html = split_h2_header(html) |
|
|
else: |
|
|
else: |
|
|
html = None |
|
|
html = None |
|
|
|
|
|
|
|
|