Browse Source

started with a splitter for h2 headers

master
manetta 3 years ago
parent
commit
a97430278d
  1. 11
      command-line/update.py

11
command-line/update.py

@ -80,6 +80,16 @@ def download_media(html, images, wiki):
return html
def split_h2_header(html):
    """
    html = string (HTML)

    Work-in-progress helper meant to split h2 headlines in two on the ":".
    DOES NOT WORK YET: the HTML is returned unchanged.

    Returns the same value that was passed in (None and "" included).
    """
    if not html:
        # Nothing to scan (None or empty string): hand it back untouched
        # instead of letting re.split() raise on a non-string.
        return html

    # Matches '<h2><span class="mw-headline" id="...">' followed (lazily, on
    # one line) by text containing a ":" up to the next '</h2>'.
    pattern = r'<h2><span class="mw-headline" id=".*?">.*?:.*?</h2>'

    # TODO: the pieces surrounding each matching headline are computed here
    # but not yet reassembled into the output, so nothing is modified.
    _segments = re.split(pattern, html)

    return html
def clean_up(html):
"""
html = string (HTML)
@ -103,6 +113,7 @@ def parse_page(pagename, wiki):
images = data['parse']['images']
html = download_media(html, images, wiki)
html = clean_up(html)
html = split_h2_header(html)
else:
html = None

Loading…
Cancel
Save