now filters out iframe sources not in allowlist, wip #4

This commit is contained in:
rra 2021-09-23 21:29:25 +02:00
parent 17d7faac4d
commit 0554e93de2

View File

@ -121,7 +121,7 @@ def grab_media(post_directory, url):
return image
except Exception as e:
print('Failed to download cover image', url)
print('Failed to download image', url)
print(e)
return url
@ -129,24 +129,22 @@ def grab_media(post_directory, url):
def _iframe_source_allowed(src, allowed_sources):
    """Return True if the host of *src* is an allowed source or a subdomain of one."""
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urlparse

    try:
        host = (urlparse(src).hostname or '').lower()
    except ValueError:
        # Malformed URL (e.g. a broken IPv6 literal): treat as not allowed.
        return False
    # Compare against the hostname rather than the whole URL, so an
    # allowlisted name hidden in a path or query string
    # ("https://evil.example/?x=youtube.com") does not slip through.
    return any(
        host == source or host.endswith('.' + source)
        for source in allowed_sources
    )


def parse_posts(post_dir, post_content):
    """
    Parse the post content for media items.

    - replace each foreign image with a local copy (via grab_media)
    - filter out iframes whose source is not in the allowlist

    post_dir: directory handed to grab_media for storing downloads.
    post_content: HTML of the post, as a string.

    Returns the rewritten HTML as a string.
    """
    soup = BeautifulSoup(post_content, "html.parser")
    allowed_iframe_sources = ['youtube.com', 'vimeo.com', 'tv.lumbung.space']

    for img in soup(['img', 'object']):
        src = img.get('src')
        if not src:
            # <object> usually carries `data` instead of `src`, and an <img>
            # may lack it entirely; there is nothing to localise, so skip
            # instead of raising KeyError.
            continue
        local_image = grab_media(post_dir, src)
        if src != local_image:
            print(src, '->', local_image)
            img['src'] = local_image

    for iframe in soup(['iframe']):
        # An iframe without a src cannot be matched against the allowlist
        # and is therefore filtered out as well.
        src = iframe.get('src') or ''
        if not _iframe_source_allowed(src, allowed_iframe_sources):
            print('filtered iframe: {}...'.format(src[:25]))
            iframe.decompose()

    return soup.decode()
def grab_feed(feed_url):