now filters out iframe sources not in allowlist, wip #4

This commit is contained in:
rra 2021-09-23 21:29:25 +02:00
parent 17d7faac4d
commit 0554e93de2

View File

@ -121,7 +121,7 @@ def grab_media(post_directory, url):
return image return image
except Exception as e: except Exception as e:
print('Failed to download cover image', url) print('Failed to download image', url)
print(e) print(e)
return url return url
@ -129,24 +129,22 @@ def grab_media(post_directory, url):
def parse_posts(post_dir, post_content):
    """
    Parse the post content for media items.

    Replaces the source of each foreign image with the value returned by
    grab_media (which hands back the original url when a download fails)
    and removes every iframe whose source host is not in the allowlist.

    post_dir: directory handed to grab_media for storing downloaded media.
    post_content: the post body as an HTML string.

    Returns the rewritten HTML as a string.
    """
    # Local import keeps this block self-contained; the rest of the
    # module's import section is not visible from here.
    from urllib.parse import urlparse

    allowed_iframe_sources = ['youtube.com', 'vimeo.com', 'tv.lumbung.space']

    def _is_allowed(src):
        # Compare the parsed hostname, not the raw string: a substring test
        # would accept 'https://evil.example/?x=youtube.com' or
        # 'https://youtube.com.evil.example/'.
        host = (urlparse(src).hostname or '').lower()
        return any(
            host == source or host.endswith('.' + source)
            for source in allowed_iframe_sources
        )

    soup = BeautifulSoup(post_content, "html.parser")

    for img in soup(['img', 'object']):
        # <object> elements usually carry 'data' rather than 'src', and an
        # <img> may lack 'src' entirely; skip those instead of raising
        # KeyError.
        src = img.get('src')
        if not src:
            continue
        local_image = grab_media(post_dir, src)
        if src != local_image:
            img['src'] = local_image

    for iframe in soup(['iframe']):
        # An iframe without a src has an empty host and is filtered out.
        src = iframe.get('src') or ''
        if not _is_allowed(src):
            print('filtered iframe: {}...'.format(src[:25]))
            iframe.decompose()

    return soup.decode()
def grab_feed(feed_url): def grab_feed(feed_url):