From 0554e93de28601f4a1e2cc4fd6f500aedb595086 Mon Sep 17 00:00:00 2001 From: rra Date: Thu, 23 Sep 2021 21:29:25 +0200 Subject: [PATCH] now filters out iframe sources not in allowlist, wip #4 --- rss_aggregator.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/rss_aggregator.py b/rss_aggregator.py index 0e49e30..9ef40e1 100644 --- a/rss_aggregator.py +++ b/rss_aggregator.py @@ -121,7 +121,7 @@ def grab_media(post_directory, url): return image except Exception as e: - print('Failed to download cover image', url) + print('Failed to download image', url) print(e) return url @@ -129,24 +129,22 @@ def grab_media(post_directory, url): def parse_posts(post_dir, post_content): """ parse the post content to for media items - replace foreign media item with local copy + replace foreign image with local copy + filter out iframe sources not in allowlist """ soup = BeautifulSoup(post_content, "html.parser") - video_sources = ['youtube.com', 'vimeo.com'] + allowed_iframe_sources = ['youtube.com', 'vimeo.com', 'tv.lumbung.space'] media = [] for img in soup(['img','object']): local_image = grab_media(post_dir, img['src']) if img['src'] != local_image: - print(img['src'], '->', local_image) img['src'] = local_image for iframe in soup(['iframe']): - #TODO figure out how to throw out blocklisted iframes while comparing - if video_sources[0] or video_sources[1] not in iframe['src']: - print(iframe) - #iframe.decompose() - + if not any(source in iframe['src'] for source in allowed_iframe_sources): + print('filtered iframe: {}...'.format(iframe['src'][:25])) + iframe.decompose() return soup.decode() def grab_feed(feed_url):