diff --git a/rss_aggregator.py b/rss_aggregator.py
index e8d52d6..4731686 100644
--- a/rss_aggregator.py
+++ b/rss_aggregator.py
@@ -95,10 +95,10 @@ def create_post(post_dir, entry):
 
     parsed_content = parse_posts(post_dir, post_content)
 
-    with open(os.path.join(post_dir,'index.html'),'w') as f:
+    with open(os.path.join(post_dir,'index.html'),'w') as f: #n.b. .html
         post = template.render(frontmatter=frontmatter, content=parsed_content)
         f.write(post)
-        print('created post for', entry.title, '({})'.format(entry.link))
+        #print('created post for', entry.title, '({})'.format(entry.link))
 
 def grab_media(post_directory, url):
     """
@@ -124,18 +124,27 @@ def grab_media(post_directory, url):
 
     return url
 
-def parse_posts(post_direntry, post_content):
+def parse_posts(post_dir, post_content):
     """
     parse the post content to for media items
    replace foreign media item with local copy
    """
     soup = BeautifulSoup(post_content, "html.parser")
+    video_sources = ['youtube.com', 'vimeo.com']
     media = []
+
     for img in soup(['img','object']):
         local_image = grab_media(post_dir, img['src'])
         if img['src'] != local_image:
             print(img['src'], '->', local_image)
             img['src'] = local_image
+
+    for iframe in soup(['iframe']):
+        #TODO figure out how to throw out blocklisted iframes while comparing
+        if video_sources[0] or video_sources[1] not in iframe['src']:
+            print(iframe)
+            #iframe.decompose()
+
     return soup.decode()
 
 def grab_feed(feed_url):
@@ -188,6 +197,9 @@ if not os.path.exists(output_dir):
 
 template = env.get_template('post_template.md')
 
+#add iframe to the allowlist of feedparser's sanitizer,
+#this is now handled in parse_post()
+feedparser.sanitizer._HTMLSanitizer.acceptable_elements |= {'iframe'}
 
 
 for feed_url in feed_urls:
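
The iframe filter added in parse_posts() carries a TODO because the condition does not do what the comment intends: Python reads it as (video_sources[0]) or (video_sources[1] not in iframe['src']), and since video_sources[0] is a non-empty string the test is always true. Below is a minimal, self-contained sketch of the comparison the TODO seems to be after, assuming the same video_sources allowlist and that iframes from non-allowlisted hosts should be removed with decompose(); the sample HTML is made up for illustration.

    from bs4 import BeautifulSoup

    # Sketch only: keep iframes whose src points at an allowlisted video host,
    # drop everything else. Reuses the video_sources list from the diff above;
    # the html string here is a hypothetical example input.
    video_sources = ['youtube.com', 'vimeo.com']
    html = ('<iframe src="https://www.youtube.com/embed/x"></iframe>'
            '<iframe src="https://tracker.example/ad"></iframe>')
    soup = BeautifulSoup(html, 'html.parser')

    for iframe in soup(['iframe']):
        src = iframe.get('src', '')
        if not any(source in src for source in video_sources):
            print('dropping iframe:', src)
            iframe.decompose()  # remove the tag from the parsed tree

    print(soup.decode())  # only the allowlisted YouTube embed remains

Checking the src with any() over the allowlist keeps the comparison correct if more hosts are added to video_sources later.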