wip of #4

2021-09-20 22:06:28 +02:00 · 2021-09-20 22:06:28 +02:00 · 611bea8f24
commit 611bea8f24
parent fbb22d64fc
1 changed files with 15 additions and 3 deletions
--- a/rss_aggregator.py
+++ b/rss_aggregator.py
@ -95,10 +95,10 @@ def create_post(post_dir, entry):

    parsed_content = parse_posts(post_dir, post_content)

-    with open(os.path.join(post_dir,'index.html'),'w') as f:
+    with open(os.path.join(post_dir,'index.html'),'w') as f: #n.b. .html 
        post = template.render(frontmatter=frontmatter, content=parsed_content)
        f.write(post)
-        print('created post for', entry.title, '({})'.format(entry.link))
+        #print('created post for', entry.title, '({})'.format(entry.link))

 def grab_media(post_directory, url):
    """
@ -124,18 +124,27 @@ def grab_media(post_directory, url):
    return url


-def parse_posts(post_direntry, post_content):
+def parse_posts(post_dir, post_content):
    """
    parse the post content to for media items
    replace foreign media item with local copy
    """
    soup = BeautifulSoup(post_content, "html.parser")
+    video_sources = ['youtube.com', 'vimeo.com']
    media = []
+
    for img in soup(['img','object']):
        local_image = grab_media(post_dir, img['src'])
        if img['src'] != local_image:
            print(img['src'], '->', local_image)
            img['src'] = local_image
+
+    for iframe in soup(['iframe']):
+        #TODO figure out how to throw out blocklisted iframes while comparing
+        if video_sources[0] or video_sources[1] not in iframe['src']:
+            print(iframe)
+            #iframe.decompose()
+
    return soup.decode()

 def grab_feed(feed_url):
@ -188,6 +197,9 @@ if not os.path.exists(output_dir):

 template = env.get_template('post_template.md')

+#add iframe to the allowlist of feedparser's sanitizer;
+#filtering of iframes is now handled in parse_posts()
+feedparser.sanitizer._HTMLSanitizer.acceptable_elements |= {'iframe'}   

 for feed_url in feed_urls: