Browse Source

wip of #4

master
rra 3 years ago
parent
commit
611bea8f24
  1. 18
      rss_aggregator.py

18
rss_aggregator.py

@ -95,10 +95,10 @@ def create_post(post_dir, entry):
parsed_content = parse_posts(post_dir, post_content) parsed_content = parse_posts(post_dir, post_content)
with open(os.path.join(post_dir,'index.html'),'w') as f: with open(os.path.join(post_dir,'index.html'),'w') as f: #n.b. .html
post = template.render(frontmatter=frontmatter, content=parsed_content) post = template.render(frontmatter=frontmatter, content=parsed_content)
f.write(post) f.write(post)
print('created post for', entry.title, '({})'.format(entry.link)) #print('created post for', entry.title, '({})'.format(entry.link))
def grab_media(post_directory, url): def grab_media(post_directory, url):
""" """
@ -124,18 +124,27 @@ def grab_media(post_directory, url):
return url return url
def parse_posts(post_dir, post_content):
    """
    Parse the post content for media items and replace each foreign
    media item with a local copy.

    post_dir: directory of the post, passed through to grab_media().
    post_content: HTML of the post as a string.

    Returns the rewritten HTML as a string.
    """
    soup = BeautifulSoup(post_content, "html.parser")
    # Hosts whose embedded players are allowed to stay in the post.
    video_sources = ['youtube.com', 'vimeo.com']

    for element in soup(['img', 'object']):
        # Tags without a src (e.g. <object data=...>) have nothing to fetch;
        # indexing them directly would raise KeyError.
        src = element.get('src')
        if not src:
            continue
        local_image = grab_media(post_dir, src)
        if src != local_image:
            print(src, '->', local_image)
            element['src'] = local_image

    for iframe in soup(['iframe']):
        iframe_src = iframe.get('src', '')
        # The previous test `video_sources[0] or ... not in src` was always
        # true because a non-empty string is truthy; check every allowed
        # host explicitly instead.
        if not any(source in iframe_src for source in video_sources):
            print(iframe)
            # TODO figure out how to throw out blocklisted iframes
            # (candidate: iframe.decompose())

    return soup.decode()
def grab_feed(feed_url): def grab_feed(feed_url):
@ -188,6 +197,9 @@ if not os.path.exists(output_dir):
template = env.get_template('post_template.md') template = env.get_template('post_template.md')
#add iframe to the allowlist of feedparser's sanitizer,
#iframe filtering is now handled in parse_posts()
feedparser.sanitizer._HTMLSanitizer.acceptable_elements |= {'iframe'}
for feed_url in feed_urls: for feed_url in feed_urls:

Loading…
Cancel
Save