@@ -121,7 +121,7 @@ def grab_media(post_directory, url):
         return image
 
     except Exception as e:
-        print('Failed to download cover image', url)
+        print('Failed to download image', url)
         print(e)
         return url
 
@@ -129,24 +129,22 @@ def grab_media(post_directory, url):
 def parse_posts(post_dir, post_content):
     """
     parse the post content to for media items
-    replace foreign media item with local copy
+    replace foreign image with local copy
+    filter out iframe sources not in allowlist
     """
     soup = BeautifulSoup(post_content, "html.parser")
-    video_sources = ['youtube.com', 'vimeo.com']
+    allowed_iframe_sources = ['youtube.com', 'vimeo.com', 'tv.lumbung.space']
     media = []
 
     for img in soup(['img','object']):
         local_image = grab_media(post_dir, img['src'])
         if img['src'] != local_image:
-            print(img['src'], '->', local_image)
             img['src'] = local_image
 
     for iframe in soup(['iframe']):
-        #TODO figure out how to throw out blocklisted iframes while comparing
-        if video_sources[0] or video_sources[1] not in iframe['src']:
-            print(iframe)
-            #iframe.decompose()
-
+        if not any(source in iframe['src'] for source in allowed_iframe_sources):
+            print('filtered iframe: {}...'.format(iframe['src'][:25]))
+            iframe.decompose()
     return soup.decode()
 
 def grab_feed(feed_url):
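A standalone sketch of the allowlist membership test introduced in parse_posts above: only the allowed_iframe_sources list comes from the diff, the example src values are made up for illustration.

# minimal sketch of the iframe allowlist check used in parse_posts
# the example src values below are hypothetical
allowed_iframe_sources = ['youtube.com', 'vimeo.com', 'tv.lumbung.space']

for src in [
    'https://www.youtube.com/embed/abc123',       # kept, matches 'youtube.com'
    'https://tv.lumbung.space/videos/embed/xyz',  # kept, matches 'tv.lumbung.space'
    'https://ads.example.com/frame',              # filtered, matches nothing in the list
]:
    if not any(source in src for source in allowed_iframe_sources):
        print('filtered iframe: {}...'.format(src[:25]))
    else:
        print('kept iframe:', src)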