From 162ecce865a09a33c91b8f0f8820211319dc0bd0 Mon Sep 17 00:00:00 2001
From: rra
Date: Tue, 24 Aug 2021 21:18:46 +0200
Subject: [PATCH] fix #3, fix #1, add exception handling for weird edge cases

---
 rss_aggregator.py | 133 ++++++++++++++++++++++++++++------------------
 1 file changed, 81 insertions(+), 52 deletions(-)

diff --git a/rss_aggregator.py b/rss_aggregator.py
index 13672b5..3edb13f 100644
--- a/rss_aggregator.py
+++ b/rss_aggregator.py
@@ -23,10 +23,10 @@ def write_etag(feed_name, feed_data):
     etag = ''
     modified = ''
 
-    if 'etag' in data:
-        etag = data.etag
-    if 'modified' in data:
-        modified = data.modified
+    if 'etag' in feed_data:
+        etag = feed_data.etag
+    if 'modified' in feed_data:
+        modified = feed_data.modified
 
     if etag or modified:
         with open(os.path.join('etags',feed_name +'.txt'),'w') as f:
@@ -56,11 +56,16 @@ def create_frontmatter(entry):
         published = arrow.get(published)
 
+    if 'author' in entry:
+        author = entry.author
+    else:
+        author = ''
+
     frontmatter = {
         'title':entry.title,
         'date': published.format(),
         'summary': '',
-        'author': entry.author,
+        'author': author,
         'original_link': entry.link
     }
 
@@ -75,7 +80,10 @@ def create_post(post_dir, entry):
     if not os.path.exists(post_dir):
         os.makedirs(post_dir)
 
-    post_content = entry.content[0].value
+    if 'content' in entry:
+        post_content = entry.content[0].value
+    else:
+        post_content = entry.summary
 
     parsed_content = parse_posts(post_dir, post_content)
 
@@ -91,20 +99,16 @@ def grab_media(post_directory, url):
     """
     image = urlparse(url).path.split('/')[-1]
 
-    try:
-        #TODO: stream is true is a conditional so we could check the headers for things, mimetype etc
-        response = requests.get(url, stream=True)
-    except Exception as e:
-        print(e)
-        return url
-
     try:
         if not os.path.exists(os.path.join(post_directory, image)):
-            with open(os.path.join(post_directory, image), 'wb') as img_file:
-                shutil.copyfileobj(response.raw, img_file)
-            print('Downloaded cover image', image)
-            return image
-        return image
+            #TODO: stream is true is a conditional so we could check the headers for things, mimetype etc
+            response = requests.get(url, stream=True)
+            if response.ok:
+                with open(os.path.join(post_directory, image), 'wb') as img_file:
+                    shutil.copyfileobj(response.raw, img_file)
+                print('Downloaded cover image', image)
+                return image
+        return image
 
     except Exception as e:
         print('Failed to download cover image', url)
@@ -120,14 +124,41 @@ def parse_posts(post_direntry, post_content):
     soup = BeautifulSoup(post_content, "html.parser")
     media = []
     for img in soup(['img','object']):
-        local_image = grab_media(post_dir, img['src'])
-        if img['src'] != local_image:
             print(img['src'], '->', local_image)
             img['src'] = local_image
     return soup.decode()
-
+
+def grab_feed(feed_url):
+    """
+    check whether feed has been updated
+    download & return it if it has
+    """
+    feed_name = urlparse(feed_url).netloc
+
+    etag, modified = get_etag(feed_name)
+
+    try:
+        if modified:
+            data = feedparser.parse(feed_url, modified=modified)
+        elif etag:
+            data = feedparser.parse(feed_url, etag=etag)
+        else:
+            data = feedparser.parse(feed_url)
+    except Exception as e:
+        print('Error grabbing feed')
+        print(feed_name)
+        print(e)
+        return False
+
+    print(data.status, feed_url)
+    if data.status == 200:
+        #304 means the feed has not been modified since we last checked
+        write_etag(feed_name, data)
+        return data
+    return False
+
 
 feed_urls = open('feeds_list.txt','r').read().splitlines()
 
@@ -141,8 +172,8 @@ env = jinja2.Environment(
     loader=jinja2.FileSystemLoader(os.path.curdir)
     )
 
-output_dir = os.environ.get('OUTPUT_DIR', '/home/r/Programming/lumbung.space/lumbung.space-web/content/posts/')
-#output_dir = os.environ.get('OUTPUT_DIR', 'network/')
+#output_dir = os.environ.get('OUTPUT_DIR', '/home/r/Programming/lumbung.space/lumbung.space-web/content/posts/')
+output_dir = os.environ.get('OUTPUT_DIR', 'network/')
 
 if not os.path.exists(output_dir):
     os.makedirs(output_dir)
@@ -150,42 +181,40 @@ if not os.path.exists(output_dir):
 
 template = env.get_template('post_template.md')
 
-for feed_url in feed_urls[7:]:
-    feed_name = urlparse(feed_url).netloc
+for feed_url in feed_urls:
 
-    etag, modified = get_etag(feed_name)
-    if modified:
-        data = feedparser.parse(feed_url, modified=modified)
-    elif etag:
-        data = feedparser.parse(feed_url, etag=etag)
-    else:
-        data = feedparser.parse(feed_url)
+    feed_name = urlparse(feed_url).netloc
 
-    print(data.status, feed_url)
+    feed_dir = os.path.join(output_dir, feed_name)
 
-    if data.status == 200:
+    if not os.path.exists(feed_dir):
+        os.makedirs(feed_dir)
 
-        #write_etag(feed_url, data)
-
-        # if 'title' in data.feed:
-        #     print('#'*10)
-        #     print(data.feed.title)
-        #     print('#'*10)
-        #     print('\n')
+    existing_posts = os.listdir(feed_dir)
 
-        # print('FEED KEYS')
-        # print(data.keys())
-        # print('\n')
+    data = grab_feed(feed_url)
 
+    if data:
         for entry in data.entries:
-            # print(entry.title)
-            # print(entry.keys())
-            # print('\n')
-            # # if 'tags' in entry:
-            # #     print(entry.title, entry.tags)
-
-            post_dir = os.path.join(output_dir, feed_name, slugify(entry.title))
-            create_post(post_dir, entry)
+            # if 'tags' in entry:
+            #     print(entry.title, entry.tags)
+
+            post_name = slugify(entry.title)
+            post_dir = os.path.join(output_dir, feed_name, post_name)
+
+            if post_name not in existing_posts:
+                #if there is a blog entry we dont already have, make it
+                create_post(post_dir, entry)
+
+            elif post_name in existing_posts:
+                #if we already have it, update it
+                create_post(post_dir, entry)
+                existing_posts.remove(post_name) # create list of posts which have not been returned by the feed
+
+        for post in existing_posts:
+            #remove blog posts no longer returned by the RSS feed
+            print('deleted', post)
+            shutil.rmtree(os.path.join(feed_dir, slugify(post)))
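
Note (not part of the diff): a minimal sketch of the conditional-request flow that the new grab_feed() helper relies on, assuming only feedparser's documented etag/modified parameters and status attribute; the feed URL and helper name below are hypothetical.

# sketch: fetch a feed only if it changed since the cached ETag / Last-Modified
import feedparser

def fetch_if_changed(feed_url, etag=None, modified=None):
    # feedparser sends If-None-Match / If-Modified-Since when etag/modified are given
    data = feedparser.parse(feed_url, etag=etag, modified=modified)
    # 200 means fresh content; 304 means unchanged since the cached values
    if getattr(data, 'status', None) == 200:
        return data
    return None

# usage:
# feed = fetch_if_changed('https://example.org/feed.xml', etag='"abc123"')
# if feed:
#     print(len(feed.entries), 'entries to (re)build')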