From f950f037927f4c93df1a479a69a762a533645ccc Mon Sep 17 00:00:00 2001
From: rra
Date: Sat, 5 Feb 2022 21:38:26 +0100
Subject: [PATCH] handle OPDS feeds from calibre-web

---
 rss_aggregator.py | 149 ++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 124 insertions(+), 25 deletions(-)

diff --git a/rss_aggregator.py b/rss_aggregator.py
index cd27c29..b584b20 100644
--- a/rss_aggregator.py
+++ b/rss_aggregator.py
@@ -1,7 +1,7 @@
 #!/bin/python3
 
-#lumbung.space rss feed aggregator
-#© 2021 roel roscam abbing gplv3 etc
+#lumbung.space rss & opds feed parser
+#© 2022 roel roscam abbing agplv3 etc
 
 import requests
 import jinja2
@@ -60,6 +60,21 @@ def create_frontmatter(entry):
         author = entry.author
     else:
         author = ''
+
+    authors = []
+    if 'authors' in entry:
+        for a in entry.authors:
+            authors.append(a['name'])
+
+    if 'summary' in entry:
+        summary = entry.summary
+    else:
+        summary = ''
+
+    if 'publisher' in entry:
+        publisher = entry.publisher
+    else:
+        publisher = ''
 
     tags = []
     if 'tags' in entry:
@@ -67,15 +82,28 @@ def create_frontmatter(entry):
         for t in entry.tags:
             tags.append(t['term'])
 
-    frontmatter = {
-        'title':entry.title,
-        'date': published.format(),
-        'summary': '',
-        'author': author,
-        'original_link': entry.link,
-        'feed_name': entry['feed_name'],
-        'tags': str(tags)
-    }
+    if "opds" in entry:
+        frontmatter = {
+            'title':entry.title,
+            'date': published.format(),
+            'summary': summary,
+            'author': ",".join(authors),
+            'publisher': publisher,
+            'original_link': entry.links[0]['href'].replace('opds/cover/','books/'),
+            'feed_name': entry['feed_name'],
+            'tags': str(tags),
+            'category': "books"
+        }
+    else:
+        frontmatter = {
+            'title':entry.title,
+            'date': published.format(),
+            'summary': '',
+            'author': author,
+            'original_link': entry.link,
+            'feed_name': entry['feed_name'],
+            'tags': str(tags)
+        }
 
     return frontmatter
 
@@ -100,32 +128,34 @@ def create_post(post_dir, entry):
         f.write(post)
         print('created post for', entry.title, '({})'.format(entry.link))
 
-def grab_media(post_directory, url):
+def grab_media(post_directory, url, preferred_name=None):
     """
     download media linked in post to have local copy
     if download succeeds return new local path otherwise return url
     """
 
-    image = urlparse(url).path.split('/')[-1]
+    media_item = urlparse(url).path.split('/')[-1]
+
+    if preferred_name:
+        media_item = preferred_name
 
     try:
-        if not os.path.exists(os.path.join(post_directory, image)):
+        if not os.path.exists(os.path.join(post_directory, media_item)):
             #TODO: stream is true is a conditional so we could check the headers for things, mimetype etc
             response = requests.get(url, stream=True)
             if response.ok:
-                with open(os.path.join(post_directory, image), 'wb') as img_file:
-                    shutil.copyfileobj(response.raw, img_file)
-                print('Downloaded cover image', image)
-                return image
-            return image
-        elif os.path.exists(os.path.join(post_directory, image)):
-            return image
+                with open(os.path.join(post_directory, media_item), 'wb') as media_file:
+                    shutil.copyfileobj(response.raw, media_file)
+                print('Downloaded media item', media_item)
+                return media_item
+            return media_item
+        elif os.path.exists(os.path.join(post_directory, media_item)):
+            return media_item
     except Exception as e:
         print('Failed to download image', url)
         print(e)
         return url
-
 
 def parse_posts(post_dir, post_content):
     """
     parse the post content to for media items
@@ -147,6 +177,54 @@ def parse_posts(post_dir, post_content):
             iframe.decompose()
     return soup.decode()
 
+def create_opds_post(post_dir, entry):
+    """
+    create a HUGO post based on OPDS entry or
+    update it if the timestamp is newer
+    Downloads the cover & file
+    """
+
+    frontmatter = create_frontmatter(entry)
+
+    if not os.path.exists(post_dir):
+        os.makedirs(post_dir)
+
+    if os.path.exists(os.path.join(post_dir, '.timestamp')):
+        old_timestamp = open(os.path.join(post_dir, '.timestamp')).read()
+        old_timestamp = arrow.get(float(old_timestamp))
+        current_timestamp = arrow.get(entry['updated_parsed'])
+
+        if current_timestamp > old_timestamp:
+            pass
+        else:
+            print('Book "{}..." already up to date'.format(entry['title'][:32]))
+            return
+
+    for item in entry.links:
+        ft = item['type'].split('/')[-1]
+        fn = item['rel'].split('/')[-1]
+
+        if fn == "acquisition":
+            fn = "publication" #calling the publications acquisition is weird
+
+        preferred_name = "{}-{}.{}".format(fn, slugify(entry['title']), ft)
+
+        grab_media(post_dir, item['href'], preferred_name)
+
+    if "summary" in entry:
+        summary = entry.summary
+    else:
+        summary = ""
+
+    with open(os.path.join(post_dir,'index.md'),'w') as f:
+        post = template.render(frontmatter=frontmatter, content=summary)
+        f.write(post)
+        print('created post for Book', entry.title)
+
+    with open(os.path.join(post_dir, '.timestamp'), 'w') as f:
+        timestamp = arrow.get(entry['updated_parsed'])
+        f.write(timestamp.format('X'))
+
 def grab_feed(feed_url):
     """
     check whether feed has been updated
@@ -219,6 +297,14 @@ for feed_url in feed_urls:
     data = grab_feed(feed_url)
 
     if data:
+
+        opds_feed = False
+        for i in data.feed['links']:
+            if i['rel'] == 'self':
+                if 'opds' in i['type']:
+                    opds_feed = True
+                    print("opds!")
+
         for entry in data.entries:
             # if 'tags' in entry:
             #     for tag in entry.tags:
@@ -228,15 +314,28 @@
             entry['feed_name'] = feed_name
 
             post_name = slugify(entry.title)
+
+            if opds_feed:
+                entry['opds'] = True
+                #format: Beyond-Debiasing-Report_Online-75535a4886e3
+                post_name = slugify(entry['title'])+'-'+entry['id'].split('-')[-1]
+
             post_dir = os.path.join(output_dir, feed_name, post_name)
-
+
+
             if post_name not in existing_posts:
                 #if there is a blog entry we dont already have, make it
-                create_post(post_dir, entry)
+                if opds_feed:
+                    create_opds_post(post_dir, entry)
+                else:
+                    create_post(post_dir, entry)
 
             elif post_name in existing_posts:
                 #if we already have it, update it
-                create_post(post_dir, entry)
+                if opds_feed:
+                    create_opds_post(post_dir, entry)
+                else:
+                    create_post(post_dir, entry)
                 existing_posts.remove(post_name)
     # create list of posts which have not been returned by the feed
     for post in existing_posts:
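
A note on the detection logic: an OPDS catalog is plain Atom, so feedparser (which the data.feed / data.entries access above implies the script uses) parses it like any other feed; what marks it as OPDS is the MIME type of its rel="self" link, and each entry then carries its cover and book file as typed links. Below is a minimal standalone sketch of the detection and of the filename scheme used by create_opds_post — the feed URL is hypothetical and python-slugify is assumed:

import feedparser
from slugify import slugify

# Hypothetical calibre-web OPDS endpoint.
data = feedparser.parse('https://example.org/opds/new')

# OPDS feeds advertise themselves in the type of the rel="self" link,
# e.g. 'application/atom+xml;profile=opds-catalog;kind=acquisition'.
opds_feed = any(
    link.get('rel') == 'self' and 'opds' in link.get('type', '')
    for link in data.feed.get('links', [])
)

if opds_feed:
    for entry in data.entries:
        for item in entry.links:
            # Under the OPDS spec the rel ends in 'image', 'thumbnail' or
            # 'acquisition'; the type is a MIME type like 'application/epub+zip'.
            fn = item['rel'].split('/')[-1]
            ft = item['type'].split('/')[-1]
            if fn == 'acquisition':
                fn = 'publication'
            print('{}-{}.{}'.format(fn, slugify(entry['title']), ft))

This is why each book post ends up with one file per advertised link, named along the lines of image-some-title.jpeg and publication-some-title.epub+zip (example names only).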
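
The .timestamp guard in create_opds_post works by round-tripping a Unix timestamp through arrow: format('X') serialises it, arrow.get(float(...)) revives it, and feedparser's updated_parsed struct_time compares directly once wrapped. A sketch of that cycle, with a hypothetical post directory standing in for a real book post:

import os
import time
import arrow

# Hypothetical post directory; create_opds_post makes it on first run.
post_dir = 'output/books/some-book-75535a4886e3'
os.makedirs(post_dir, exist_ok=True)
stamp = os.path.join(post_dir, '.timestamp')

# time.gmtime() stands in for entry['updated_parsed'], which is the
# same struct_time type that arrow.get() accepts.
current_timestamp = arrow.get(time.gmtime())

# Write: arrow's 'X' token formats as a Unix timestamp string.
with open(stamp, 'w') as f:
    f.write(current_timestamp.format('X'))

# Read back on a later run and compare, as create_opds_post does.
old_timestamp = arrow.get(float(open(stamp).read()))
if not current_timestamp > old_timestamp:
    print('already up to date, skipping')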