commit 6f7786346969d5ad8f262ef9cb6acfa96e71afe7
Author: rra
Date:   Wed Aug 18 21:41:38 2021 +0200

    first prototype RSS aggregator

diff --git a/feeds_list.txt b/feeds_list.txt
new file mode 100644
index 0000000..8ca02c0
--- /dev/null
+++ b/feeds_list.txt
@@ -0,0 +1,12 @@
+https://www.masartemasaccion.org/feed/
+https://fafswag.wordpress.com/feed/
+https://wajukuuarts.wordpress.com/feed/
+https://sakakini.org/feed/
+https://inland.org/feed/
+https://jatiwangiartfactory.tumblr.com/feed/
+https://brittoartstrust.org/feed/
+https://artivismo.org/feed/
+http://www.festivalsegou.org/spip.php?page=backend&lang=fr
+https://gudskul.art/feed/
+https://projectartworks.org/feed/
+https://ruangrupa.id/feed/
\ No newline at end of file
diff --git a/post_template.md b/post_template.md
new file mode 100644
index 0000000..2dc0adc
--- /dev/null
+++ b/post_template.md
@@ -0,0 +1,11 @@
+---
+title: "{{ frontmatter.title }}"
+date: "{{ frontmatter.date }}" # e.g. 2021-06-10T10:46:33+02:00
+draft: false
+summary: "{{ frontmatter.summary }}"
+author: "{{ frontmatter.author }}"
+original_link: "{{ frontmatter.original_link }}"
+
+---
+
+{{ content }}
\ No newline at end of file
diff --git a/rss_aggregator.py b/rss_aggregator.py
new file mode 100644
index 0000000..13672b5
--- /dev/null
+++ b/rss_aggregator.py
@@ -0,0 +1,181 @@
+#!/usr/bin/env python3
+
+# lumbung.space rss feed aggregator
+# © 2021 roel roscam abbing gplv3 etc
+
+import os
+import shutil
+import time
+from ast import literal_eval as make_tuple
+from urllib.parse import urlparse
+
+import arrow
+import feedparser
+import jinja2
+import requests
+from bs4 import BeautifulSoup
+from slugify import slugify
+
+
+def write_etag(feed_name, feed_data):
+    """
+    save the etag and last-modified timestamp of a parsed feed
+    """
+    etag = ''
+    modified = ''
+
+    if 'etag' in feed_data:
+        etag = feed_data.etag
+    if 'modified' in feed_data:
+        modified = feed_data.modified
+
+    if etag or modified:
+        with open(os.path.join('etags', feed_name + '.txt'), 'w') as f:
+            f.write(str((etag, modified)))
+
+
+def get_etag(feed_name):
+    """
+    return the etag and last-modified timestamp saved for a feed
+    """
+    fn = os.path.join('etags', feed_name + '.txt')
+    etag = ''
+    modified = ''
+
+    if os.path.exists(fn):
+        with open(fn, 'r') as f:
+            etag, modified = make_tuple(f.read())
+
+    return etag, modified
+
+
+def create_frontmatter(entry):
+    """
+    parse RSS entry metadata and return it as hugo frontmatter
+    """
+    # prefer the updated timestamp over the published one when both exist
+    published = entry.get('updated_parsed') or entry.get('published_parsed')
+
+    published = arrow.get(published)
+
+    frontmatter = {
+        'title': entry.title,
+        'date': published.format(),
+        'summary': '',
+        'author': entry.get('author', ''),
+        'original_link': entry.link
+    }
+
+    return frontmatter
+
+
+def create_post(post_dir, entry):
+    """
+    write a hugo post based on an RSS entry
+    """
+    frontmatter = create_frontmatter(entry)
+
+    if not os.path.exists(post_dir):
+        os.makedirs(post_dir)
+
+    post_content = entry.content[0].value
+
+    parsed_content = parse_posts(post_dir, post_content)
+
+    with open(os.path.join(post_dir, 'index.html'), 'w') as f:
+        post = template.render(frontmatter=frontmatter, content=parsed_content)
+        f.write(post)
+    print('created post for', entry.title, '({})'.format(entry.link))
+
+
+def grab_media(post_directory, url):
+    """
+    download media linked in a post to have a local copy
+    if the download succeeds return the new local path, otherwise return the url
+    """
+    image = urlparse(url).path.split('/')[-1]
+
+    try:
+        # TODO: with stream=True we could inspect the response headers
+        # (mimetype, size etc.) before committing to the download
+        response = requests.get(url, stream=True)
+    except Exception as e:
+        print(e)
+        return url
+
+    try:
+        if not os.path.exists(os.path.join(post_directory, image)):
+            with open(os.path.join(post_directory, image), 'wb') as img_file:
+                shutil.copyfileobj(response.raw, img_file)
+            print('Downloaded cover image', image)
+        return image
+    except Exception as e:
+        print('Failed to download cover image', url)
+        print(e)
+        return url
+
+
+def parse_posts(post_dir, post_content):
+    """
+    parse the post content for media items and
+    replace each foreign media item with a local copy
+    """
+    soup = BeautifulSoup(post_content, "html.parser")
+    for img in soup(['img', 'object']):
+        # some elements (e.g. <object>) may not carry a src attribute
+        if 'src' not in img.attrs:
+            continue
+
+        local_image = grab_media(post_dir, img['src'])
+
+        if img['src'] != local_image:
+            print(img['src'], '->', local_image)
+            img['src'] = local_image
+    return soup.decode()
+
+
+with open('feeds_list.txt', 'r') as f:
+    feed_urls = f.read().splitlines()
+
+start = time.time()
+
+if not os.path.exists('etags'):
+    os.mkdir('etags')
+
+env = jinja2.Environment(
+    loader=jinja2.FileSystemLoader(os.path.curdir)
+)
+
+output_dir = os.environ.get('OUTPUT_DIR', '/home/r/Programming/lumbung.space/lumbung.space-web/content/posts/')
+
+if not os.path.exists(output_dir):
+    os.makedirs(output_dir)
+
+template = env.get_template('post_template.md')
+
+for feed_url in feed_urls:
+    feed_name = urlparse(feed_url).netloc
+
+    # make a conditional request when we have cached validators so that
+    # unchanged feeds come back as 304 instead of a full download
+    etag, modified = get_etag(feed_name)
+    if modified:
+        data = feedparser.parse(feed_url, modified=modified)
+    elif etag:
+        data = feedparser.parse(feed_url, etag=etag)
+    else:
+        data = feedparser.parse(feed_url)
+
+    status = data.get('status')
+    print(status, feed_url)
+
+    if status == 200:
+        write_etag(feed_name, data)
+
+        for entry in data.entries:
+            post_dir = os.path.join(output_dir, feed_name, slugify(entry.title))
+            create_post(post_dir, entry)
+
+end = time.time()
+
+print(end - start)
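
A note on the etag cache: get_etag() recovers exactly the pair that write_etag() stored by round-tripping a stringified tuple through ast.literal_eval. A minimal standalone sketch of that round-trip, using hypothetical validator values (the feed name and header strings below are made up for illustration):

    import os
    from ast import literal_eval as make_tuple

    os.makedirs('etags', exist_ok=True)

    # hypothetical validators, shaped like feedparser exposes them on a parsed feed
    etag = '"abc123"'
    modified = 'Wed, 18 Aug 2021 19:41:38 GMT'

    # write_etag() serialises the pair as a stringified tuple...
    with open(os.path.join('etags', 'example.org.txt'), 'w') as f:
        f.write(str((etag, modified)))

    # ...and get_etag() recovers it safely with ast.literal_eval,
    # which only evaluates literals, never arbitrary code
    with open(os.path.join('etags', 'example.org.txt')) as f:
        assert make_tuple(f.read()) == (etag, modified)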
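
Similarly, a minimal sketch of the rendering step inside create_post(), run from the repository root so that post_template.md is on jinja2's template path; the frontmatter values here are hypothetical, shaped like the dict create_frontmatter() returns:

    import jinja2

    env = jinja2.Environment(loader=jinja2.FileSystemLoader('.'))
    template = env.get_template('post_template.md')

    # hypothetical entry metadata for illustration only
    frontmatter = {
        'title': 'Example post',
        'date': '2021-08-18T21:41:38+02:00',
        'summary': '',
        'author': 'rra',
        'original_link': 'https://example.org/example-post/',
    }

    # jinja2 resolves {{ frontmatter.title }} against dict keys as well,
    # so the template works unchanged with this plain dict
    print(template.render(frontmatter=frontmatter, content='<p>Hello world</p>'))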