From 6aaad14ff0fadf2f2a76ef0fbd4dfec23e130b13 Mon Sep 17 00:00:00 2001
From: mb
Date: Fri, 10 Mar 2023 12:28:34 +0100
Subject: [PATCH] circulations prototypes

---
 aggregator.py                 | 431 ++++++++++++++++++++++++++++++++++
 feeds.txt                     |   3 +
 requirements.txt              |   9 +
 stylesheet.css                |  20 ++
 templates/index.template.html | 105 +++++++++
 templates/post.template.html  |   2 +
 6 files changed, 570 insertions(+)
 create mode 100644 aggregator.py
 create mode 100644 feeds.txt
 create mode 100644 requirements.txt
 create mode 100644 stylesheet.css
 create mode 100644 templates/index.template.html
 create mode 100644 templates/post.template.html

diff --git a/aggregator.py b/aggregator.py
new file mode 100644
index 0000000..f0c4afc
--- /dev/null
+++ b/aggregator.py
@@ -0,0 +1,431 @@
+import os
+import shutil
+import time
+from hashlib import md5
+from ast import literal_eval as make_tuple
+from pathlib import Path
+from urllib.parse import urlparse
+from re import sub
+
+import arrow
+import feedparser
+import jinja2
+import requests
+from bs4 import BeautifulSoup
+from slugify import slugify
+from re import compile as re_compile
+yamlre = re_compile('"')
+
+import pprint
+
+db = {}
+
+template_dir = os.path.join(Path(__file__).parent.resolve(), "templates")
+env = jinja2.Environment(loader=jinja2.FileSystemLoader(template_dir))
+
+def write_etag(feed_name, feed_data):
+    """
+    save the etag and last-modified timestamp returned by the feed
+    """
+    etag = ""
+    modified = ""
+
+    if "etag" in feed_data:
+        etag = feed_data.etag
+    if "modified" in feed_data:
+        modified = feed_data.modified
+
+    if etag or modified:
+        with open(os.path.join("etags", feed_name + ".txt"), "w") as f:
+            f.write(str((etag, modified)))
+
+
+def get_etag(feed_name):
+    """
+    return the stored etag and last-modified timestamp of a feed
+    """
+    fn = os.path.join("etags", feed_name + ".txt")
+    etag = ""
+    modified = ""
+
+    if os.path.exists(fn):
+        etag, modified = make_tuple(open(fn, "r").read())
+
+    return etag, modified
+
+
+def create_frontmatter(entry):
+    """
+    parse RSS metadata and return it as frontmatter
+    """
+    published = None
+    if 'published' in entry:
+        published = entry.published_parsed
+    elif 'updated' in entry:
+        published = entry.updated_parsed
+
+    if not published:
+        published = "2023-03-09T16:31:47.294841"
+        # !!! placeholder hack for now, to make this whole script work
+
+    published = arrow.get(published)
+
+    if 'author' in entry:
+        author = entry.author
+    else:
+        author = ''
+
+    authors = []
+    if 'authors' in entry:
+        for a in entry.authors:
+            authors.append(a['name'])
+
+    if 'summary' in entry:
+        summary = entry.summary
+    else:
+        summary = ''
+
+    if 'publisher' in entry:
+        publisher = entry.publisher
+    else:
+        publisher = ''
+
+    tags = []
+    if 'tags' in entry:
+        #TODO finish categories
+        for t in entry.tags:
+            tags.append(t['term'])
+
+    if "featured_image" in entry:
+        featured_image = entry.featured_image
+    else:
+        featured_image = ''
+
+    card_type = "network"
+    if entry.feed_name == "pen.lumbung.space":
+        card_type = "pen"
+
+    if "opds" in entry:
+        frontmatter = {
+            'title': entry.title,
+            'date': published.format(),
+            'summary': summary,
+            'author': ",".join(authors),
+            'publisher': publisher,
+            'original_link': entry.links[0]['href'].replace('opds/cover/', 'books/'),
+            'feed_name': entry['feed_name'],
+            'tags': str(tags),
+            'category': "books"
+        }
+    else:
+        frontmatter = {
+            'title': entry.title,
+            'date': published.format(),
+            'summary': '',
+            'author': author,
+            'original_link': entry.link,
+            'feed_name': entry['feed_name'],
+            'tags': str(tags),
+            'card_type': card_type,
+            'featured_image': featured_image
+        }
+
+    return frontmatter
+
+def sanitize_yaml(frontmatter):
+    """
+    Escape any occurrences of double quotes
+    in any of the frontmatter fields
+    See: https://docs.octoprint.org/en/master/configuration/yaml.html#interesting-data-types
+    """
+    for k, v in frontmatter.items():
+        if isinstance(v, list):
+            # some fields are lists
+            l = []
+            for i in v:
+                i = yamlre.sub('\\"', i)
+                l.append(i)
+            frontmatter[k] = l
+
+        else:
+            v = yamlre.sub('\\"', v)
+            frontmatter[k] = v
+
+    return frontmatter
+
+def parse_enclosures(post_dir, entry):
+    """
+    Parse feed enclosures, which are featured media items.
+    These can be featured images but also podcast entries.
+    https://pythonhosted.org/feedparser/reference-entry-enclosures.html
+    """
+    #TODO parse more than images
+    #TODO handle the fact it could be multiple items
+
+    for e in entry.enclosures:
+        if "type" in e:
+            print("found enclosed media", e.type)
+            if "image/" in e.type:
+                featured_image = grab_media(post_dir, e.href)
+                entry["featured_image"] = featured_image
+            else:
+                print("FIXME: ignoring enclosed", e.type)
+    return entry
+
+def parse_content(post_dir, entry):
+    if "enclosures" in entry:
+        entry = parse_enclosures(post_dir, entry)
+
+    frontmatter = create_frontmatter(entry)
+    print(">>> frontmatter:", frontmatter)
+
+    if not os.path.exists(post_dir):
+        os.makedirs(post_dir)
+
+    if "content" in entry:
+        post_content = entry.content[0].value
+    else:
+        post_content = entry.summary
+
+    parsed_content = parse_posts(post_dir, post_content)
+
+    return parsed_content, frontmatter
+
+def create_post(post_dir, parsed_content, frontmatter):
+    """
+    write a hugo-style post based on an RSS entry
+    """
+    template_dir = os.path.join(Path(__file__).parent.resolve(), "templates")
+    env = jinja2.Environment(loader=jinja2.FileSystemLoader(template_dir))
+    template = env.get_template("post.template.html")
+
+    with open(os.path.join(post_dir, "index.html"), "w") as f:  # n.b. .html
+        post = template.render(frontmatter=sanitize_yaml(frontmatter), content=parsed_content)
+        f.write(post)
+        print("created post for " + frontmatter["title"] + " (" + frontmatter["original_link"] + ")")
+
+
+def add_to_db(post_dir, parsed_content, frontmatter):
+    db[post_dir] = {}
+    db[post_dir]["content"] = parsed_content
+    db[post_dir]["frontmatter"] = frontmatter
+
+
+def grab_media(post_directory, url, preferred_name=None):
+    """
+    download media linked in a post so we have a local copy
+    if the download succeeds return the new local path, otherwise return the url
+    """
+    media_item = urlparse(url).path.split('/')[-1]
+
+    headers = {
+        'User-Agent': 'https://git.autonomic.zone/ruangrupa/lumbunglib',
+        'From': 'info@lumbung.space'  # This is another valid field
+    }
+    if preferred_name:
+        media_item = preferred_name
+
+    try:
+        if not os.path.exists(os.path.join(post_directory, media_item)):
+            #TODO: with stream=True we could check the headers (mimetype etc.) before downloading the body
+            response = requests.get(url, headers=headers, stream=True)
+            if response.ok:
+                with open(os.path.join(post_directory, media_item), 'wb') as media_file:
+                    shutil.copyfileobj(response.raw, media_file)
+                print('Downloaded media item', media_item)
+                return media_item
+            else:
+                print("Download failed", response.status_code)
+                return url
+        elif os.path.exists(os.path.join(post_directory, media_item)):
+            return media_item
+
+    except Exception as e:
+        print('Failed to download image', url)
+        print(e)
+
+    return url
+
+
+def parse_posts(post_dir, post_content):
+    """
+    parse the post content for media items
+    replace foreign images with a local copy
+    filter out iframe sources not in the allowlist
+    """
+    soup = BeautifulSoup(post_content, "html.parser")
+    allowed_iframe_sources = ["youtube.com", "vimeo.com", "tv.lumbung.space"]
+
+    for img in soup(["img", "object"]):
+        if img.get("src") is not None:
+            local_image = grab_media(post_dir, img["src"])
+            if img["src"] != local_image:
+                img["src"] = local_image
+
+    for iframe in soup(["iframe"]):
+        if not any(source in iframe["src"] for source in allowed_iframe_sources):
+            print("filtered iframe: {}...".format(iframe["src"][:25]))
+            iframe.decompose()
+
+    return soup.decode()
+
+
+def grab_feed(feed_url):
+    """
+    check whether the feed has been updated
+    download & return it if it has
+    """
+    feed_name = urlparse(feed_url).netloc
+
+    etag, modified = get_etag(feed_name)
+
+    # !!! disabled for now, for testing
+    # try:
+    #     if modified:
+    #         data = feedparser.parse(feed_url, modified=modified)
+    #     elif etag:
+    #         data = feedparser.parse(feed_url, etag=etag)
+    #     else:
+    #         data = feedparser.parse(feed_url)
+    # except Exception as e:
+    #     print("Error grabbing feed")
+    #     print(feed_name)
+    #     print(e)
+    #     return False
+
+    data = feedparser.parse(feed_url)
+
+    if "status" in data:
+        print(data.status, feed_url)
+        if data.status == 200:
+            # 304 means the feed has not been modified since we last checked
+            write_etag(feed_name, data)
+            return data
+    return False
+
+
+def main(output_dir):
+    feed_urls = open("feeds.txt", "r").read().splitlines()
+
+    start = time.time()
+
+    if not os.path.exists("etags"):
+        os.mkdir("etags")
+
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
+    feed_dict = dict()
+    for url in feed_urls:
+        feed_name = urlparse(url).netloc
+        feed_dict[url] = feed_name
+
+    feed_names = feed_dict.values()
+    content_dirs = os.listdir(output_dir)
+    # for i in content_dirs:
+    #     if i not in feed_names:
+    #         shutil.rmtree(os.path.join(output_dir, i))
+    #         print("%s not in feeds.txt, removing local data" % (i))
+
+    # add iframe to the allowlist of feedparser's sanitizer,
+    # this is now handled in parse_posts()
+
+    # !!! disabled for now
+    # feedparser.sanitizer._HTMLSanitizer.acceptable_elements |= {"iframe"}
+
+    for feed_url in feed_urls:
+
+        print("\n>>>>>>>>>>>>>>>>>>>>>>\n")
+
+        feed_name = feed_dict[feed_url]
+
+        feed_dir = os.path.join(output_dir, feed_name)
+
+        if not os.path.exists(feed_dir):
+            os.makedirs(feed_dir)
+
+        existing_posts = os.listdir(feed_dir)
+
+        data = grab_feed(feed_url)
+
+        if data:
+
+            opds_feed = False
+            for i in data.feed['links']:
+                if i['rel'] == 'self':
+                    if 'opds' in i['type']:
+                        opds_feed = True
+                        print("OPDS type feed!")
+
+            for entry in data.entries:
+                # if 'tags' in entry:
+                #     for tag in entry.tags:
+                #         for x in ['lumbung.space', 'D15', 'lumbung']:
+                #             if x in tag['term']:
+                #                 print(entry.title)
+                entry["feed_name"] = feed_name
+
+                post_name = slugify(entry.title)
+
+                # pixelfed returns the whole post text as the post name. max
+                # filename length is 255 on many systems. here we're shortening
+                # the name and adding a hash to it to avoid a conflict in a
+                # situation where 2 posts start with exactly the same text.
+                if len(post_name) > 150:
+                    post_hash = md5(bytes(post_name, "utf-8"))
+                    post_name = post_name[:150] + "-" + post_hash.hexdigest()
+
+                if opds_feed:
+                    entry['opds'] = True
+                    # format: Beyond-Debiasing-Report_Online-75535a4886e3
+                    post_name = slugify(entry['title']) + '-' + entry['id'].split('-')[-1]
+
+                post_dir = os.path.join(output_dir, feed_name, post_name)
+                post_dirs.append(post_dir)
+
+                if post_name not in existing_posts:
+                    # if there is a blog entry we don't already have, make it
+                    parsed_content, frontmatter = parse_content(post_dir, entry)
+                    create_post(post_dir, parsed_content, frontmatter)
+
+                elif post_name in existing_posts:
+                    # if we already have it, update it
+                    parsed_content, frontmatter = parse_content(post_dir, entry)
+                    create_post(post_dir, parsed_content, frontmatter)
+
+                    # drop it from the list, so existing_posts ends up holding
+                    # only posts which have not been returned by the feed
+                    existing_posts.remove(post_name)
+
+                # add this post to the db
+                add_to_db(post_dir, parsed_content, frontmatter)
+
+            # !!! disabled for now for testing
+            # for post in existing_posts:
+            #     # remove blog posts no longer returned by the RSS feed
+            #     print("deleted", post)
+            #     shutil.rmtree(os.path.join(feed_dir, slugify(post)))
+
+
+        print("\n----------------------\n")
+
+    end = time.time()
+    print(end - start)
+
+if __name__ == "__main__":
+
+    post_dirs = []  # module-level list that main() appends each post path to
+    output_dir = "feed-materials"
+
+    main(output_dir)
+
+    print("\n>>> db:")
+    pprint.pprint(db)
+
+    template = env.get_template("index.template.html")
+
+    output_file = 'index.html'
+    with open(output_file, 'w') as f:
+        index = template.render(db=db)
+        f.write(index)
+        print('>>> written:', output_file)
\ No newline at end of file
diff --git a/feeds.txt b/feeds.txt
new file mode 100644
index 0000000..77634c9
--- /dev/null
+++ b/feeds.txt
@@ -0,0 +1,3 @@
+https://vvvvvvaria.org/logs/dislog/feed.rss.xml
+https://etherdump.constantvzw.org/recentchanges.rss
+http://darkwiki.stuff2233.club/dislogging/index.rss
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..2a4a5ef
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,9 @@
+Jinja2>=3.0.3,<4.0.0
+Mastodon.py>=1.5.1,<2.0.0
+bs4>=0.0.1,<0.0.2
+feedparser>=6.0.8,<7.0.0
+ics>=0.7,<0.8
+natural>=0.2.0,<0.3.0
+python-slugify>=5.0.2,<6.0.0
+requests>=2.26.0,<3.0.0
+pprintpp==0.4.0
\ No newline at end of file
diff --git a/stylesheet.css b/stylesheet.css
new file mode 100644
index 0000000..b8c63dc
--- /dev/null
+++ b/stylesheet.css
@@ -0,0 +1,20 @@
+summary:hover{
+    cursor: pointer;
+}
+iframe{
+    width: calc(100% - 25px);
+    height: 500px;
+    border: 0;
+    background-color: rgba(220,220,220,0.4);
+}
+table{
+    width: 100%;
+}
+table,
+th,
+td {
+    border: 1px solid;
+}
+th:hover{
+    cursor: pointer;
+}
\ No newline at end of file
diff --git a/templates/index.template.html b/templates/index.template.html
new file mode 100644
index 0000000..407a23d
--- /dev/null
+++ b/templates/index.template.html
@@ -0,0 +1,105 @@
+
+
+
+
+
+
+
+
+ + +

circulations (1)

+ {% for post_dir, post in db.items() %} +
+
---
+ {{ post.frontmatter.title }} +
+ aggregated + source +
+ {{ post.frontmatter.feed_name }}
+ {{ post.frontmatter.date }}
+ {{ post.frontmatter.author }} +
+ + post + + +
+
+ {% endfor %} + +
+
+
+ + +

circulations (2)

+ + + + + + + + + + + + {% for post_dir, post in db.items() %} + + + + + + + + {% endfor %} + +
titlepostfeeddatethrough
{{ post.frontmatter.title }} + aggregated + source + {{ post.frontmatter.feed_name }}{{ post.frontmatter.date }}{{ post.frontmatter.author }}
+ + + +
+
+
\ No newline at end of file
diff --git a/templates/post.template.html b/templates/post.template.html
new file mode 100644
index 0000000..fd2d32c
--- /dev/null
+++ b/templates/post.template.html
@@ -0,0 +1,2 @@
+{{ frontmatter }}
+{{ content }}
\ No newline at end of file
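
A minimal sketch (not part of the patch) of driving the prototype from another script rather than running `python aggregator.py` directly. It assumes aggregator.py, feeds.txt and the templates/ directory sit in the current working directory and that the pinned requirements are installed; note that main() fills the module-level db dict but expects a module-level post_dirs list, which the __main__ block normally provides:

    # hypothetical driver script, not included in this patch
    import aggregator

    aggregator.post_dirs = []           # main() appends every post path to this module-level name
    aggregator.main("feed-materials")   # fetch the feeds listed in feeds.txt, write one folder per post
    print(len(aggregator.db), "posts collected")

Running the file directly does the same and additionally renders index.html from templates/index.template.html.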