circulations prototypes

This commit is contained in:
mb 2023-03-10 12:28:34 +01:00
commit 6aaad14ff0
6 changed files with 570 additions and 0 deletions

431
aggregator.py Normal file
View File

@ -0,0 +1,431 @@
import os
import shutil
import time
from hashlib import md5
from ast import literal_eval as make_tuple
from pathlib import Path
from urllib.parse import urlparse
from re import sub
import arrow
import feedparser
import jinja2
import requests
from bs4 import BeautifulSoup
from slugify import slugify
from re import compile as re_compile
# compiled pattern matching a double-quote; used by sanitize_yaml() to escape
# frontmatter values for YAML
yamlre = re_compile('"')
import pprint
# in-memory index of every rendered post, keyed by its post directory;
# filled by add_to_db() and rendered into index.html in the __main__ block
db = {}
# Jinja environment shared by create_post() and the __main__ index rendering;
# templates live next to this file in ./templates
template_dir = os.path.join(Path(__file__).parent.resolve(), "templates")
env = jinja2.Environment(loader=jinja2.FileSystemLoader(template_dir))
def write_etag(feed_name, feed_data):
    """
    Persist the feed's cache validators (ETag / Last-Modified).

    Writes a stringified (etag, modified) tuple to etags/<feed_name>.txt so a
    later run can issue a conditional fetch. Nothing is written when the feed
    supplied neither validator.
    """
    etag = feed_data.etag if "etag" in feed_data else ""
    modified = feed_data.modified if "modified" in feed_data else ""
    if not (etag or modified):
        return
    cache_path = os.path.join("etags", feed_name + ".txt")
    with open(cache_path, "w") as cache_file:
        cache_file.write(str((etag, modified)))
def get_etag(feed_name):
    """
    Return the cached (etag, modified) validators for a feed.

    Reads etags/<feed_name>.txt written by write_etag(); returns a pair of
    empty strings when no cache file exists yet.
    """
    fn = os.path.join("etags", feed_name + ".txt")
    etag = ""
    modified = ""
    if os.path.exists(fn):
        # Fix: use a context manager so the file handle is closed promptly
        # (the original `open(fn).read()` left closing to the GC).
        with open(fn, "r") as f:
            etag, modified = make_tuple(f.read())
    return etag, modified
def create_frontmatter(entry):
    """
    Parse RSS entry metadata and return it as a frontmatter dict.

    Handles both regular feed entries and OPDS (e-book catalog) entries;
    OPDS entries get a 'books' category and a rewritten original_link.

    NOTE(review): assumes `entry` is a feedparser entry (dict with attribute
    access) that carries a `feed_name` key set by main() — confirm callers.
    """
    # Fix: initialise so entries with neither 'published' nor 'updated'
    # fall through to the placeholder instead of raising NameError.
    published = None
    if 'published' in entry:
        published = entry.published_parsed
    elif 'updated' in entry:
        published = entry.updated_parsed
    if not published:
        # !!! placeholder hack for now, to make this whole script work
        published = "2023-03-09T16:31:47.294841"
    published = arrow.get(published)

    if 'author' in entry:
        author = entry.author
    else:
        author = ''

    # Fix: initialise so the OPDS branch below cannot hit an undefined name
    # when the feed provides no 'authors' list.
    authors = []
    if 'authors' in entry:
        for a in entry.authors:
            authors.append(a['name'])

    if 'summary' in entry:
        summary = entry.summary
    else:
        summary = ''

    if 'publisher' in entry:
        publisher = entry.publisher
    else:
        publisher = ''

    tags = []
    if 'tags' in entry:
        #TODO finish categories
        for t in entry.tags:
            tags.append(t['term'])

    if "featured_image" in entry:
        featured_image = entry.featured_image
    else:
        featured_image = ''

    # card_type selects the presentation style; pen.lumbung.space gets its own
    card_type = "network"
    if entry.feed_name == "pen.lumbung.space":
        card_type = "pen"

    if "opds" in entry:
        frontmatter = {
            'title': entry.title,
            'date': published.format(),
            'summary': summary,
            'author': ",".join(authors),
            'publisher': publisher,
            'original_link': entry.links[0]['href'].replace('opds/cover/','books/'),
            'feed_name': entry['feed_name'],
            'tags': str(tags),
            'category': "books"
        }
    else:
        frontmatter = {
            'title': entry.title,
            'date': published.format(),
            'summary': '',
            'author': author,
            'original_link': entry.link,
            'feed_name': entry['feed_name'],
            'tags': str(tags),
            'card_type': card_type,
            'featured_image': featured_image
        }
    return frontmatter
def sanitize_yaml(frontmatter):
    """
    Escape any occurrences of double quotes in the frontmatter fields.

    Each `"` becomes `\\"` so the value can be embedded safely in YAML.
    List-valued fields are escaped element by element.
    See: https://docs.octoprint.org/en/master/configuration/yaml.html#interesting-data-types
    """
    for key, value in frontmatter.items():
        # Idiom fixes: isinstance() instead of type comparison, and a plain
        # str.replace instead of a regex — the pattern was a single literal
        # character, so no regex machinery is needed.
        if isinstance(value, list):
            # some fields are lists
            frontmatter[key] = [item.replace('"', '\\"') for item in value]
        else:
            frontmatter[key] = value.replace('"', '\\"')
    return frontmatter
def parse_enclosures(post_dir, entry):
    """
    Parse feed enclosures, i.e. media attached to an entry.

    Currently only image enclosures are handled: they are downloaded into
    *post_dir* and recorded on the entry as `featured_image`. Other media
    types (e.g. podcast audio) are reported and skipped.
    https://pythonhosted.org/feedparser/reference-entry-enclosures.html
    """
    # TODO parse more than images
    # TODO handle the fact it could be multiple items
    for enclosure in entry.enclosures:
        if "type" not in enclosure:
            continue
        print("found enclosed media", enclosure.type)
        if "image/" in enclosure.type:
            entry["featured_image"] = grab_media(post_dir, enclosure.href)
        else:
            print("FIXME:ignoring enclosed", enclosure.type)
    return entry
def parse_content(post_dir, entry):
    """
    Prepare one feed entry for rendering.

    Resolves any enclosures, builds the frontmatter, ensures *post_dir*
    exists, and returns a (parsed_content, frontmatter) pair.
    """
    if "enclosures" in entry:
        entry = parse_enclosures(post_dir, entry)

    frontmatter = create_frontmatter(entry)
    print(">>> frontmatter:", frontmatter)

    os.makedirs(post_dir, exist_ok=True)

    raw_content = entry.content[0].value if "content" in entry else entry.summary
    return parse_posts(post_dir, raw_content), frontmatter
def create_post(post_dir, parsed_content, frontmatter):
    """
    Write a hugo-style post (index.html) for one RSS entry.

    Renders templates/post.template.html with the sanitized frontmatter and
    the already-parsed content into <post_dir>/index.html.
    """
    # Fix: reuse the module-level jinja environment instead of rebuilding a
    # FileSystemLoader + Environment on every call — the duplicated setup was
    # identical to the module-level `env` and pure per-post overhead.
    template = env.get_template("post.template.html")
    with open(os.path.join(post_dir, "index.html"), "w") as f: # n.b. .html
        post = template.render(frontmatter=sanitize_yaml(frontmatter), content=parsed_content)
        f.write(post)
    print("created post for " + frontmatter["title"] + " (" + frontmatter["original_link"] + ")")
def add_to_db(post_dir, parsed_content, frontmatter):
    """Record a rendered post in the module-level db, keyed by its directory."""
    db[post_dir] = {
        "content": parsed_content,
        "frontmatter": frontmatter,
    }
def grab_media(post_directory, url, prefered_name=None):
    """
    Download media linked in a post so we have a local copy.

    Returns the local filename when the file already exists or the download
    succeeds; returns the original url when the download fails.

    :param post_directory: directory the media file is saved into
    :param url: remote media url; its last path segment names the local file
    :param prefered_name: optional override for the local filename
    """
    media_item = urlparse(url).path.split('/')[-1]
    headers = {
        'User-Agent': 'https://git.autonomic.zone/ruangrupa/lumbunglib',
        'From': 'info@lumbung.space' # This is another valid field
    }
    if prefered_name:
        media_item = prefered_name
    local_path = os.path.join(post_directory, media_item)
    try:
        if os.path.exists(local_path):
            # already downloaded on an earlier run — nothing to do
            return media_item
        #TODO: stream is true is a conditional so we could check the headers for things, mimetype etc
        # Fix: close the streamed response via a context manager so the
        # connection is released (it was leaked before), and drop the
        # unreachable trailing `return media_item` the original had after
        # the if/else (both branches already returned).
        with requests.get(url, headers=headers, stream=True) as response:
            if response.ok:
                with open(local_path, 'wb') as media_file:
                    shutil.copyfileobj(response.raw, media_file)
                print('Downloaded media item', media_item)
                return media_item
            print("Download failed", response.status_code)
            return url
    except Exception as e:
        print('Failed to download image', url)
        print(e)
        return url
def parse_posts(post_dir, post_content):
    """
    Parse the post content for media items.

    Replaces foreign images with a locally downloaded copy and removes
    iframes whose source is not in the allowlist. Returns the rewritten
    HTML as a string.
    """
    soup = BeautifulSoup(post_content, "html.parser")
    allowed_iframe_sources = ["youtube.com", "vimeo.com", "tv.lumbung.space"]

    for img in soup(["img", "object"]):
        if img.get("src") != None:
            local_image = grab_media(post_dir, img["src"])
            if img["src"] != local_image:
                img["src"] = local_image

    for iframe in soup(["iframe"]):
        # Fix: an iframe without a src attribute used to raise KeyError on
        # iframe["src"]; treat it like a disallowed source and drop it.
        src = iframe.get("src")
        if not src or not any(source in src for source in allowed_iframe_sources):
            print("filtered iframe: {}...".format((src or "")[:25]))
            iframe.decompose()
    return soup.decode()
def grab_feed(feed_url):
    """
    check whether feed has been updated
    download & return it if it has

    Returns the parsed feedparser result on a fresh HTTP 200, otherwise
    False. On success the feed's cache validators are stored via write_etag().
    """
    feed_name = urlparse(feed_url).netloc
    etag, modified = get_etag(feed_name)
    # !!! conditional fetching disabled for now, for testing — the block
    # below would use the stored validators to skip unchanged feeds
    # try:
    #     if modified:
    #         data = feedparser.parse(feed_url, modified=modified)
    #     elif etag:
    #         data = feedparser.parse(feed_url, etag=etag)
    #     else:
    #         data = feedparser.parse(feed_url)
    # except Exception as e:
    #     print("Error grabbing feed")
    #     print(feed_name)
    #     print(e)
    #     return False
    data = feedparser.parse(feed_url)
    if "status" in data:
        print(data.status, feed_url)
        # a 304 would mean the feed has not been modified since we last
        # checked; only a fresh 200 stores new validators and returns data
        if data.status == 200:
            write_etag(feed_name, data)
            return data
    return False
def main(output_dir):
    """
    Fetch every feed listed in feeds.txt and render each entry as a post.

    For each feed a sub-directory of *output_dir* (named after the feed's
    hostname) is created; each entry becomes a slugified post directory
    containing an index.html. Every rendered post is also recorded in the
    module-level `db` via add_to_db().

    NOTE(review): appends to the global `post_dirs` list, which is only
    defined under the __main__ guard below — calling main() after an import
    would raise NameError. Consider making post_dirs local or module-level.
    """
    feed_urls = open("feeds.txt", "r").read().splitlines()
    start = time.time()
    # cache directory for the etag/last-modified validators (see write_etag)
    if not os.path.exists("etags"):
        os.mkdir("etags")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # map feed url -> hostname; the hostname doubles as the feed's dir name
    feed_dict = dict()
    for url in feed_urls:
        feed_name = urlparse(url).netloc
        feed_dict[url] = feed_name
    feed_names = feed_dict.values()
    content_dirs = os.listdir(output_dir)
    # stale-feed cleanup, currently disabled:
    # for i in content_dirs:
    #     if i not in feed_names:
    #         shutil.rmtree(os.path.join(output_dir, i))
    #         print("%s not in feeds_list.txt, removing local data" %(i))

    # add iframe to the allowlist of feedparser's sanitizer,
    # this is now handled in parse_post()
    # !!! disabled for now
    # feedparser.sanitizer._HTMLSanitizer.acceptable_elements |= {"iframe"}
    for feed_url in feed_urls:
        print("\n>>>>>>>>>>>>>>>>>>>>>>\n")
        feed_name = feed_dict[feed_url]
        feed_dir = os.path.join(output_dir, feed_name)
        if not os.path.exists(feed_dir):
            os.makedirs(feed_dir)
        existing_posts = os.listdir(feed_dir)
        data = grab_feed(feed_url)
        if data:
            # detect OPDS (e-book catalog) feeds from the self link's mime type
            opds_feed = False
            for i in data.feed['links']:
                if i['rel'] == 'self':
                    if 'opds' in i['type']:
                        opds_feed = True
                        print("OPDS type feed!")
            for entry in data.entries:
                # tag filtering, currently disabled:
                # if 'tags' in entry:
                #     for tag in entry.tags:
                #         for x in ['lumbung.space', 'D15', 'lumbung']:
                #             if x in tag['term']:
                #                 print(entry.title)
                entry["feed_name"] = feed_name
                post_name = slugify(entry.title)
                # pixelfed returns the whole post text as the post name. max
                # filename length is 255 on many systems. here we're shortening
                # the name and adding a hash to it to avoid a conflict in a
                # situation where 2 posts start with exactly the same text.
                if len(post_name) > 150:
                    post_hash = md5(bytes(post_name, "utf-8"))
                    post_name = post_name[:150] + "-" + post_hash.hexdigest()
                if opds_feed:
                    entry['opds'] = True
                    #format: Beyond-Debiasing-Report_Online-75535a4886e3
                    post_name = slugify(entry['title'])+'-'+entry['id'].split('-')[-1]
                post_dir = os.path.join(output_dir, feed_name, post_name)
                post_dirs.append(post_dir)
                # NOTE(review): both branches below do the same work; the
                # split only documents the create vs. update intent.
                if post_name not in existing_posts:
                    # if there is a blog entry we dont already have, make it
                    parsed_content, frontmatter = parse_content(post_dir, entry)
                    create_post(post_dir, parsed_content, frontmatter)
                elif post_name in existing_posts:
                    # if we already have it, update it
                    parsed_content, frontmatter = parse_content(post_dir, entry)
                    create_post(post_dir, parsed_content, frontmatter)
                    # create list of posts which have not been returned by the feed
                    existing_posts.remove(post_name)
                # add this post to the db
                add_to_db(post_dir, parsed_content, frontmatter)
        # removal of posts no longer in the feed:
        # !!! disabled for now for testing
        # for post in existing_posts:
        #     # remove blog posts no longer returned by the RSS feed
        #     print("deleted", post)
        #     shutil.rmtree(os.path.join(feed_dir, slugify(post)))
        print("\n----------------------\n")
    end = time.time()
    print(end - start)
if __name__ == "__main__":
    # list of every post directory created this run; also appended to
    # inside main() (as a global)
    post_dirs = []
    output_dir = "feed-materials"
    main(output_dir)
    # dump the aggregated db and render the overview page from it
    print("\n>>> db:")
    pprint.pprint(db)
    template = env.get_template("index.template.html")
    output_file = 'index.html'
    with open(output_file,'w') as f:
        index = template.render(db=db)
        f.write(index)
        print('>>> written:', output_file)

3
feeds.txt Normal file
View File

@ -0,0 +1,3 @@
https://vvvvvvaria.org/logs/dislog/feed.rss.xml
https://etherdump.constantvzw.org/recentchanges.rss
http://darkwiki.stuff2233.club/dislogging/index.rss

9
requirements.txt Normal file
View File

@ -0,0 +1,9 @@
arrow
Jinja2>=3.0.3,<4.0.0
Mastodon.py>=1.5.1,<2.0.0
bs4>=0.0.1,<0.0.2
feedparser>=6.0.8,<7.0.0
ics>=0.7,<0.8
natural>=0.2.0,<0.3.0
python-slugify>=5.0.2,<6.0.0
requests>=2.26.0,<3.0.0
pprintpp==0.4.0

20
stylesheet.css Normal file
View File

@ -0,0 +1,20 @@
/* clickable affordance for the <details>/<summary> post toggles */
summary:hover{
    cursor: pointer;
}
/* embedded post preview inside each <details> block */
iframe{
    width: calc(100% - 25px);
    height: 500px;
    border: 0;
    background-color: rgba(220,220,220,0.4);
}
/* "circulations (2)" sortable table */
table{
    width: 100%;
}
table,
th,
td {
    border: 1px solid;
}
/* headers are clickable (sortTable in the index template) */
th:hover{
    cursor: pointer;
}

View File

@ -0,0 +1,105 @@
<!-- Fix: doctype was written as <DOCTYPE html> (missing '!'), which browsers
     treat as a bogus comment and fall into quirks mode; also added the
     required <title>. -->
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>circulations</title>
    <link rel="stylesheet" type="text/css" href="stylesheet.css">
</head>
<body>
<div id="wrapper">
    <!-- listing view: one card per aggregated post -->
    <h1>circulations (1)</h1>
    {% for post_dir, post in db.items() %}
    <div class="post">
        <pre>---</pre>
        <strong>{{ post.frontmatter.title }}</strong>
        <div>
            <a href="{{ post_dir }}">aggregated</a>
            <a href="{{ post.frontmatter.original_link }}">source</a>
        </div>
        <small>{{ post.frontmatter.feed_name }}</small><br>
        <small>{{ post.frontmatter.date }}</small><br>
        <small>{{ post.frontmatter.author }}</small>
        <details>
            <summary>
                <small>post</small>
            </summary>
            <iframe src="{{ post_dir }}"></iframe>
        </details>
    </div>
    {% endfor %}
    <br>
    <br>
    <hr>
    <!-- table view: same posts, sortable by column -->
    <h1>circulations (2)</h1>
    <table id="circulations">
        <thead>
            <tr>
                <th onclick="sortTable(0)">title</th>
                <th onclick="sortTable(1)">post</th>
                <th onclick="sortTable(2)">feed</th>
                <th onclick="sortTable(3)">date</th>
                <th onclick="sortTable(4)">through</th>
            </tr>
        </thead>
        <tbody>
            {% for post_dir, post in db.items() %}
            <tr>
                <td>{{ post.frontmatter.title }}</td>
                <td>
                    <a href="{{ post_dir }}">aggregated</a>
                    <a href="{{ post.frontmatter.original_link }}">source</a>
                </td>
                <td>{{ post.frontmatter.feed_name }}</td>
                <td>{{ post.frontmatter.date }}</td>
                <td>{{ post.frontmatter.author }}</td>
            </tr>
            {% endfor %}
        </tbody>
    </table>
    <script>
    // bubble-sort the #circulations table by column n, toggling between
    // ascending and descending on repeated clicks
    function sortTable(n) {
        var table, rows, switching, i, x, y, shouldSwitch, dir, switchcount = 0;
        table = document.getElementById("circulations");
        switching = true;
        dir = "asc";
        while (switching) {
            switching = false;
            rows = table.rows;
            for (i = 1; i < (rows.length - 1); i++) {
                shouldSwitch = false;
                x = rows[i].getElementsByTagName("TD")[n];
                y = rows[i + 1].getElementsByTagName("TD")[n];
                if (dir == "asc") {
                    if (x.innerHTML.toLowerCase() > y.innerHTML.toLowerCase()) {
                        shouldSwitch = true;
                        break;
                    }
                } else if (dir == "desc") {
                    if (x.innerHTML.toLowerCase() < y.innerHTML.toLowerCase()) {
                        shouldSwitch = true;
                        break;
                    }
                }
            }
            if (shouldSwitch) {
                rows[i].parentNode.insertBefore(rows[i + 1], rows[i]);
                switching = true;
                switchcount ++;
            } else {
                // a full pass with no swaps in ascending order: flip the
                // direction and re-sort
                if (switchcount == 0 && dir == "asc") {
                    dir = "desc";
                    switching = true;
                }
            }
        }
    }
    </script>
</div>
</body>
</html>

View File

@ -0,0 +1,2 @@
<!-- post partial: rendered once per feed entry by create_post() -->
<small class="frontmatter">{{ frontmatter }}</small>
<div class="post">{{ content }}</div>