circulations prototypes

This commit is contained in:
mb 2023-03-10 12:28:34 +01:00
commit 6aaad14ff0
6 changed files with 570 additions and 0 deletions

431
aggregator.py Normal file
View File

@ -0,0 +1,431 @@
import os
import shutil
import time
from hashlib import md5
from ast import literal_eval as make_tuple
from pathlib import Path
from urllib.parse import urlparse
from re import sub
import arrow
import feedparser
import jinja2
import requests
from bs4 import BeautifulSoup
from slugify import slugify
from re import compile as re_compile
# compiled pattern matching a double-quote; used by sanitize_yaml() to escape
# frontmatter values for YAML
yamlre = re_compile('"')
import pprint
# in-memory index of every rendered post, keyed by its post directory;
# filled by add_to_db() and rendered into index.html in the __main__ block
db = {}
# Jinja environment shared by create_post() and the __main__ index rendering;
# templates live next to this file in ./templates
template_dir = os.path.join(Path(__file__).parent.resolve(), "templates")
env = jinja2.Environment(loader=jinja2.FileSystemLoader(template_dir))
def write_etag(feed_name, feed_data):
    """
    Persist the feed's cache validators (ETag / Last-Modified).

    Writes a stringified (etag, modified) tuple to etags/<feed_name>.txt so a
    later run can issue a conditional fetch. Nothing is written when the feed
    supplied neither validator.
    """
    etag = feed_data.etag if "etag" in feed_data else ""
    modified = feed_data.modified if "modified" in feed_data else ""
    if not (etag or modified):
        return
    cache_path = os.path.join("etags", feed_name + ".txt")
    with open(cache_path, "w") as cache_file:
        cache_file.write(str((etag, modified)))
def get_etag(feed_name):
    """
    Return the cached (etag, modified) validators for a feed.

    Reads etags/<feed_name>.txt written by write_etag(); returns a pair of
    empty strings when no cache file exists yet.
    """
    fn = os.path.join("etags", feed_name + ".txt")
    etag = ""
    modified = ""
    if os.path.exists(fn):
        # Fix: use a context manager so the file handle is closed promptly
        # (the original `open(fn).read()` left closing to the GC).
        with open(fn, "r") as f:
            etag, modified = make_tuple(f.read())
    return etag, modified
def create_frontmatter(entry):
    """
    Parse RSS entry metadata and return it as a frontmatter dict.

    Handles both regular feed entries and OPDS (e-book catalog) entries;
    OPDS entries get a 'books' category and a rewritten original_link.

    NOTE(review): assumes `entry` is a feedparser entry (dict with attribute
    access) that carries a `feed_name` key set by main() — confirm callers.
    """
    # Fix: initialise so entries with neither 'published' nor 'updated'
    # fall through to the placeholder instead of raising NameError.
    published = None
    if 'published' in entry:
        published = entry.published_parsed
    elif 'updated' in entry:
        published = entry.updated_parsed
    if not published:
        # !!! placeholder hack for now, to make this whole script work
        published = "2023-03-09T16:31:47.294841"
    published = arrow.get(published)

    if 'author' in entry:
        author = entry.author
    else:
        author = ''

    # Fix: initialise so the OPDS branch below cannot hit an undefined name
    # when the feed provides no 'authors' list.
    authors = []
    if 'authors' in entry:
        for a in entry.authors:
            authors.append(a['name'])

    if 'summary' in entry:
        summary = entry.summary
    else:
        summary = ''

    if 'publisher' in entry:
        publisher = entry.publisher
    else:
        publisher = ''

    tags = []
    if 'tags' in entry:
        #TODO finish categories
        for t in entry.tags:
            tags.append(t['term'])

    if "featured_image" in entry:
        featured_image = entry.featured_image
    else:
        featured_image = ''

    # card_type selects the presentation style; pen.lumbung.space gets its own
    card_type = "network"
    if entry.feed_name == "pen.lumbung.space":
        card_type = "pen"

    if "opds" in entry:
        frontmatter = {
            'title': entry.title,
            'date': published.format(),
            'summary': summary,
            'author': ",".join(authors),
            'publisher': publisher,
            'original_link': entry.links[0]['href'].replace('opds/cover/','books/'),
            'feed_name': entry['feed_name'],
            'tags': str(tags),
            'category': "books"
        }
    else:
        frontmatter = {
            'title': entry.title,
            'date': published.format(),
            'summary': '',
            'author': author,
            'original_link': entry.link,
            'feed_name': entry['feed_name'],
            'tags': str(tags),
            'card_type': card_type,
            'featured_image': featured_image
        }
    return frontmatter
def sanitize_yaml(frontmatter):
    """
    Escape any occurrences of double quotes in the frontmatter fields.

    Each `"` becomes `\\"` so the value can be embedded safely in YAML.
    List-valued fields are escaped element by element.
    See: https://docs.octoprint.org/en/master/configuration/yaml.html#interesting-data-types
    """
    for key, value in frontmatter.items():
        # Idiom fixes: isinstance() instead of type comparison, and a plain
        # str.replace instead of a regex — the pattern was a single literal
        # character, so no regex machinery is needed.
        if isinstance(value, list):
            # some fields are lists
            frontmatter[key] = [item.replace('"', '\\"') for item in value]
        else:
            frontmatter[key] = value.replace('"', '\\"')
    return frontmatter
def parse_enclosures(post_dir, entry):
    """
    Parse feed enclosures, i.e. media attached to an entry.

    Currently only image enclosures are handled: they are downloaded into
    *post_dir* and recorded on the entry as `featured_image`. Other media
    types (e.g. podcast audio) are reported and skipped.
    https://pythonhosted.org/feedparser/reference-entry-enclosures.html
    """
    # TODO parse more than images
    # TODO handle the fact it could be multiple items
    for enclosure in entry.enclosures:
        if "type" not in enclosure:
            continue
        print("found enclosed media", enclosure.type)
        if "image/" in enclosure.type:
            entry["featured_image"] = grab_media(post_dir, enclosure.href)
        else:
            print("FIXME:ignoring enclosed", enclosure.type)
    return entry
def parse_content(post_dir, entry):
    """
    Prepare one feed entry for rendering.

    Resolves any enclosures, builds the frontmatter, ensures *post_dir*
    exists, and returns a (parsed_content, frontmatter) pair.
    """
    if "enclosures" in entry:
        entry = parse_enclosures(post_dir, entry)

    frontmatter = create_frontmatter(entry)
    print(">>> frontmatter:", frontmatter)

    os.makedirs(post_dir, exist_ok=True)

    raw_content = entry.content[0].value if "content" in entry else entry.summary
    return parse_posts(post_dir, raw_content), frontmatter
def create_post(post_dir, parsed_content, frontmatter):
    """
    Write a hugo-style post (index.html) for one RSS entry.

    Renders templates/post.template.html with the sanitized frontmatter and
    the already-parsed content into <post_dir>/index.html.
    """
    # Fix: reuse the module-level jinja environment instead of rebuilding a
    # FileSystemLoader + Environment on every call — the duplicated setup was
    # identical to the module-level `env` and pure per-post overhead.
    template = env.get_template("post.template.html")
    with open(os.path.join(post_dir, "index.html"), "w") as f: # n.b. .html
        post = template.render(frontmatter=sanitize_yaml(frontmatter), content=parsed_content)
        f.write(post)
    print("created post for " + frontmatter["title"] + " (" + frontmatter["original_link"] + ")")
def add_to_db(post_dir, parsed_content, frontmatter):
    """Record a rendered post in the module-level db, keyed by its directory."""
    db[post_dir] = {
        "content": parsed_content,
        "frontmatter": frontmatter,
    }
def grab_media(post_directory, url, prefered_name=None):
    """
    Download media linked in a post so we have a local copy.

    Returns the local filename when the file already exists or the download
    succeeds; returns the original url when the download fails.

    :param post_directory: directory the media file is saved into
    :param url: remote media url; its last path segment names the local file
    :param prefered_name: optional override for the local filename
    """
    media_item = urlparse(url).path.split('/')[-1]
    headers = {
        'User-Agent': 'https://git.autonomic.zone/ruangrupa/lumbunglib',
        'From': 'info@lumbung.space' # This is another valid field
    }
    if prefered_name:
        media_item = prefered_name
    local_path = os.path.join(post_directory, media_item)
    try:
        if os.path.exists(local_path):
            # already downloaded on an earlier run — nothing to do
            return media_item
        #TODO: stream is true is a conditional so we could check the headers for things, mimetype etc
        # Fix: close the streamed response via a context manager so the
        # connection is released (it was leaked before), and drop the
        # unreachable trailing `return media_item` the original had after
        # the if/else (both branches already returned).
        with requests.get(url, headers=headers, stream=True) as response:
            if response.ok:
                with open(local_path, 'wb') as media_file:
                    shutil.copyfileobj(response.raw, media_file)
                print('Downloaded media item', media_item)
                return media_item
            print("Download failed", response.status_code)
            return url
    except Exception as e:
        print('Failed to download image', url)
        print(e)
        return url
def parse_posts(post_dir, post_content):
    """
    Parse the post content for media items.

    Replaces foreign images with a locally downloaded copy and removes
    iframes whose source is not in the allowlist. Returns the rewritten
    HTML as a string.
    """
    soup = BeautifulSoup(post_content, "html.parser")
    allowed_iframe_sources = ["youtube.com", "vimeo.com", "tv.lumbung.space"]

    for img in soup(["img", "object"]):
        if img.get("src") != None:
            local_image = grab_media(post_dir, img["src"])
            if img["src"] != local_image:
                img["src"] = local_image

    for iframe in soup(["iframe"]):
        # Fix: an iframe without a src attribute used to raise KeyError on
        # iframe["src"]; treat it like a disallowed source and drop it.
        src = iframe.get("src")
        if not src or not any(source in src for source in allowed_iframe_sources):
            print("filtered iframe: {}...".format((src or "")[:25]))
            iframe.decompose()
    return soup.decode()
def grab_feed(feed_url):
    """
    check whether feed has been updated
    download & return it if it has

    Returns the parsed feedparser result on a fresh HTTP 200, otherwise
    False. On success the feed's cache validators are stored via write_etag().
    """
    feed_name = urlparse(feed_url).netloc
    etag, modified = get_etag(feed_name)
    # !!! conditional fetching disabled for now, for testing — the block
    # below would use the stored validators to skip unchanged feeds
    # try:
    #     if modified:
    #         data = feedparser.parse(feed_url, modified=modified)
    #     elif etag:
    #         data = feedparser.parse(feed_url, etag=etag)
    #     else:
    #         data = feedparser.parse(feed_url)
    # except Exception as e:
    #     print("Error grabbing feed")
    #     print(feed_name)
    #     print(e)
    #     return False
    data = feedparser.parse(feed_url)
    if "status" in data:
        print(data.status, feed_url)
        # a 304 would mean the feed has not been modified since we last
        # checked; only a fresh 200 stores new validators and returns data
        if data.status == 200:
            write_etag(feed_name, data)
            return data
    return False
def main(output_dir):
    """
    Fetch every feed listed in feeds.txt and render each entry as a post.

    For each feed a sub-directory of *output_dir* (named after the feed's
    hostname) is created; each entry becomes a slugified post directory
    containing an index.html. Every rendered post is also recorded in the
    module-level `db` via add_to_db().

    NOTE(review): appends to the global `post_dirs` list, which is only
    defined under the __main__ guard below — calling main() after an import
    would raise NameError. Consider making post_dirs local or module-level.
    """
    feed_urls = open("feeds.txt", "r").read().splitlines()
    start = time.time()
    # cache directory for the etag/last-modified validators (see write_etag)
    if not os.path.exists("etags"):
        os.mkdir("etags")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # map feed url -> hostname; the hostname doubles as the feed's dir name
    feed_dict = dict()
    for url in feed_urls:
        feed_name = urlparse(url).netloc
        feed_dict[url] = feed_name
    feed_names = feed_dict.values()
    content_dirs = os.listdir(output_dir)
    # stale-feed cleanup, currently disabled:
    # for i in content_dirs:
    #     if i not in feed_names:
    #         shutil.rmtree(os.path.join(output_dir, i))
    #         print("%s not in feeds_list.txt, removing local data" %(i))

    # add iframe to the allowlist of feedparser's sanitizer,
    # this is now handled in parse_post()
    # !!! disabled for now
    # feedparser.sanitizer._HTMLSanitizer.acceptable_elements |= {"iframe"}
    for feed_url in feed_urls:
        print("\n>>>>>>>>>>>>>>>>>>>>>>\n")
        feed_name = feed_dict[feed_url]
        feed_dir = os.path.join(output_dir, feed_name)
        if not os.path.exists(feed_dir):
            os.makedirs(feed_dir)
        existing_posts = os.listdir(feed_dir)
        data = grab_feed(feed_url)
        if data:
            # detect OPDS (e-book catalog) feeds from the self link's mime type
            opds_feed = False
            for i in data.feed['links']:
                if i['rel'] == 'self':
                    if 'opds' in i['type']:
                        opds_feed = True
                        print("OPDS type feed!")
            for entry in data.entries:
                # tag filtering, currently disabled:
                # if 'tags' in entry:
                #     for tag in entry.tags:
                #         for x in ['lumbung.space', 'D15', 'lumbung']:
                #             if x in tag['term']:
                #                 print(entry.title)
                entry["feed_name"] = feed_name
                post_name = slugify(entry.title)
                # pixelfed returns the whole post text as the post name. max
                # filename length is 255 on many systems. here we're shortening
                # the name and adding a hash to it to avoid a conflict in a
                # situation where 2 posts start with exactly the same text.
                if len(post_name) > 150:
                    post_hash = md5(bytes(post_name, "utf-8"))
                    post_name = post_name[:150] + "-" + post_hash.hexdigest()
                if opds_feed:
                    entry['opds'] = True
                    #format: Beyond-Debiasing-Report_Online-75535a4886e3
                    post_name = slugify(entry['title'])+'-'+entry['id'].split('-')[-1]
                post_dir = os.path.join(output_dir, feed_name, post_name)
                post_dirs.append(post_dir)
                # NOTE(review): both branches below do the same work; the
                # split only documents the create vs. update intent.
                if post_name not in existing_posts:
                    # if there is a blog entry we dont already have, make it
                    parsed_content, frontmatter = parse_content(post_dir, entry)
                    create_post(post_dir, parsed_content, frontmatter)
                elif post_name in existing_posts:
                    # if we already have it, update it
                    parsed_content, frontmatter = parse_content(post_dir, entry)
                    create_post(post_dir, parsed_content, frontmatter)
                    # create list of posts which have not been returned by the feed
                    existing_posts.remove(post_name)
                # add this post to the db
                add_to_db(post_dir, parsed_content, frontmatter)
        # removal of posts no longer in the feed:
        # !!! disabled for now for testing
        # for post in existing_posts:
        #     # remove blog posts no longer returned by the RSS feed
        #     print("deleted", post)
        #     shutil.rmtree(os.path.join(feed_dir, slugify(post)))
        print("\n----------------------\n")
    end = time.time()
    print(end - start)
if __name__ == "__main__":
    # list of every post directory created this run; also appended to
    # inside main() (as a global)
    post_dirs = []
    output_dir = "feed-materials"
    main(output_dir)
    # dump the aggregated db and render the overview page from it
    print("\n>>> db:")
    pprint.pprint(db)
    template = env.get_template("index.template.html")
    output_file = 'index.html'
    with open(output_file,'w') as f:
        index = template.render(db=db)
        f.write(index)
        print('>>> written:', output_file)

3
feeds.txt Normal file
View File

@ -0,0 +1,3 @@
https://vvvvvvaria.org/logs/dislog/feed.rss.xml
https://etherdump.constantvzw.org/recentchanges.rss
http://darkwiki.stuff2233.club/dislogging/index.rss

9
requirements.txt Normal file
View File

@ -0,0 +1,9 @@
arrow
Jinja2>=3.0.3,<4.0.0
Mastodon.py>=1.5.1,<2.0.0
bs4>=0.0.1,<0.0.2
feedparser>=6.0.8,<7.0.0
ics>=0.7,<0.8
natural>=0.2.0,<0.3.0
python-slugify>=5.0.2,<6.0.0
requests>=2.26.0,<3.0.0
pprintpp==0.4.0

20
stylesheet.css Normal file
View File

@ -0,0 +1,20 @@
/* clickable affordance for the <details>/<summary> post toggles */
summary:hover{
    cursor: pointer;
}
/* embedded post preview inside each <details> block */
iframe{
    width: calc(100% - 25px);
    height: 500px;
    border: 0;
    background-color: rgba(220,220,220,0.4);
}
/* "circulations (2)" sortable table */
table{
    width: 100%;
}
table,
th,
td {
    border: 1px solid;
}
/* headers are clickable (sortTable in the index template) */
th:hover{
    cursor: pointer;
}

View File

@ -0,0 +1,105 @@
<!-- Fix: doctype was written as <DOCTYPE html> (missing '!'), which browsers
     treat as a bogus comment and fall into quirks mode; also added the
     required <title>. -->
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>circulations</title>
    <link rel="stylesheet" type="text/css" href="stylesheet.css">
</head>
<body>
<div id="wrapper">
    <!-- listing view: one card per aggregated post -->
    <h1>circulations (1)</h1>
    {% for post_dir, post in db.items() %}
    <div class="post">
        <pre>---</pre>
        <strong>{{ post.frontmatter.title }}</strong>
        <div>
            <a href="{{ post_dir }}">aggregated</a>
            <a href="{{ post.frontmatter.original_link }}">source</a>
        </div>
        <small>{{ post.frontmatter.feed_name }}</small><br>
        <small>{{ post.frontmatter.date }}</small><br>
        <small>{{ post.frontmatter.author }}</small>
        <details>
            <summary>
                <small>post</small>
            </summary>
            <iframe src="{{ post_dir }}"></iframe>
        </details>
    </div>
    {% endfor %}
    <br>
    <br>
    <hr>
    <!-- table view: same posts, sortable by column -->
    <h1>circulations (2)</h1>
    <table id="circulations">
        <thead>
            <tr>
                <th onclick="sortTable(0)">title</th>
                <th onclick="sortTable(1)">post</th>
                <th onclick="sortTable(2)">feed</th>
                <th onclick="sortTable(3)">date</th>
                <th onclick="sortTable(4)">through</th>
            </tr>
        </thead>
        <tbody>
            {% for post_dir, post in db.items() %}
            <tr>
                <td>{{ post.frontmatter.title }}</td>
                <td>
                    <a href="{{ post_dir }}">aggregated</a>
                    <a href="{{ post.frontmatter.original_link }}">source</a>
                </td>
                <td>{{ post.frontmatter.feed_name }}</td>
                <td>{{ post.frontmatter.date }}</td>
                <td>{{ post.frontmatter.author }}</td>
            </tr>
            {% endfor %}
        </tbody>
    </table>
    <script>
    // bubble-sort the #circulations table by column n, toggling between
    // ascending and descending on repeated clicks
    function sortTable(n) {
        var table, rows, switching, i, x, y, shouldSwitch, dir, switchcount = 0;
        table = document.getElementById("circulations");
        switching = true;
        dir = "asc";
        while (switching) {
            switching = false;
            rows = table.rows;
            for (i = 1; i < (rows.length - 1); i++) {
                shouldSwitch = false;
                x = rows[i].getElementsByTagName("TD")[n];
                y = rows[i + 1].getElementsByTagName("TD")[n];
                if (dir == "asc") {
                    if (x.innerHTML.toLowerCase() > y.innerHTML.toLowerCase()) {
                        shouldSwitch = true;
                        break;
                    }
                } else if (dir == "desc") {
                    if (x.innerHTML.toLowerCase() < y.innerHTML.toLowerCase()) {
                        shouldSwitch = true;
                        break;
                    }
                }
            }
            if (shouldSwitch) {
                rows[i].parentNode.insertBefore(rows[i + 1], rows[i]);
                switching = true;
                switchcount ++;
            } else {
                // a full pass with no swaps in ascending order: flip the
                // direction and re-sort
                if (switchcount == 0 && dir == "asc") {
                    dir = "desc";
                    switching = true;
                }
            }
        }
    }
    </script>
</div>
</body>
</html>

View File

@ -0,0 +1,2 @@
<!-- post partial: rendered once per feed entry by create_post() -->
<small class="frontmatter">{{ frontmatter }}</small>
<div class="post">{{ content }}</div>