circulations prototypes
This commit is contained in:
commit
6aaad14ff0
431
aggregator.py
Normal file
431
aggregator.py
Normal file
@ -0,0 +1,431 @@
|
||||
import os
|
||||
import shutil
|
||||
import time
|
||||
from hashlib import md5
|
||||
from ast import literal_eval as make_tuple
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlparse
|
||||
from re import sub
|
||||
|
||||
import arrow
|
||||
import feedparser
|
||||
import jinja2
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from slugify import slugify
|
||||
from re import compile as re_compile
|
||||
# compiled once at import time: matches a double-quote character,
# used to escape quotes inside frontmatter string fields
yamlre = re_compile('"')

import pprint

# in-memory index of every aggregated post, keyed by post directory;
# rendered into index.html by the __main__ block at the end of a run
db = {}

# jinja2 environment loading templates from the "templates" dir next to this file
template_dir = os.path.join(Path(__file__).parent.resolve(), "templates")
env = jinja2.Environment(loader=jinja2.FileSystemLoader(template_dir))
|
||||
|
||||
def write_etag(feed_name, feed_data):
    """
    save timestamp of when feed was last modified

    Persists the feed's HTTP caching validators (ETag and/or
    Last-Modified) as a stringified tuple in etags/<feed_name>.txt,
    in the format that get_etag() parses back with literal_eval.
    Writes nothing when the response carried neither validator.
    """
    etag = ""
    modified = ""

    if "etag" in feed_data:
        etag = feed_data.etag
    if "modified" in feed_data:
        modified = feed_data.modified

    if etag or modified:
        # fix: make sure the cache directory exists — previously this
        # raised FileNotFoundError when called before main() created it
        os.makedirs("etags", exist_ok=True)
        with open(os.path.join("etags", feed_name + ".txt"), "w") as f:
            # stored as a python literal tuple, read back by get_etag()
            f.write(str((etag, modified)))
|
||||
|
||||
|
||||
def get_etag(feed_name):
    """
    return timestamp of when feed was last modified

    Reads back the (etag, modified) tuple written by write_etag().
    Returns a pair of empty strings when no cache file exists yet.
    """
    fn = os.path.join("etags", feed_name + ".txt")
    etag = ""
    modified = ""

    if os.path.exists(fn):
        # fix: use a context manager so the file handle is closed
        # (previously open(fn).read() leaked the handle)
        with open(fn, "r") as f:
            etag, modified = make_tuple(f.read())

    return etag, modified
|
||||
|
||||
|
||||
def create_frontmatter(entry):
    """
    parse RSS metadata and return as frontmatter

    Extracts title, date, author(s), summary, publisher, tags and
    featured image from a feedparser entry and returns them as a dict.
    OPDS (e-book catalogue) entries get a book-specific frontmatter
    shape; everything else gets the generic "network" card shape.
    """
    # fix: initialise so entries with neither 'published' nor 'updated'
    # no longer raise NameError at the `if not published` check below
    published = None
    if 'published' in entry:
        published = entry.published_parsed
    elif 'updated' in entry:
        published = entry.updated_parsed

    if not published:
        published = "2023-03-09T16:31:47.294841"
        # !!! placeholder hack for now, to make this whole script work

    published = arrow.get(published)

    if 'author' in entry:
        author = entry.author
    else:
        author = ''

    # fix: always defined — the OPDS branch below joins it unconditionally,
    # which raised NameError for OPDS entries without an 'authors' list
    authors = []
    if 'authors' in entry:
        for a in entry.authors:
            authors.append(a['name'])

    if 'summary' in entry:
        summary = entry.summary
    else:
        summary = ''

    if 'publisher' in entry:
        publisher = entry.publisher
    else:
        publisher = ''

    tags = []
    if 'tags' in entry:
        #TODO finish categories
        for t in entry.tags:
            tags.append(t['term'])

    if "featured_image" in entry:
        featured_image = entry.featured_image
    else:
        featured_image = ''

    # special-case card styling for the pen feed
    card_type = "network"
    if entry.feed_name == "pen.lumbung.space":
        card_type = "pen"

    if "opds" in entry:
        frontmatter = {
            'title': entry.title,
            'date': published.format(),
            'summary': summary,
            'author': ",".join(authors),
            'publisher': publisher,
            # OPDS cover links point at the books download path
            'original_link': entry.links[0]['href'].replace('opds/cover/','books/'),
            'feed_name': entry['feed_name'],
            'tags': str(tags),
            'category': "books"
        }
    else:
        frontmatter = {
            'title': entry.title,
            'date': published.format(),
            'summary': '',
            'author': author,
            'original_link': entry.link,
            'feed_name': entry['feed_name'],
            'tags': str(tags),
            'card_type': card_type,
            'featured_image': featured_image
        }

    return frontmatter
|
||||
|
||||
def sanitize_yaml(frontmatter):
    """
    Escapes any occurrences of double quotes
    in any of the frontmatter fields

    See: https://docs.octoprint.org/en/master/configuration/yaml.html#interesting-data-types

    Mutates and returns the same dict. Uses str.replace instead of the
    module-level regex: the pattern is a fixed single character, so a
    regex adds nothing (re.sub with replacement '\\"' emitted the same
    literal backslash-quote sequence).
    """
    for k, v in frontmatter.items():
        if isinstance(v, list):
            # some fields are lists of strings — escape each element
            frontmatter[k] = [i.replace('"', '\\"') for i in v]
        else:
            frontmatter[k] = v.replace('"', '\\"')

    return frontmatter
|
||||
|
||||
def parse_enclosures(post_dir, entry):
    """
    Parses feed enclosures which are featured media
    Can be featured image but also podcast entries
    https://pythonhosted.org/feedparser/reference-entry-enclosures.html

    Images are downloaded into post_dir and recorded on the entry as
    "featured_image"; other media types are reported and skipped.
    Returns the (possibly augmented) entry.
    """
    #TODO parse more than images
    #TODO handle the fact it could be multiple items

    for enclosure in entry.enclosures:
        if "type" not in enclosure:
            continue
        print("found enclosed media", enclosure.type)
        if "image/" in enclosure.type:
            entry["featured_image"] = grab_media(post_dir, enclosure.href)
        else:
            print("FIXME:ignoring enclosed", enclosure.type)

    return entry
|
||||
|
||||
def parse_content(post_dir, entry):
    """
    Turn a feed entry into (parsed_content, frontmatter).

    Downloads enclosed media, builds the frontmatter dict, makes sure
    the post directory exists, then localises images and filters
    iframes in the post HTML via parse_posts().
    """
    if "enclosures" in entry:
        entry = parse_enclosures(post_dir, entry)

    frontmatter = create_frontmatter(entry)
    print(">>> frontmatter:", frontmatter)

    os.makedirs(post_dir, exist_ok=True)

    # prefer the full content body; fall back to the summary
    raw_html = entry.content[0].value if "content" in entry else entry.summary

    return parse_posts(post_dir, raw_html), frontmatter
|
||||
|
||||
def create_post(post_dir, parsed_content, frontmatter):
    """
    write hugo post based on RSS entry

    Renders post.template.html with the quote-escaped frontmatter and
    the parsed HTML content, writing the result to <post_dir>/index.html.
    """
    # fix: reuse the module-level jinja2 environment instead of
    # rebuilding the loader and environment on every single post
    template = env.get_template("post.template.html")

    with open(os.path.join(post_dir, "index.html"), "w") as f: # n.b. .html
        post = template.render(frontmatter=sanitize_yaml(frontmatter), content=parsed_content)
        f.write(post)
    print("created post for " + frontmatter["title"] + " (" + frontmatter["original_link"] + ")")
|
||||
|
||||
|
||||
def add_to_db(post_dir, parsed_content, frontmatter):
    """Record a parsed post in the module-level db index, keyed by its directory."""
    db[post_dir] = {"content": parsed_content, "frontmatter": frontmatter}
|
||||
|
||||
|
||||
def grab_media(post_directory, url, prefered_name=None):
    """
    download media linked in post to have local copy
    if download succeeds return new local path otherwise return url

    The local filename defaults to the last path segment of the URL and
    can be overridden with prefered_name. An already-downloaded file is
    returned immediately without re-fetching.
    """
    media_item = urlparse(url).path.split('/')[-1]

    headers = {
        'User-Agent': 'https://git.autonomic.zone/ruangrupa/lumbunglib',
        'From': 'info@lumbung.space' # This is another valid field
    }
    if prefered_name:
        media_item = prefered_name

    try:
        if os.path.exists(os.path.join(post_directory, media_item)):
            # already downloaded on a previous run — nothing to do
            return media_item

        #TODO: stream is true is a conditional so we could check the headers for things, mimetype etc
        response = requests.get(url, headers=headers, stream=True)
        if response.ok:
            with open(os.path.join(post_directory, media_item), 'wb') as media_file:
                shutil.copyfileobj(response.raw, media_file)
            print('Downloaded media item', media_item)
            return media_item
        # fix: removed the unreachable trailing `return media_item` that
        # followed this if/else — both branches already returned
        print("Download failed", response.status_code)
        return url
    except Exception as e:
        # best-effort: a failed download falls back to hot-linking the URL
        print('Failed to download image', url)
        print(e)

    return url
|
||||
|
||||
|
||||
def parse_posts(post_dir, post_content):
    """
    parse the post content for media items
    replace foreign image with local copy
    filter out iframe sources not in allowlist

    Returns the transformed HTML as a string.
    """
    soup = BeautifulSoup(post_content, "html.parser")
    allowed_iframe_sources = ["youtube.com", "vimeo.com", "tv.lumbung.space"]

    for img in soup(["img", "object"]):
        if img.get("src") is not None:
            local_image = grab_media(post_dir, img["src"])
            # only rewrite when the download actually produced a local copy
            if img["src"] != local_image:
                img["src"] = local_image

    for iframe in soup(["iframe"]):
        # fix: iframes without a src attribute used to raise KeyError;
        # treat them like any other non-allowlisted iframe and drop them
        src = iframe.get("src", "")
        if not any(source in src for source in allowed_iframe_sources):
            print("filtered iframe: {}...".format(src[:25]))
            iframe.decompose()

    return soup.decode()
|
||||
|
||||
|
||||
def grab_feed(feed_url):
    """
    check whether feed has been updated
    download & return it if it has

    Returns the parsed feedparser result when the server answered
    HTTP 200, or False otherwise (fetch error, 304 not-modified, or a
    feed with no HTTP status at all).
    """
    feed_name = urlparse(feed_url).netloc

    # cached HTTP validators from the previous run; currently unused
    # because the conditional-fetch path below is disabled
    etag, modified = get_etag(feed_name)

    # !!! disabled for now, for testing
    # try:
    #     if modified:
    #         data = feedparser.parse(feed_url, modified=modified)
    #     elif etag:
    #         data = feedparser.parse(feed_url, etag=etag)
    #     else:
    #         data = feedparser.parse(feed_url)
    # except Exception as e:
    #     print("Error grabbing feed")
    #     print(feed_name)
    #     print(e)
    #     return False

    # unconditional fetch while the etag/modified logic is disabled
    data = feedparser.parse(feed_url)

    if "status" in data:
        print(data.status, feed_url)
        if data.status == 200:
            # 304 means the feed has not been modified since we last checked
            write_etag(feed_name, data)
            return data
    return False
|
||||
|
||||
|
||||
def main(output_dir):
    """
    Aggregate every feed listed in feeds.txt.

    For each feed a directory named after its hostname is created under
    output_dir; each entry becomes a slugified post directory containing
    a rendered index.html, and is also registered in the module-level db.
    Prints timing information when done.
    """
    # NOTE(review): the file handle is not closed explicitly — acceptable
    # for a short-lived script, but a `with` block would be cleaner
    feed_urls = open("feeds.txt", "r").read().splitlines()

    start = time.time()

    # cache directory for HTTP etag/modified validators
    if not os.path.exists("etags"):
        os.mkdir("etags")

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # map feed url -> hostname; the hostname doubles as the per-feed dir name
    feed_dict = dict()
    for url in feed_urls:
        feed_name = urlparse(url).netloc
        feed_dict[url] = feed_name

    feed_names = feed_dict.values()
    content_dirs = os.listdir(output_dir)
    # for i in content_dirs:
    #     if i not in feed_names:
    #         shutil.rmtree(os.path.join(output_dir, i))
    #         print("%s not in feeds_list.txt, removing local data" %(i))

    # add iframe to the allowlist of feedparser's sanitizer,
    # this is now handled in parse_post()

    # !!! disabled for now
    # feedparser.sanitizer._HTMLSanitizer.acceptable_elements |= {"iframe"}

    for feed_url in feed_urls:

        print("\n>>>>>>>>>>>>>>>>>>>>>>\n")

        feed_name = feed_dict[feed_url]

        feed_dir = os.path.join(output_dir, feed_name)

        if not os.path.exists(feed_dir):
            os.makedirs(feed_dir)

        # posts already on disk; used to tell new entries from updates
        existing_posts = os.listdir(feed_dir)

        data = grab_feed(feed_url)

        if data:

            # detect OPDS (e-book catalogue) feeds from the self link's mime type
            opds_feed = False
            for i in data.feed['links']:
                if i['rel'] == 'self':
                    if 'opds' in i['type']:
                        opds_feed = True
                        print("OPDS type feed!")

            for entry in data.entries:
                # if 'tags' in entry:
                #     for tag in entry.tags:
                #         for x in ['lumbung.space', 'D15', 'lumbung']:
                #             if x in tag['term']:
                #                 print(entry.title)

                # remember which feed the entry came from for the frontmatter
                entry["feed_name"] = feed_name

                post_name = slugify(entry.title)

                # pixelfed returns the whole post text as the post name. max
                # filename length is 255 on many systems. here we're shortening
                # the name and adding a hash to it to avoid a conflict in a
                # situation where 2 posts start with exactly the same text.
                if len(post_name) > 150:
                    post_hash = md5(bytes(post_name, "utf-8"))
                    post_name = post_name[:150] + "-" + post_hash.hexdigest()

                if opds_feed:
                    entry['opds'] = True
                    #format: Beyond-Debiasing-Report_Online-75535a4886e3
                    post_name = slugify(entry['title'])+'-'+entry['id'].split('-')[-1]

                post_dir = os.path.join(output_dir, feed_name, post_name)
                # NOTE(review): post_dirs is a global defined in the
                # __main__ block — main() raises NameError if called
                # without that setup; confirm before reusing elsewhere
                post_dirs.append(post_dir)

                if post_name not in existing_posts:
                    # if there is a blog entry we dont already have, make it
                    parsed_content, frontmatter = parse_content(post_dir, entry)
                    create_post(post_dir, parsed_content, frontmatter)

                elif post_name in existing_posts:
                    # if we already have it, update it
                    parsed_content, frontmatter = parse_content(post_dir, entry)
                    create_post(post_dir, parsed_content, frontmatter)

                    # create list of posts which have not been returned by the feed
                    existing_posts.remove(post_name)

                # add this post to the db
                add_to_db(post_dir, parsed_content, frontmatter)

    # !!! disabled for now for testing
    # for post in existing_posts:
    #     # remove blog posts no longer returned by the RSS feed
    #     print("deleted", post)
    #     shutil.rmtree(os.path.join(feed_dir, slugify(post)))

    print("\n----------------------\n")

    end = time.time()
    print(end - start)
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
post_dirs = []
|
||||
output_dir = "feed-materials"
|
||||
|
||||
main(output_dir)
|
||||
|
||||
print("\n>>> db:")
|
||||
pprint.pprint(db)
|
||||
|
||||
template = env.get_template("index.template.html")
|
||||
|
||||
output_file = 'index.html'
|
||||
with open(output_file,'w') as f:
|
||||
index = template.render(db=db)
|
||||
f.write(index)
|
||||
print('>>> written:', output_file)
|
3
feeds.txt
Normal file
3
feeds.txt
Normal file
@ -0,0 +1,3 @@
|
||||
https://vvvvvvaria.org/logs/dislog/feed.rss.xml
|
||||
https://etherdump.constantvzw.org/recentchanges.rss
|
||||
http://darkwiki.stuff2233.club/dislogging/index.rss
|
9
requirements.txt
Normal file
9
requirements.txt
Normal file
@ -0,0 +1,9 @@
|
||||
Jinja2>=3.0.3,<4.0.0
Mastodon.py>=1.5.1,<2.0.0
bs4>=0.0.1,<0.0.2
feedparser>=6.0.8,<7.0.0
ics>=0.7,<0.8
natural>=0.2.0,<0.3.0
python-slugify>=5.0.2,<6.0.0
requests>=2.26.0,<3.0.0
pprintpp==0.4.0
|
20
stylesheet.css
Normal file
20
stylesheet.css
Normal file
@ -0,0 +1,20 @@
|
||||
summary:hover{
|
||||
cursor: pointer;
|
||||
}
|
||||
iframe{
|
||||
width: calc(100% - 25px);
|
||||
height: 500px;
|
||||
border: 0;
|
||||
background-color: rgba(220,220,220,0.4);
|
||||
}
|
||||
table{
|
||||
width: 100%;
|
||||
}
|
||||
table,
|
||||
th,
|
||||
td {
|
||||
border: 1px solid;
|
||||
}
|
||||
th:hover{
|
||||
cursor: pointer;
|
||||
}
|
105
templates/index.template.html
Normal file
105
templates/index.template.html
Normal file
@ -0,0 +1,105 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<link rel="stylesheet" type="text/css" href="stylesheet.css">
|
||||
</head>
|
||||
<body>
|
||||
<div id="wrapper">
|
||||
|
||||
<!-- -->
|
||||
<h1>circulations (1)</h1>
|
||||
{% for post_dir, post in db.items() %}
|
||||
<div class="post">
|
||||
<pre>---</pre>
|
||||
<strong>{{ post.frontmatter.title }}</strong>
|
||||
<div>
|
||||
<a href="{{ post_dir }}">aggregated</a>
|
||||
<a href="{{ post.frontmatter.original_link }}">source</a>
|
||||
</div>
|
||||
<small>{{ post.frontmatter.feed_name }}</small><br>
|
||||
<small>{{ post.frontmatter.date }}</small><br>
|
||||
<small>{{ post.frontmatter.author }}</small>
|
||||
<details>
|
||||
<summary>
|
||||
<small>post</small>
|
||||
</summary>
|
||||
<iframe src="{{ post_dir }}"></iframe>
|
||||
</details>
|
||||
</div>
|
||||
{% endfor %}
|
||||
|
||||
<br>
|
||||
<br>
|
||||
<hr>
|
||||
|
||||
<!-- -->
|
||||
<h1>circulations (2)</h1>
|
||||
<table id="circulations">
|
||||
<thead>
|
||||
<tr>
|
||||
<th onclick="sortTable(0)">title</th>
|
||||
<th onclick="sortTable(1)">post</th>
|
||||
<th onclick="sortTable(2)">feed</th>
|
||||
<th onclick="sortTable(3)">date</th>
|
||||
<th onclick="sortTable(4)">through</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for post_dir, post in db.items() %}
|
||||
<tr>
|
||||
<td>{{ post.frontmatter.title }}</td>
|
||||
<td>
|
||||
<a href="{{ post_dir }}">aggregated</a>
|
||||
<a href="{{ post.frontmatter.original_link }}">source</a>
|
||||
</td>
|
||||
<td>{{ post.frontmatter.feed_name }}</td>
|
||||
<td>{{ post.frontmatter.date }}</td>
|
||||
<td>{{ post.frontmatter.author }}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
|
||||
<script>
|
||||
// Sort the #circulations table by column n (0-based).
// Bubble-sorts the <tr> rows in place; a pass that produces no swaps
// while ascending flips the direction, so clicking the same header
// twice toggles between ascending and descending order.
function sortTable(n) {
  var table, rows, switching, i, x, y, shouldSwitch, dir, switchcount = 0;
  table = document.getElementById("circulations");
  switching = true;
  dir = "asc";
  while (switching) {
    switching = false;
    rows = table.rows;
    // start at 1 to skip the header row
    for (i = 1; i < (rows.length - 1); i++) {
      shouldSwitch = false;
      x = rows[i].getElementsByTagName("TD")[n];
      y = rows[i + 1].getElementsByTagName("TD")[n];
      // case-insensitive string comparison of the cell contents
      if (dir == "asc") {
        if (x.innerHTML.toLowerCase() > y.innerHTML.toLowerCase()) {
          shouldSwitch = true;
          break;
        }
      } else if (dir == "desc") {
        if (x.innerHTML.toLowerCase() < y.innerHTML.toLowerCase()) {
          shouldSwitch = true;
          break;
        }
      }
    }
    if (shouldSwitch) {
      // swap rows i and i+1, then rescan from the top
      rows[i].parentNode.insertBefore(rows[i + 1], rows[i]);
      switching = true;
      switchcount ++;
    } else {
      // no swap in a full ascending pass: reverse direction once
      if (switchcount == 0 && dir == "asc") {
        dir = "desc";
        switching = true;
      }
    }
  }
}
|
||||
</script>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
2
templates/post.template.html
Normal file
2
templates/post.template.html
Normal file
@ -0,0 +1,2 @@
|
||||
<small class="frontmatter">{{ frontmatter }}</small>
|
||||
<div class="post">{{ content }}</div>
|
Loading…
Reference in New Issue
Block a user