|
|
|
#!/bin/env python3
|
|
|
|
|
|
|
|
import requests
|
|
|
|
from time import sleep
|
|
|
|
import datetime
|
|
|
|
import os
|
|
|
|
from urllib.parse import urlparse
|
|
|
|
import shutil
|
|
|
|
|
|
|
|
#def download_media(dir, url):
|
|
|
|
# remote_url
|
|
|
|
# description
|
|
|
|
|
|
|
|
# Root directory for downloads; the script creates one sub-directory per post
# ("<creator>_<id>") underneath it.
# NOTE(review): hard-coded absolute path -- adjust before running on another machine.
output_dir = "/home/r/Programming/radio-looptober/loops"
|
|
|
|
|
|
|
|
def grab_media(path, url, filename=None):
    """Download the media file at *url* into the directory *path*.

    The local file name is the last segment of the URL path. No request is
    made when a file with that name already exists in *path*.

    Args:
        path: Existing directory the file is written to.
        url: Address of the media file.
        filename: Not used. It is kept (now optional) so that both
            two-argument and three-argument calls work.

    Returns:
        The local file name after a successful download, otherwise ``None``
        (file already present, or the server answered with an error status).
    """
    media_item = urlparse(url).path.split('/')[-1]
    target = os.path.join(path, media_item)

    if os.path.exists(target):
        return None

    headers = {
        'User-Agent': 'https://git.vvvvvvaria.org/rra/radio-looptober',
        'From': 'post.lurk.org/@lurk'  # This is another valid field
    }

    # The context manager releases the streamed connection even on errors;
    # the timeout keeps a stalled server from blocking the script forever.
    with requests.get(url, headers=headers, stream=True, timeout=30) as response:
        if not response.ok:
            return None

        # response.raw bypasses requests' transparent decoding unless asked.
        response.raw.decode_content = True

        # Write to a temporary name first: the exists-check above is the only
        # guard against re-downloading, so a truncated file must never end up
        # under the final name.
        partial = target + '.part'
        try:
            with open(partial, 'wb') as media_file:
                shutil.copyfileobj(response.raw, media_file)
            os.replace(partial, target)
        finally:
            if os.path.exists(partial):
                os.remove(partial)

    print('Downloaded media {} from {}'.format(media_item, urlparse(url).netloc))
    return media_item
|
|
|
|
|
|
|
|
# Page through the whole "looptober" tag timeline and collect the decoded JSON
# body of every page in `data` (one entry per page).
there_is_more = True
url = "https://post.lurk.org/api/v1/timelines/tag/looptober"
data = []
while there_is_more:
    print("downloading", url)
    # The timeout keeps a stalled server from blocking the script forever.
    r = requests.get(url, timeout=30)
    print("response status: ", r.status_code)
    if not r.ok:
        # Stop at the first failed request; pages fetched so far are kept.
        break

    if r.content:
        data.append(r.json())
        print("amount of pages:", len(data))

    # A Link header may list other relations (such as "prev") without "next",
    # so a non-empty r.links does not guarantee that a next page exists.
    next_link = r.links.get("next")
    if next_link is None:
        print("no more data")
        there_is_more = False
    else:
        url = next_link["url"]
        print("found next url", url)
        sleep(0.5)  # pause between page requests
|
|
|
|
|
|
|
|
# Walk over every fetched page and keep a few useful fields of each post that
# has media attachments and was created in 2022; the results go in `looooops`.
looooops = []
for collection in data:
    for i in collection:
        if not i["media_attachments"]:
            continue  # posts without attachments cannot contain a loop

        # The timestamp is expected to end in "Z" (UTC), which older
        # datetime.fromisoformat versions reject, so strip it if present.
        stamp = i['created_at']
        if stamp.endswith('Z'):
            stamp = stamp[:-1]
        creation_date = datetime.datetime.fromisoformat(stamp)
        if creation_date.tzinfo is None:
            # Label the naive value as UTC. Calling astimezone() on a naive
            # datetime would interpret it as local time and shift it by the
            # machine's UTC offset.
            creation_date = creation_date.replace(tzinfo=datetime.timezone.utc)
        else:
            creation_date = creation_date.astimezone(datetime.timezone.utc)

        if creation_date.year != 2022:
            continue  # we only take entries from this year

        stuff = {
            "url": i["url"],
            "description": i["content"],
            "audio": i["media_attachments"],
            "date": i["created_at"],
            "id": i["id"],
            "creator": i["account"]["username"],
        }
        looooops.append(stuff)
        print("found post by {} with {} looops".format(
            i["account"]["username"],
            len(i["media_attachments"])))
|
|
|
|
|
|
|
|
# Download the attachments of every collected post into
# "<output_dir>/<creator>_<id>/".
# makedirs(exist_ok=True) also creates missing parent directories and does not
# fail when the directory already exists.
os.makedirs(output_dir, exist_ok=True)

for loop in looooops:
    path = os.path.join(output_dir, "{}_{}".format(loop['creator'], loop['id']))
    os.makedirs(path, exist_ok=True)

    print("\n")
    print("Downloading looops by ***{}***".format(loop['creator']))
    for attachment in loop['audio']:
        # Use remote_url when it is set, otherwise fall back to url.
        url = attachment.get('remote_url') or attachment['url']

        # grab_media is declared with a third `filename` parameter; pass the
        # same name it derives from the URL so the call matches the signature.
        grab_media(path, url, urlparse(url).path.split('/')[-1])
|
|
|
|
|