
now it actually downloads loops

main
commit 9a10f4f97a by rra, 2 years ago
download_loooooops.py (128 changed lines)
@@ -1,62 +1,96 @@
 import requests
 from time import sleep
+import datetime
+import os
+from urllib.parse import urlparse
+import shutil

 #def download_media(dir, url):
 # remote_url
 # description
+output_dir = "/home/r/Programming/radio-looptober/loops"
+
+def grab_media(path, url):
+    media_item = urlparse(url).path.split('/')[-1]
+    headers = {
+        'User-Agent': 'https://git.vvvvvvaria.org/rra/radio-looptober',
+        'From': 'post.lurk.org/@lurk'  # This is another valid field
+    }
+    if not os.path.exists(os.path.join(path, media_item)):
+        response = requests.get(url, headers=headers, stream=True)
+        if response.ok:
+            with open(os.path.join(path, media_item), 'wb') as media_file:
+                shutil.copyfileobj(response.raw, media_file)
+            print('Downloaded media {} from {}'.format(media_item, urlparse(url).netloc))
+    return media_item
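
A side note on the streaming download above: shutil.copyfileobj(response.raw, media_file) copies the raw socket stream, which bypasses requests' transparent decompression, so if a server ever serves the audio with gzip Content-Encoding the saved file would hold the compressed bytes. A minimal alternative (a sketch, not part of this commit) that lets requests handle the decoding:

    # Sketch: same effect as shutil.copyfileobj(response.raw, ...),
    # but iter_content() decodes gzip/deflate Content-Encoding for us.
    with open(os.path.join(path, media_item), 'wb') as media_file:
        for chunk in response.iter_content(chunk_size=8192):
            media_file.write(chunk)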
 #This pages through all the looptober tag and collects the json in 'data'
 there_is_more = True
 url = "https://post.lurk.org/api/v1/timelines/tag/looptober"
 data = []

 while there_is_more:
     print("downloading", url)
     r = requests.get(url)
-    print(r.status_code)
+    print("response status: ", r.status_code)
     if r.ok:
         if r.content:
             data.append(r.json())
-            print(len(data))
-            sleep(1)
+            print("amount of pages:", len(data))
+            sleep(0.5)
         if r.links:
             url = r.links["next"]["url"]
             print("found next url", url)
         else:
             print("no more data")
             there_is_more = False
             break
     else:
         break
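
The paging here relies on requests parsing the HTTP Link header into r.links; Mastodon paginates timelines with rel="next" and rel="prev" entries. One hazard: on the last page the server can still send a Link header that only contains rel="prev", in which case r.links is truthy but r.links["next"] raises KeyError. A more defensive check (a sketch, not in this commit) would be:

    # Sketch: guard against a Link header with no rel="next" entry,
    # which r.links["next"] would otherwise turn into a KeyError.
    if "next" in r.links:
        url = r.links["next"]["url"]
    else:
        print("no more data")
        there_is_more = False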
 #this parses all the json, taking a few valuable fields and puts them in looooops
 looooops = []
 for collection in data:
     for i in collection:
         if i["media_attachments"]: #we only take entries that actually contain a sound file
             creation_date = datetime.datetime.fromisoformat(
                 i['created_at'][:-1]).astimezone(
                     datetime.timezone.utc)

             if creation_date.strftime('%Y') == "2022": #we only take entries from this year
                 stuff = {}
                 stuff["url"] = i["url"]
                 stuff["description"] = i["content"]
                 stuff["audio"] = i["media_attachments"]
                 stuff["date"] = i["created_at"]
                 stuff["id"] = i["id"]
                 stuff["creator"] = i["account"]["username"]
                 looooops.append(stuff)
                 print("found post by {} with {} looops".format(
                     i["account"]["username"],
                     len(i["media_attachments"])))
-#for l in looooops:
-#     create a folder per l, named id
-#     download the files in media_attachments using the remote_url
-#     find a way to stuff metadata in the file
+if not os.path.exists(output_dir):
+    os.mkdir(output_dir)
+
+for l in looooops:
+    path = os.path.join(output_dir, "{}_{}".format(l['creator'], l['id']))
+    if not os.path.exists(path):
+        os.mkdir(path)
+
+    print("\n")
+    print("Downloading looops by ***{}***".format(l['creator']))
+    for a in l['audio']:
+        if a['remote_url']:
+            url = a['remote_url']
+        else:
+            url = a['url']
+        grab_media(path, url)
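
The old TODO about stuffing metadata into the file is dropped by this commit without being implemented. One low-tech option would be a sidecar text file per post, written inside the loop above (a sketch; "description.txt" is a hypothetical filename, not part of this commit):

    # Sketch: persist the post's HTML description next to the audio files.
    with open(os.path.join(path, "description.txt"), 'w') as f:
        f.write(l['description'])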
