"""Download the audio attachments of every #looptober post on post.lurk.org.

The script pages through the public tag timeline, keeps the posts of the
configured year that carry media attachments, and stores each post's media
in its own ``<creator>_<id>`` folder below ``output_dir``.
"""
import datetime
import os
import shutil
from time import sleep
from urllib.parse import urlparse

import requests

# Directory that receives one sub-folder per post.
output_dir = "/home/r/Programming/radio-looptober/loops"

# Only posts created in this year (UTC) are kept.
looptober_year = 2022


def grab_media(path, url, filename=None):
    """Download ``url`` into the directory ``path`` unless it is already there.

    Args:
        path: Existing directory the media file is written to.
        url: Location of the media file.
        filename: Optional local file name. When omitted, the last segment
            of the URL path is used.

    Returns:
        The local file name when a download happened, otherwise ``None``
        (file already present, or the server answered with an error status).
    """
    media_item = filename or urlparse(url).path.split('/')[-1]
    target = os.path.join(path, media_item)
    if os.path.exists(target):
        return None

    headers = {
        'User-Agent': 'https://git.vvvvvvaria.org/rra/radio-looptober',
        'From': 'post.lurk.org/@lurk'  # This is another valid field
    }

    response = requests.get(url, headers=headers, stream=True)
    try:
        if not response.ok:
            return None
        # Have urllib3 undo any Content-Encoding so the bytes written to
        # disk are the media itself and not a compressed transfer body.
        response.raw.decode_content = True
        # Write to a temporary name first: an interrupted download must not
        # leave a truncated file that the existence check above would then
        # skip on every later run.
        partial = target + ".part"
        with open(partial, 'wb') as media_file:
            shutil.copyfileobj(response.raw, media_file)
        os.replace(partial, target)
    finally:
        response.close()

    print('Downloaded media {} from {}'.format(media_item, urlparse(url).netloc))
    return media_item


def _parse_created_at(stamp):
    """Return the ``created_at`` string of a post as an aware UTC datetime.

    A trailing ``Z`` is read as UTC; a stamp without any offset is also
    taken to be UTC instead of the machine's local time zone.
    """
    if stamp.endswith("Z"):
        stamp = stamp[:-1] + "+00:00"
    parsed = datetime.datetime.fromisoformat(stamp)
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=datetime.timezone.utc)
    return parsed.astimezone(datetime.timezone.utc)


# This pages through all the looptober tag and collects the json in 'data'
url = "https://post.lurk.org/api/v1/timelines/tag/looptober"
data = []
while url:
    print("downloading", url)
    r = requests.get(url)
    print("response status: ", r.status_code)
    if not r.ok:
        break

    if r.content:
        data.append(r.json())
        print("amount of pages:", len(data))
        sleep(0.5)

    # The Link header may be present without a "next" relation (e.g. only
    # "prev"), so look it up defensively instead of indexing.
    next_link = r.links.get("next")
    if next_link:
        url = next_link["url"]
        print("found next url", url)
    else:
        print("no more data")
        url = None

# this parses all the json, taking a few valuable fields and puts them in looooops
looooops = []
for collection in data:
    for post in collection:
        # we only take entries that actually contain a sound file
        if not post["media_attachments"]:
            continue
        # we only take entries from this year
        if _parse_created_at(post["created_at"]).year != looptober_year:
            continue

        looooops.append({
            "url": post["url"],
            "description": post["content"],
            "audio": post["media_attachments"],
            "date": post["created_at"],
            "id": post["id"],
            "creator": post["account"]["username"],
        })
        print("found post by {} with {} looops".format(
            post["account"]["username"],
            len(post["media_attachments"])))

os.makedirs(output_dir, exist_ok=True)

for loop in looooops:
    path = os.path.join(output_dir, "{}_{}".format(loop['creator'], loop['id']))
    os.makedirs(path, exist_ok=True)

    print("\n")
    print("Downloading looops by ***{}***".format(loop['creator']))
    for attachment in loop['audio']:
        # Prefer the remote_url when one is set, else fall back to the
        # attachment's own url.
        url = attachment.get('remote_url') or attachment['url']
        grab_media(path, url)