rra
2 years ago
1 changed files with 81 additions and 47 deletions
@ -1,62 +1,96 @@ |
|||||
import requests |
import requests |
||||
from time import sleep |
from time import sleep |
||||
|
import datetime |
||||
|
import os |
||||
|
from urllib.parse import urlparse |
||||
|
import shutil |
||||
|
|
||||
# Absolute path under which every downloaded loop is collected,
# one sub-directory per post (created by the download loop below).
# NOTE(review): machine-specific path — adjust before running elsewhere.
output_dir = "/home/r/Programming/radio-looptober/loops"
|
def grab_media(path, url, filename=None):
    """Download the media file at *url* into directory *path*.

    The local file name is the last segment of the URL's path.  If a
    file with that name already exists in *path* the download is
    skipped, so the script can be re-run without re-fetching.

    Args:
        path: existing directory the file is written into.
        url: direct link to the media file.
        filename: unused; kept with a default for backward
            compatibility — the call site in this script passes only
            two arguments (the original required third argument made
            that call raise TypeError).

    Returns:
        The local file name derived from the URL.
    """
    media_item = urlparse(url).path.split('/')[-1]

    # Identify ourselves to the remote instance.
    headers = {
        'User-Agent': 'https://git.vvvvvvaria.org/rra/radio-looptober',
        'From': 'post.lurk.org/@lurk'  # This is another valid field
    }

    if not os.path.exists(os.path.join(path, media_item)):
        # stream=True: copy the body straight from the socket to disk
        # instead of buffering whole audio files in memory.
        response = requests.get(url, headers=headers, stream=True)
        if response.ok:
            with open(os.path.join(path, media_item), 'wb') as media_file:
                shutil.copyfileobj(response.raw, media_file)
            print('Downloaded media {} from {}'.format(
                media_item, urlparse(url).netloc))
    return media_item
||||
# This pages through all the looptober tag and collects the json in 'data'.
# The Mastodon API returns one page per request and advertises the next
# page in the HTTP Link header, which requests exposes as r.links.
there_is_more = True
url = "https://post.lurk.org/api/v1/timelines/tag/looptober"
data = []
while there_is_more:
    print("downloading", url)
    r = requests.get(url)
    print("response status: ", r.status_code)
    if r.ok:
        if r.content:
            data.append(r.json())
            print("amount of pages:", len(data))
            sleep(0.5)  # be polite to the instance between requests

        # BUG FIX: the last page's Link header may carry only a
        # rel="prev" entry, so r.links can be truthy while holding no
        # "next" key — indexing it unconditionally raised KeyError.
        if "next" in r.links:
            url = r.links["next"]["url"]
            print("found next url", url)
        else:
            print("no more data")
            there_is_more = False
            break
    else:
        break
||||
# This parses all the json, taking a few valuable fields and puts them
# in looooops.  Only posts from this year's Looptober are kept.
LOOPTOBER_YEAR = "2022"

looooops = []
for collection in data:
    for i in collection:
        if i["media_attachments"]:  # we only take entries that actually contain a sound file
            # created_at is ISO-8601 with a trailing "Z"; strip it so
            # fromisoformat() (which rejects "Z" before Python 3.11)
            # can parse it, then normalize to UTC.
            creation_date = datetime.datetime.fromisoformat(
                i['created_at'][:-1]).astimezone(
                    datetime.timezone.utc)

            if creation_date.strftime('%Y') == LOOPTOBER_YEAR:
                stuff = {}
                stuff["url"] = i["url"]
                stuff["description"] = i["content"]
                stuff["audio"] = i["media_attachments"]
                stuff["date"] = i["created_at"]
                stuff["id"] = i["id"]
                stuff["creator"] = i["account"]["username"]
                looooops.append(stuff)
                print("found post by {} with {} looops".format(
                    i["account"]["username"],
                    len(i["media_attachments"])))
||||
|
|
||||
|
if not os.path.exists(output_dir): |
||||
#for l in looooops: |
os.mkdir(output_dir) |
||||
# create a folder per l, named id |
|
||||
# download the files in media_attachments using the remote_url |
for l in looooops: |
||||
# find a way to stuff metadata in the file |
path = os.path.join(output_dir,"{}_{}".format(l['creator'], l['id'])) |
||||
|
if not os.path.exists(path): |
||||
|
os.mkdir(path) |
||||
|
|
||||
|
print("\n") |
||||
|
print("Downloading looops by ***{}***".format(l['creator'])) |
||||
|
for a in l['audio']: |
||||
|
if a['remote_url']: |
||||
|
url = a['remote_url'] |
||||
|
else: |
||||
|
url = a['url'] |
||||
|
|
||||
|
grab_media(path, url) |
||||
|
|
||||
|
Loading…
Reference in new issue