# radio-looptober-lurk/download_loooooops.py

#!/usr/bin/env python3

import requests
from time import sleep
import datetime
import os
from urllib.parse import urlparse
import shutil

#def download_media(dir, url):
#   remote_url
#   description

# Directory that receives one sub-directory per post ("<creator>_<id>");
# it is created further down if it does not exist yet.
output_dir = "/home/r/Programming/radio-looptober/loops"

def grab_media(path, url, filename=None):
    """Download the media file at *url* into the directory *path*.

    The local file name is the last segment of the URL's path. Nothing is
    requested when a file with that name already exists in *path*.

    Args:
        path: Existing directory the file is written into.
        url: Address of the media file to fetch.
        filename: Ignored. Kept (now optional) so that both two- and
            three-argument calls work.

    Returns:
        The local file name when the file was downloaded, otherwise None
        (file already present, or the server answered with an error status).
    """
    media_item = urlparse(url).path.split('/')[-1]
    target = os.path.join(path, media_item)

    if os.path.exists(target):
        return None

    headers = {
        'User-Agent': 'https://git.vvvvvvaria.org/rra/radio-looptober',
        'From': 'post.lurk.org/@lurk',  # This is another valid field
    }

    # Download to a temporary name first: an interrupted transfer must not
    # leave a truncated file that the exists() check above would later
    # mistake for a finished download.
    partial = target + '.part'

    # The context manager releases the connection even if copying fails.
    with requests.get(url, headers=headers, stream=True) as response:
        if not response.ok:
            print('Could not download media {} from {} (status {})'.format(
                media_item, urlparse(url).netloc, response.status_code))
            return None

        # Without this the raw stream hands over gzip/deflate-compressed
        # bytes unchanged when the server uses Content-Encoding.
        response.raw.decode_content = True
        with open(partial, 'wb') as media_file:
            shutil.copyfileobj(response.raw, media_file)

    os.replace(partial, target)
    print('Downloaded media {} from {}'.format(media_item, urlparse(url).netloc))
    return media_item

# This pages through the whole looptober tag timeline and collects the decoded
# JSON of every page in 'data' (one list entry per page).
there_is_more = True
url = "https://post.lurk.org/api/v1/timelines/tag/looptober"
data = []
while there_is_more:
    print("downloading", url)
    r = requests.get(url)
    print("response status: ", r.status_code)

    # Stop on an error status or an empty body; looping again would only
    # repeat the very same request forever.
    if not r.ok or not r.content:
        there_is_more = False
        break

    data.append(r.json())
    print("amount of pages:", len(data))
    sleep(0.5)  # pause between requests

    # The Link header may carry only a "prev" relation, so "next" has to be
    # looked up defensively instead of indexed.
    next_link = r.links.get("next")
    if next_link:
        url = next_link["url"]
        print("found next url", url)
    else:
        print("no more data")
        there_is_more = False

# This walks over every collected page, keeps a few valuable fields of each
# matching post and stores them in 'looooops'.
looooops = []
for collection in data:
    for i in collection:
        # Only posts with at least one media attachment are kept.
        # NOTE(review): the attachment type is not checked here, so
        # non-audio media would pass as well - confirm this is intended.
        if not i["media_attachments"]:
            continue

        # 'created_at' is a UTC timestamp with a trailing "Z". Parse it as an
        # aware datetime; a naive one would be interpreted as local time by
        # astimezone() and could end up in the wrong year around New Year.
        created_at = i['created_at']
        if created_at.endswith('Z'):
            created_at = created_at[:-1] + '+00:00'
        creation_date = datetime.datetime.fromisoformat(created_at)
        if creation_date.tzinfo is None:
            creation_date = creation_date.replace(tzinfo=datetime.timezone.utc)
        creation_date = creation_date.astimezone(datetime.timezone.utc)

        if creation_date.year != 2022:  # we only take entries from 2022
            continue

        stuff = {
            "url": i["url"],
            "description": i["content"],
            "audio": i["media_attachments"],
            "date": i["created_at"],
            "id": i["id"],
            "creator": i["account"]["username"],
        }
        looooops.append(stuff)
        print("found post by {} with {} looops".format(
            i["account"]["username"],
            len(i["media_attachments"])))

# Create the output directory (including missing parents) and download every
# attachment of every collected post into "<output_dir>/<creator>_<id>/".
os.makedirs(output_dir, exist_ok=True)

for l in looooops:
    path = os.path.join(output_dir, "{}_{}".format(l['creator'], l['id']))
    os.makedirs(path, exist_ok=True)

    print("\n")
    print("Downloading looops by ***{}***".format(l['creator']))
    for a in l['audio']:
        # Use the remote_url when the attachment has one, otherwise fall
        # back to the url field.
        url = a.get('remote_url') or a['url']

        # grab_media is declared with three parameters; pass the file name
        # explicitly so the call matches its signature.
        filename = urlparse(url).path.split('/')[-1]
        grab_media(path, url, filename)
py3 shebang 2 years ago			`#!/bin/env python3`

initial commit, grabs all possible loops 2 years ago			`import requests`
			`from time import sleep`
now it actually downloads loops 2 years ago			`import datetime`
			`import os`
			`from urllib.parse import urlparse`
			`import shutil`
initial commit, grabs all possible loops 2 years ago
			`#def download_media(dir, url):`
now it actually downloads loops 2 years ago			`# remote_url`
			`# description`

			`output_dir = "/home/r/Programming/radio-looptober/loops"`

			`def grab_media(path, url, filename):`

			`media_item = urlparse(url).path.split('/')[-1]`
initial commit, grabs all possible loops 2 years ago
now it actually downloads loops 2 years ago			`headers = {`
			`'User-Agent': 'https://git.vvvvvvaria.org/rra/radio-looptober',`
			`'From': 'post.lurk.org/@lurk' # This is another valid field`
			`}`

			`if not os.path.exists(os.path.join(path, media_item)):`
			`response = requests.get(url, headers=headers, stream=True)`
			`if response.ok:`
			`with open(os.path.join(path, media_item), 'wb') as media_file:`
			`shutil.copyfileobj(response.raw, media_file)`
			`print('Downloaded media {} from {}'.format(media_item, urlparse(url).netloc))`
			`return media_item`
initial commit, grabs all possible loops 2 years ago
			`#This pages through all the looptober tag and collects the json in 'data'`
			`there_is_more = True`
			`url = "https://post.lurk.org/api/v1/timelines/tag/looptober"`
			`data = []`
			`while there_is_more:`
now it actually downloads loops 2 years ago			`print("downloading", url)`
			`r = requests.get(url)`
			`print("response status: ", r.status_code)`
			`if r.ok:`
			`if r.content:`

			`data.append(r.json())`
			`print("amount of pages:", len(data))`
			`sleep(0.5)`

			`if r.links:`
			`url = r.links["next"]["url"]`
			`print("found next url", url)`

			`else:`
			`print("no more data")`
			`there_is_more = False`
			`break`
			`else:`
			`break`
initial commit, grabs all possible loops 2 years ago
			`#this parses all the json, taking a few valuable fields and puts them in looooops`
			`looooops = []`
			`for collection in data:`
now it actually downloads loops 2 years ago			`for i in collection:`
			`if i["media_attachments"]: #we only take entries that actually contain a sound file`
			`creation_date = datetime.datetime.fromisoformat(`
			`i['created_at'][:-1]).astimezone(`
			`datetime.timezone.utc)`

			`if creation_date.strftime('%Y') == "2022": #we only take entries from this year`
			`stuff = {}`
			`stuff["url"] = i["url"]`
			`stuff["description"] = i["content"]`
			`stuff["audio"] = i["media_attachments"]`
			`stuff["date"] = i["created_at"]`
			`stuff["id"] = i["id"]`
			`stuff["creator"] = i["account"]["username"]`
			`looooops.append(stuff)`
			`print("found post by {} with {} looops".format(`
			`i["account"]["username"],`
			`len(i["media_attachments"])))`

			`if not os.path.exists(output_dir):`
			`os.mkdir(output_dir)`

			`for l in looooops:`
			`path = os.path.join(output_dir,"{}_{}".format(l['creator'], l['id']))`
			`if not os.path.exists(path):`
			`os.mkdir(path)`

			`print("\n")`
			`print("Downloading looops by *{}*".format(l['creator']))`
			`for a in l['audio']:`
			`if a['remote_url']:`
			`url = a['remote_url']`
			`else:`
			`url = a['url']`

			`grab_media(path, url)`