now it actually downloads loops

2 years ago · 9a10f4f97a
1 changed files with 81 additions and 47 deletions
--- a/download_loooooops.py
+++ b/download_loooooops.py
@ -1,62 +1,96 @@
 import requests
 from time import sleep
-
+import datetime
+import os
+from urllib.parse import urlparse
+import shutil

 #def download_media(dir, url):
-#	remote_url
-#	description
+#   remote_url
+#   description
+
+output_dir = "/home/r/Programming/radio-looptober/loops"
+
+def grab_media(path, url, filename=None):
+    """Download the media file at `url` into the directory `path`.
+
+    The file is stored under `filename` when one is given, otherwise under
+    the last segment of the URL path. Nothing is fetched when a file of
+    that name already exists in `path`.
+
+    Returns the name of the stored file after a successful download, and
+    None when the download was skipped or the server answered with an
+    error status.
+    """
+    media_item = filename or urlparse(url).path.split('/')[-1]
+    target = os.path.join(path, media_item)

+    if os.path.exists(target):
+        return None
+
+    headers = {
+        'User-Agent': 'https://git.vvvvvvaria.org/rra/radio-looptober',
+        'From': 'post.lurk.org/@lurk'  # This is another valid field
+    }
+
+    # The context manager releases the streamed connection even on errors;
+    # the timeout keeps one stalled server from hanging the whole run.
+    with requests.get(url, headers=headers, stream=True, timeout=30) as response:
+        if not response.ok:
+            return None
+
+        # response.raw skips content decoding unless asked for it, which
+        # could store gzip/deflate bytes instead of the media itself.
+        response.raw.decode_content = True
+
+        # Write under a temporary name first so an interrupted download is
+        # not mistaken for a finished file on the next run.
+        partial = target + '.part'
+        with open(partial, 'wb') as media_file:
+            shutil.copyfileobj(response.raw, media_file)
+
+    os.replace(partial, target)
+    print('Downloaded media {} from {}'.format(media_item, urlparse(url).netloc))
+    return media_item

 #This pages through all the looptober tag and collects the json in 'data'
 there_is_more = True
 url = "https://post.lurk.org/api/v1/timelines/tag/looptober"
 data = []
 while there_is_more:
-	print("downloading", url)
-	r = requests.get(url)
-	print(r.status_code)
-	if r.ok:
-		if r.content:
-
-			data.append(r.json())
-			print(len(data))
-			sleep(1)
-
-			if r.links:
-				url = r.links["next"]["url"]
-				print("found next url", url)
-
-			else:
-				print("no more data")
-				there_is_more = False
-				break
-	else:
-		break
+    print("downloading", url)
+    r = requests.get(url)
+    print("response status: ", r.status_code)
+
+    # Stop on an error status or an empty body: requesting the same url
+    # again unchanged would otherwise loop forever.
+    if not r.ok or not r.content:
+        break
+
+    data.append(r.json())
+    print("amount of pages:", len(data))
+    sleep(0.5)
+
+    # r.links is keyed by link relation; do not assume "next" is present
+    # just because the response carried some Link header.
+    next_link = r.links.get("next")
+    if next_link is None:
+        print("no more data")
+        there_is_more = False
+        break
+
+    url = next_link["url"]
+    print("found next url", url)

 #this parses all the json, taking a few valuable fields and puts them in looooops
 looooops = []
 for collection in data:
-	for i in collection:
-		if i["media_attachments"]: #we only take entries that actually contain a sound file
- 			creation_date = datetime.datetime.fromisoformat(
-				i['created_at'][:-1]).astimezone(
-				datetime.timezone.utc)
-
-			if creation_date.strftime('%Y') == "2022": #we only take entries from this year
-				stuff = {}
-				stuff["url"] = i["url"]
-				stuff["description"] = i["content"]
-				stuff["audio"] = i["media_attachments"]
-				stuff["date"] = i["created_at"]
-				stuff["id"] = i["id"]
-				stuff["creator"] = i["account"]["username"] 
-				looooops.append(stuff)
-				print("found post by {} with {} looops".format(
-					i["account"]["username"],
-					len(i["media_attachments"])))
-
-
-#for l in looooops:
-	# create a folder per l, named id
-	# download the files in media_attachments using the remote_url
-	# find a way to stuff metadata in the file
+    for i in collection:
+        # we only take entries that actually contain a sound file
+        if not i["media_attachments"]:
+            continue
+
+        # The last character of created_at is dropped before parsing
+        # (presumably a trailing "Z" for UTC - confirm against the API).
+        # The parsed value is naive, so it is labelled as UTC explicitly;
+        # astimezone() would interpret it as machine-local time instead.
+        creation_date = datetime.datetime.fromisoformat(
+            i['created_at'][:-1]).replace(
+            tzinfo=datetime.timezone.utc)
+
+        # we only take entries from this year
+        if creation_date.year != 2022:
+            continue
+
+        stuff = {
+            "url": i["url"],
+            "description": i["content"],
+            "audio": i["media_attachments"],
+            "date": i["created_at"],
+            "id": i["id"],
+            "creator": i["account"]["username"],
+        }
+        looooops.append(stuff)
+        print("found post by {} with {} looops".format(
+            i["account"]["username"],
+            len(i["media_attachments"])))
+
+
+# Create the output directory, including any missing parents; exist_ok
+# makes a rerun against an existing directory a no-op.
+os.makedirs(output_dir, exist_ok=True)
+
+# One sub-directory per post, named "<creator>_<id>", holding its media.
+for loop in looooops:
+    path = os.path.join(output_dir, "{}_{}".format(loop['creator'], loop['id']))
+    os.makedirs(path, exist_ok=True)
+
+    print("\n")
+    print("Downloading looops by ***{}***".format(loop['creator']))
+    for attachment in loop['audio']:
+        # Prefer remote_url when it is set, falling back to url otherwise.
+        media_url = attachment['remote_url'] or attachment['url']
+
+        # grab_media takes the target file name as its third argument;
+        # use the last segment of the URL path.
+        media_name = urlparse(media_url).path.split('/')[-1]
+        grab_media(path, media_url, media_name)
+
+