first version, crawls only the announced peers

2018-05-30 08:20:46 +02:00 · 2018-05-30 08:20:46 +02:00 · abbb8a6dd7
commit abbb8a6dd7
2 changed files with 6338 additions and 0 deletions
--- a/fedicrawler.py
+++ b/fedicrawler.py
@ -0,0 +1,38 @@
+#!/bin/env python3
+# fediscraper v1
+
+import json, requests
+
+
+start_url = 'https://post.lurk.org'
+activity = ''
+peers_info = '/api/v1/instance/peers'
+instance_info = '/api/v1/instance'
+
+instances = {set([])}
+r = requests.get(start_url+peers_info)
+if r. status_code == 200:
+	print('200 for '+start_url)
+	peers = r.json()
+	print('{} has {} peers'.format(start_url, len(peers)))
+	for count, peer in enumerate(peers):
+			instances.add(peer)
+			try:
+				r = requests.get('https://'+peer+peers_info, timeout=10)
+				if r.status_code == 200:
+					print(peer, 'peers with', len(r.json()))
+					for i in r.json():
+						if i not in instances:
+							instances.add(i)
+							print('added {}, n={}'.format(i,len(instances)))
+				else:
+					print(i.status_code, 'on', peer)
+			except Exception as e:
+				print('failure for', peer)
+				# instances[peer] = {'error':e}
+				print(e)
+
+text = list(filter(None.__ne__, instances)) 
+
+with open('instance_scrape.txt','w') as f:
+	f.write('\n'.join(text))
--- a/instances.txt
+++ b/instances.txt