first version, crawls only the announced peers

7 years ago · abbb8a6dd7
2 changed files with 6338 additions and 0 deletions
--- a/fedicrawler.py
+++ b/fedicrawler.py
@ -0,0 +1,38 @@
 #!/usr/bin/env python3
 # fediscraper v1
 import json, requests
 # Crawl the peers announced by one fediverse instance, then the peers that
 # each of those peers announces, and write every hostname seen to
 # instance_scrape.txt (one hostname per line).
 start_url = 'https://post.lurk.org'
 activity = ''
 peers_info = '/api/v1/instance/peers'
 instance_info = '/api/v1/instance'
 # Hostnames discovered so far; a set so that duplicates collapse.
 instances = set()
 # Use the same timeout as the per-peer requests so an unresponsive start
 # instance cannot hang the script forever.
 r = requests.get(start_url+peers_info, timeout=10)
 if r.status_code == 200:
 	print('200 for '+start_url)
 	peers = r.json()
 	print('{} has {} peers'.format(start_url, len(peers)))
 	for peer in peers:
 		instances.add(peer)
 		# Best-effort crawl: any failure for a single peer (network error,
 		# invalid JSON, unexpected payload type) is reported and skipped so
 		# the remaining peers are still visited.
 		try:
 			r = requests.get('https://'+peer+peers_info, timeout=10)
 			if r.status_code == 200:
 				# Parse the response body once and reuse the result.
 				peer_list = r.json()
 				print(peer, 'peers with', len(peer_list))
 				for i in peer_list:
 					if i not in instances:
 						instances.add(i)
 						print('added {}, n={}'.format(i,len(instances)))
 			else:
 				print(r.status_code, 'on', peer)
 		except Exception as e:
 			print('failure for', peer)
 			print(e)
 else:
 	print(r.status_code, 'on', start_url)
 # Keep only real hostname strings: peer lists may contain null entries, and
 # str.join() rejects anything that is not a string. Sorting makes the output
 # file reproducible between runs.
 text = sorted(i for i in instances if isinstance(i, str) and i)
 with open('instance_scrape.txt','w', encoding='utf-8') as f:
 	f.write('\n'.join(text))
--- a/instances.txt
+++ b/instances.txt