doing applied fediverse research

#!/usr/bin/env python3
# fediscraper v3
import json

import requests
from multiprocessing.dummy import Pool as ThreadPool

start_url = 'https://post.lurk.org'
peers_info = '/api/v1/instance/peers'

# thread pool for concurrent HTTP requests; the crawl is I/O-bound
pool = ThreadPool(512)
def not_gab(instance):
    # gab does some weird stuff wrt enumerating subdomains,
    # example: epa1pu1qcxxyzcxher0u.gab.best
    # TODO: filter ngrok as well
    return bool(instance) and 'gab.best' not in instance
def get_peers(instance):
    # ask an instance for the list of domains it federates with
    try:
        r = requests.get('https://' + instance + peers_info, timeout=1)
        if r.status_code == 200:
            peers = r.json()
            print(instance, 'peers with', len(peers))
            return peers
        print('fail: got {} on {}'.format(r.status_code, instance))
    except Exception as e:
        print('fail on', instance, e)
    return None
def get_instance_info(instance):
    # query an instance's metadata endpoint
    instance_info = '/api/v1/instance'
    info = {'error': 'filtered'}
    if not_gab(instance):
        print('getting info for', instance)
        try:
            r = requests.get('https://' + instance + instance_info, timeout=10)
            if r.status_code == 200:
                info = r.json()
                print('info request for {} succeeded'.format(instance))
            elif r.status_code == 400:
                # a 400 may mean peertube, which serves its metadata at
                # /api/v1/config; probably should use something better
                pt = requests.get('https://' + instance + '/api/v1/config', timeout=10)
                if pt.status_code == 200:
                    print('info request for {} succeeded, peertube'.format(instance))
                    info = pt.json()
                else:
                    info = {'error': pt.status_code}
            else:
                info = {'error': r.status_code}
        except Exception as e:
            print('failed to query instance info for', instance)
            info = {'error': str(e)}
    return info
instances = set()
r = requests.get(start_url + peers_info)
if r.status_code == 200:
    # first hop: fetch the peer lists of every peer of the start instance
    start_peers = pool.map(get_peers, r.json())
    for peer_list in start_peers:
        if peer_list:
            instances.update(peer_list)

# second hop: ask every instance found so far for its own peers
network = pool.map(get_peers, list(instances))
for peer_list in network:
    if peer_list:
        for instance in peer_list:
            if not_gab(instance):
                instances.add(instance)

# fetch metadata for everything we found and key it by domain
instances_list = list(instances)
instance_info = pool.map(get_instance_info, instances_list)
scrape = dict(zip(instances_list, instance_info))

print('found {} instances'.format(len(scrape)))

pool.close()
pool.join()

with open('instance_scrape.json', 'w') as f:
    json.dump(scrape, f, indent=4)
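
Once the crawl finishes, instance_scrape.json maps every discovered domain to either its instance metadata or an {'error': ...} record. Below is a minimal sketch of reading that file back and summarizing the results; the 'title' field is an assumption based on Mastodon's /api/v1/instance response and may be absent on other server software.

#!/usr/bin/env python3
# summarize the scrape: count reachable instances and print a sample
import json

with open('instance_scrape.json') as f:
    scrape = json.load(f)

# the scraper above writes {'error': ...} for filtered or unreachable hosts
reachable = {k: v for k, v in scrape.items() if 'error' not in v}
print('{} of {} instances answered'.format(len(reachable), len(scrape)))

for domain, info in sorted(reachable.items())[:10]:
    # 'title' is assumed from Mastodon's instance API, hence the fallback
    print(domain, '-', info.get('title', 'no title'))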