@@ -3,11 +3,16 @@
 import json, requests
 from multiprocessing.dummy import Pool as ThreadPool
+from urllib.parse import urlparse
 
 start_url = 'https://post.lurk.org'
 activity = ''
 peers_info ='/api/v1/instance/peers'
 
+proxies = {
+    'http':'socks5://localhost:12345',
+    'https':'socks5://localhost:12345'}
+
 
 pool = ThreadPool(512)
 
 def not_gab(instance):
@@ -15,75 +20,125 @@ def not_gab(instance):
     #example: epa1pu1qcxxyzcxher0u.gab.best
     if instance:
         if 'gab.best'in instance:
+            print('GAB', instance)
+            return False
+        elif 'ngrok.io' in instance:
+            print('NGROK', instance)
+            return False
+        elif 'glitch.me' in instance:
+            print('GLITCH', instance)
             return False
         else:
             return True
     else:
         return False
-#TODO filter ngrok
+
+def only_netloc(instance):
+    #some peerlists return stuff like
+    #mastodon.social/users/blabla or
+    #domain.tld/friendica which are all invalid
+    return urlparse('https://'+instance).netloc
+
+def multi_filter(fs, l):
+    # https://www.reddit.com/r/Python/comments/6xefvp/applying_multiple_filters_to_a_list/
+    if not fs:
+        return l
+    return multi_filter(fs[1:], (x for x in l if fs[0](x)))
 
 def get_peers(instance):
+    #this is really from the assumption that combined Mastodon & Pleroma
+    #instances have enough view of entire fediverse to rely only on those
     try:
-        r = requests.get('https://'+instance+'/api/v1/instance/peers', timeout=1)
+        r = requests.get('https://'+instance+'/api/v1/instance/peers', timeout=3, proxies=proxies)
         if r.status_code == 200:
             peers = r.json()
             print(instance, 'peers with', len(peers))
             return peers
         else:
-            print('fail: got {} on {}'.format(r.status_code, instance))
+            # 404s etc
+            #print('fail: got {} on {}'.format(r.status_code, instance))
             return
     except Exception as e:
-        print('fail on',instance, e)
-        # print(e)
+        #network errors etc
+        #print('fail on',instance, e)
+        #print(e)
         return
 
-def get_instance_info(instance):
-    instance_info = '/api/v1/instance'
-    if not_gab(instance):
-        try:
-            r = requests.get('https://'+instance+instance_info, timeout=10)
-            if r.status_code == 200:
-                info = r.json()
-                print('info request for {} succeeded'.format(instance))
-            elif r.status_code == 400:
-                #try to see if its peertube, probably should use something better
-                pt = requests.get('https://'+instance+'/api/v1/config')
-                if r.status_code == 200:
-                    print('info request for {} succeeded, peertube'.format(instance))
-                    info = r.json()
-                else:
-                    info = {'error': r.status_code}
-            else:
-                info = {'error': r.status_code}
-        except Exception as e:
-            info = {'error': str(e)}
-            # print(e)
-        return info
+def get_nodeinfo(instance):
+    nodeinfo_probe = '/.well-known/nodeinfo'
+    try:
+        print('getting info for', instance)
+        r = requests.get('https://{}{}'.format(instance, nodeinfo_probe),timeout=3, proxies=proxies)
+        if r.status_code == 200:
+            nodeinfo_endpoint = r.json()['links'][0]['href']
+            print(nodeinfo_endpoint)
+            r = requests.get(nodeinfo_endpoint, timeout=3, proxies=proxies)
+            if r.status_code == 200:
+                info = {'nodeinfo':r.json()}
+            else:
+                info = {'error': r.status_code}
+        else:
+            info = {'error': r.status_code}
+    except Exception as e:
+        info = {'error': str(e)}
+        #print(e)
+    return info
+
+def get_instance_info(instance):
+    ## no longer used but keeping aroudnd for later maybe
+    #instance_info = '/api/v1/instance'
+    instance_info = '/.well-known/nodeinfo'
+    try:
+        r = requests.get('https://'+instance+instance_info, timeout=10, proxies=proxies)
+        if r.status_code == 200:
+            info = r.json()
+            print('info request for {} succeeded'.format(instance))
+        elif r.status_code == 400:
+            #try to see if its peertube, probably should use a better method
+            pt = requests.get('https://'+instance+'/api/v1/config')
+            if pt.status_code == 200:
+                print('info request for {} succeeded, peertube'.format(instance))
+                info = pt.json()
+            else:
+                info = {'error': r.status_code}
+        else:
+            #if we get any other http code.. probably needs fixing
+            info = {'error': r.status_code}
+    except ConnectionError as e:
+        print('failed to query instance info')
+        info = {'error': 'Connection error: '+str(e)}
+        # print(e)
+    except Exception as e:
+        info = {'error': str(e)}
+    return info
 
+filters = [not_gab, only_netloc] #what to filter out
+
 instances = set([])
-r = requests.get(start_url+peers_info)
+r = requests.get(start_url+peers_info) # normal brain, initial peer list
 if r. status_code == 200:
-    start_peers = pool.map(get_peers, r.json())
+    start_peers = pool.map(get_peers, r.json()) #expanding brain, get all peers of those initial peers
     for i in start_peers:
         if not i:
             start_peers.remove(i)
         else:
             pool.map(instances.add, i)
-    # for i in r.json():
-    # instances.add(i)
+    instances = set(multi_filter(filters,instances)) # apply filters before we move to network
 
-    network = pool.map(get_peers, instances)
+    network = pool.map(get_peers, instances) #galaxy brain, get all peers of all peers of the initial peers
     for peer_list in network:
         if peer_list:
             for instance in peer_list:
-                if not_gab(instance):
+                if not_gab(instance): #prevent gab.best subdomain enumeration
                     instances.add(instance)
 
+instances = set(multi_filter(filters,instances))
+
-instance_info = pool.map(get_instance_info, instances)
+instance_info = pool.map(get_nodeinfo, instances)
 
 scrape = {}
 
@@ -97,8 +152,6 @@ print('found {} instances'.format(len(scrape)))
 pool.close()
 pool.join()
 
-
-
 with open('instance_scrape.json','w') as f:
     f.write(json.dumps(scrape,indent=4))
     #f.write('\n'.join(text)