#!/usr/bin/env python3
# fedicrawler v4

import json, requests
from multiprocessing.dummy import Pool as ThreadPool
from urllib.parse import urlparse

start_url = 'https://post.lurk.org'
activity = ''
peers_info = '/api/v1/instance/peers'

# route all requests through a local SOCKS5 proxy
proxies = {
    'http': 'socks5://localhost:12345',
    'https': 'socks5://localhost:12345'}

pool = ThreadPool(512)


def not_gab(instance):
    # gab does some weird stuff wrt enumerating subdomains
    # example: epa1pu1qcxxyzcxher0u.gab.best
    if instance:
        if 'gab.best' in instance:
            print('GAB', instance)
            return False
        # the ones below are mostly used for testing apps
        elif 'ngrok.io' in instance:
            print('NGROK', instance)
            return False
        elif 'glitch.me' in instance:
            print('GLITCH', instance)
            return False
        elif 'netlify.app' in instance:
            print('NETLIFY', instance)
            return False
        else:
            return True
    else:
        return False


def only_netloc(instance):
    # some peerlists return stuff like
    # mastodon.social/users/blabla or
    # domain.tld/friendica which are all invalid
    return urlparse('https://' + instance).netloc


def multi_filter(fs, l):
    # https://www.reddit.com/r/Python/comments/6xefvp/applying_multiple_filters_to_a_list/
    if not fs:
        return l
    return multi_filter(fs[1:], (x for x in l if fs[0](x)))


def get_peers(instance):
    # this works from the assumption that Mastodon & Pleroma instances combined
    # have enough view of the entire fediverse to rely only on those
    try:
        r = requests.get('https://' + instance + '/api/v1/instance/peers',
                         timeout=3, proxies=proxies)
        if r.status_code == 200:
            peers = r.json()
            print(instance, 'peers with', len(peers))
            return peers
        else:
            # 404s etc
            #print('fail: got {} on {}'.format(r.status_code, instance))
            return
    except Exception as e:
        # network errors etc
        #print('fail on', instance, e)
        #print(e)
        return


def get_nodeinfo(instance):
    nodeinfo_probe = '/.well-known/nodeinfo'
    try:
        # the well-known document points to the actual nodeinfo endpoint
        r = requests.get('https://{}{}'.format(instance, nodeinfo_probe),
                         timeout=3, proxies=proxies)
        if r.status_code == 200:
            nodeinfo_endpoint = r.json()['links'][0]['href']
            print(nodeinfo_endpoint)
            r = requests.get(nodeinfo_endpoint, timeout=3, proxies=proxies)
            if r.status_code == 200:
                info = {'nodeinfo': r.json()}
            else:
                info = {'error': r.status_code}
        else:
            info = {'error': r.status_code}
    except Exception as e:
        info = {'error': str(e)}
        #print(e)
    return info


def get_instance_info(instance):
    ## no longer used but keeping around for later maybe
    #instance_info = '/api/v1/instance'
    instance_info = '/.well-known/nodeinfo'
    try:
        r = requests.get('https://' + instance + instance_info,
                         timeout=10, proxies=proxies)
        if r.status_code == 200:
            info = r.json()
            print('info request for {} succeeded'.format(instance))
        elif r.status_code == 400:
            # try to see if it's peertube, probably should use a better method
            pt = requests.get('https://' + instance + '/api/v1/config')
            if pt.status_code == 200:
                print('info request for {} succeeded, peertube'.format(instance))
                info = pt.json()
            else:
                info = {'error': r.status_code}
        else:
            # if we get any other http code.. probably needs fixing
            info = {'error': r.status_code}
    except requests.exceptions.ConnectionError as e:
        info = {'error': 'Connection error: ' + str(e)}
    except Exception as e:
        info = {'error': str(e)}
    return info


filters = [not_gab, only_netloc]  # what to filter out

instances = set([])
r = requests.get(start_url + peers_info)  # normal brain, initial peer list
if r.status_code == 200:
    # expanding brain, get all peers of those initial peers
    start_peers = pool.map(get_peers, r.json())
    for i in start_peers:
        if i:  # skip instances that returned no peer list
            pool.map(instances.add, i)
    # for i in r.json():
    #     instances.add(i)

    instances = set(multi_filter(filters, instances))  # apply filters before we move to network
    # galaxy brain, get all peers of all peers of the initial peers
    network = pool.map(get_peers, instances)
    for peer_list in network:
        if peer_list:
            for instance in peer_list:
                if not_gab(instance):  # prevent gab.best subdomain enumeration
                    instances.add(instance)

instances = set(multi_filter(filters, instances))

instance_info = pool.map(get_nodeinfo, instances)

scrape = {}
instances_list = list(instances)
for count, value in enumerate(instances_list):
    scrape[value] = instance_info[count]

print('found {} instances'.format(len(scrape)))

pool.close()
pool.join()

with open('instance_scrape.json', 'w') as f:
    f.write(json.dumps(scrape, indent=4))
    #f.write('\n'.join(text))