diff --git a/fedicrawler.py b/fedicrawler.py
index 710cbc8..6aabf08 100644
--- a/fedicrawler.py
+++ b/fedicrawler.py
@@ -3,11 +3,16 @@
 import json, requests
 from multiprocessing.dummy import Pool as ThreadPool
+from urllib.parse import urlparse
 
 start_url = 'https://post.lurk.org'
 activity = ''
 peers_info = '/api/v1/instance/peers'
 
+proxies = {
+    'http': 'socks5://localhost:12345',
+    'https': 'socks5://localhost:12345'}
+
 pool = ThreadPool(512)
 
 def not_gab(instance):
@@ -15,75 +20,125 @@ def not_gab(instance):
     #example: epa1pu1qcxxyzcxher0u.gab.best
     if instance:
         if 'gab.best' in instance:
+            print('GAB', instance)
+            return False
+        elif 'ngrok.io' in instance:
+            print('NGROK', instance)
+            return False
+        elif 'glitch.me' in instance:
+            print('GLITCH', instance)
             return False
         else:
             return True
     else:
         return False
-    #TODO filter ngrok
 
+def only_netloc(instance):
+    #some peerlists return stuff like
+    #mastodon.social/users/blabla or
+    #domain.tld/friendica which are all invalid
+    return urlparse('https://'+instance).netloc == instance
+
+
+def multi_filter(fs, l):
+    # https://www.reddit.com/r/Python/comments/6xefvp/applying_multiple_filters_to_a_list/
+    if not fs:
+        return l
+    return multi_filter(fs[1:], (x for x in l if fs[0](x)))
+
 def get_peers(instance):
+    #this rests on the assumption that Mastodon & Pleroma instances combined
+    #have enough of a view of the entire fediverse to rely only on those
     try:
-        r = requests.get('https://'+instance+'/api/v1/instance/peers', timeout=1)
+        r = requests.get('https://'+instance+'/api/v1/instance/peers', timeout=3, proxies=proxies)
         if r.status_code == 200:
             peers = r.json()
             print(instance, 'peers with', len(peers))
             return peers
         else:
-            print('fail: got {} on {}'.format(r.status_code, instance))
+            # 404s etc.
+            #print('fail: got {} on {}'.format(r.status_code, instance))
             return
     except Exception as e:
-        print('fail on',instance, e)
-        # print(e)
+        #network errors etc.
+        #print('fail on', instance, e)
+        #print(e)
         return
 
-def get_instance_info(instance):
-    instance_info = '/api/v1/instance'
-    if not_gab(instance):
-        print('getting info for', instance)
-        try:
-            r = requests.get('https://'+instance+instance_info, timeout=10)
+def get_nodeinfo(instance):
+    nodeinfo_probe = '/.well-known/nodeinfo'
+    try:
+        r = requests.get('https://{}{}'.format(instance, nodeinfo_probe), timeout=3, proxies=proxies)
+        if r.status_code == 200:
+            nodeinfo_endpoint = r.json()['links'][0]['href']
+            print(nodeinfo_endpoint)
+            r = requests.get(nodeinfo_endpoint, timeout=3, proxies=proxies)
             if r.status_code == 200:
-                info = r.json()
-                print('info request for {} succeeded'.format(instance))
-            elif r.status_code == 400:
-                #try to see if its peertube, probably should use something better
-                pt = requests.get('https://'+instance+'/api/v1/config')
-                if r.status_code == 200:
-                    print('info request for {} succeeded, peertube'.format(instance))
-                    info = r.json()
-                else:
-                    info = {'error': r.status_code}
+                info = {'nodeinfo': r.json()}
+            else:
+                info = {'error': r.status_code}
+        else:
+            info = {'error': r.status_code}
+    except Exception as e:
+        info = {'error': str(e)}
+        #print(e)
+    return info
+
+
+def get_instance_info(instance):
+    ## no longer used but keeping around for later maybe
+    #instance_info = '/api/v1/instance'
+    instance_info = '/.well-known/nodeinfo'
+    try:
+        r = requests.get('https://'+instance+instance_info, timeout=10, proxies=proxies)
+        if r.status_code == 200:
+            info = r.json()
+            print('info request for {} succeeded'.format(instance))
+        elif r.status_code == 400:
+            #try to see if it's peertube, probably should use a better method
+            pt = requests.get('https://'+instance+'/api/v1/config')
+            if pt.status_code == 200:
+                print('info request for {} succeeded, peertube'.format(instance))
+                info = pt.json()
             else:
                 info = {'error': r.status_code}
-        except Exception as e:
-            print('failed to query instance info')
-            # print(e)
-            info = {'error': str(e)}
-        return info
+        else:
+            #if we get any other http code it probably needs fixing
+            info = {'error': r.status_code}
+    except requests.exceptions.ConnectionError as e:
+        info = {'error': 'Connection error: '+str(e)}
+    except Exception as e:
+        info = {'error': str(e)}
+    return info
 
+filters = [not_gab, only_netloc] #what to filter out
 
 instances = set([])
-r = requests.get(start_url+peers_info)
+r = requests.get(start_url+peers_info) # normal brain, initial peer list
 
 if r.status_code == 200:
-    start_peers = pool.map(get_peers, r.json())
+    start_peers = pool.map(get_peers, r.json()) #expanding brain, get all peers of those initial peers
     for i in start_peers:
         if not i:
             start_peers.remove(i)
         else:
             pool.map(instances.add, i)
+    # for i in r.json():
+    #     instances.add(i)
+    instances = set(multi_filter(filters,instances)) # apply filters before we move to network
 
-    network = pool.map(get_peers, instances)
+    network = pool.map(get_peers, instances) #galaxy brain, get all peers of all peers of the initial peers
 
     for peer_list in network:
         if peer_list:
             for instance in peer_list:
-                if not_gab(instance):
+                if not_gab(instance): #prevent gab.best subdomain enumeration
                     instances.add(instance)
+    instances = set(multi_filter(filters,instances))
 
-instance_info = pool.map(get_instance_info, instances)
+instance_info = pool.map(get_nodeinfo, instances)
 
 scrape = {}
 
@@ -97,8 +152,6 @@
 print('found {} instances'.format(len(scrape)))
 
 pool.close()
 pool.join()
-
-
 with open('instance_scrape.json','w') as f:
     f.write(json.dumps(scrape,indent=4))
     #f.write('\n'.join(text)
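Notes on the patch.

On the new proxies block: as originally written both dict keys were 'http', so the second entry silently overwrote the first and https:// requests bypassed the proxy entirely; the reconstruction above assumes one entry per scheme was intended. Two further caveats, sketched below: requests only speaks SOCKS if PySocks is installed (pip install requests[socks]), and the socks5h:// scheme variant resolves DNS through the proxy as well, which matters when crawling hosts your local resolver cannot see. Port 12345 stands in for whatever local SOCKS listener is running (for example an ssh -D 12345 dynamic forward).

    # requires: pip install requests[socks]
    import requests

    proxies = {
        'http':  'socks5h://localhost:12345',  # socks5h = DNS resolved by the proxy
        'https': 'socks5h://localhost:12345',  # plain socks5 resolves DNS locally
    }

    r = requests.get('https://post.lurk.org/api/v1/instance/peers',
                     timeout=3, proxies=proxies)
    print(r.status_code, len(r.json()))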
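On the filters: only_netloc in the submitted hunk returned urlparse('https://'+instance).netloc, which is a non-empty (hence truthy) string for nearly every entry, so used as a predicate in multi_filter it filtered out almost nothing. The reconstruction compares the parsed netloc against the raw entry instead, which drops anything carrying a path, matching the comment's stated intent. multi_filter itself composes the predicates by recursively nesting generator expressions, so nothing is evaluated until the result is consumed, hence the set(...) wrapped around each call. A minimal sketch with a made-up peer list:

    from urllib.parse import urlparse

    def not_gab(instance):
        return bool(instance) and 'gab.best' not in instance

    def only_netloc(instance):
        # true only for bare hostnames: a path component changes the string
        return urlparse('https://' + instance).netloc == instance

    def multi_filter(fs, l):
        if not fs:
            return l
        return multi_filter(fs[1:], (x for x in l if fs[0](x)))

    peers = ['post.lurk.org',
             'mastodon.social/users/blabla',  # has a path, dropped by only_netloc
             'xyz.gab.best']                  # gab subdomain, dropped by not_gab
    print(set(multi_filter([not_gab, only_netloc], peers)))
    # {'post.lurk.org'}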
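On get_nodeinfo: it trusts links[0] of the well-known discovery document, but servers can advertise several nodeinfo schema versions in any order. A hedged sketch of a more deliberate pick follows; pick_nodeinfo_href is a hypothetical helper, the URL shape follows the nodeinfo spec, and post.lurk.org is just the crawl's seed host:

    import requests

    def pick_nodeinfo_href(instance, timeout=3):
        # fetch the discovery document and return the href of the
        # newest schema version the host advertises
        r = requests.get('https://{}/.well-known/nodeinfo'.format(instance),
                         timeout=timeout)
        links = r.json().get('links', [])
        if not links:
            return None
        # rel looks like .../ns/schema/2.0 -> sort on the version suffix
        links.sort(key=lambda lnk: lnk.get('rel', '').rsplit('/', 1)[-1])
        return links[-1]['href']

    print(pick_nodeinfo_href('post.lurk.org'))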
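One pre-existing wrinkle the commit leaves untouched (visible in the context lines above): the loop over start_peers calls start_peers.remove(i) while iterating over the same list, which makes the iterator skip the element following each removal, so some None results from failed fetches can still reach the else branch. Since the removal only exists to skip falsy results, a sketch of the safer equivalent:

    start_peers = [p for p in start_peers if p]  # drop failed fetches (None) first
    for i in start_peers:
        pool.map(instances.add, i)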