#!/usr/bin/env python3

# fedicrawler v4
# crawls the fediverse by walking instance peer lists, starting from a
# single instance, and collects nodeinfo for every reachable instance

import json
import requests

from multiprocessing.dummy import Pool as ThreadPool
from urllib.parse import urlparse

start_url = 'https://post.lurk.org'

peers_info = '/api/v1/instance/peers'

# SOCKS5 proxy used for the crawling requests
proxies = {
    'http': 'socks5://localhost:12345',
    'https': 'socks5://localhost:12345'}

pool = ThreadPool(512)


def not_gab(instance):
    # gab does some weird stuff wrt enumerating subdomains
    # example: epa1pu1qcxxyzcxher0u.gab.best
    if instance:
        if 'gab.best' in instance:
            print('GAB', instance)
            return False
        # the ones below are mostly used for testing apps
        elif 'ngrok.io' in instance:
            print('NGROK', instance)
            return False
        elif 'glitch.me' in instance:
            print('GLITCH', instance)
            return False
        elif 'netlify.app' in instance:
            print('NETLIFY', instance)
            return False
        else:
            return True
    else:
        return False


def only_netloc(instance):
    # some peer lists return entries like
    # mastodon.social/users/blabla or
    # domain.tld/friendica, which are all invalid,
    # so only keep entries that are purely a network location
    netloc = urlparse('https://'+instance).netloc
    return bool(netloc) and netloc == instance


def multi_filter(fs, l):
    # recursively apply every filter in fs to the iterable l
    # https://www.reddit.com/r/Python/comments/6xefvp/applying_multiple_filters_to_a_list/
    if not fs:
        return l
    return multi_filter(fs[1:], (x for x in l if fs[0](x)))


def get_peers(instance):
    # this relies on the assumption that Mastodon & Pleroma instances combined
    # have a broad enough view of the fediverse to rely on their peer lists alone
    try:
        r = requests.get('https://'+instance+peers_info, timeout=3, proxies=proxies)
        if r.status_code == 200:
            peers = r.json()
            print(instance, 'peers with', len(peers))
            return peers
        else:
            # 404s etc
            #print('fail: got {} on {}'.format(r.status_code, instance))
            return
    except Exception as e:
        # network errors etc
        #print('fail on', instance, e)
        #print(e)
        return


def get_nodeinfo(instance):
    # probe the well-known nodeinfo document, then fetch the first
    # endpoint it advertises
    nodeinfo_probe = '/.well-known/nodeinfo'
    try:
        r = requests.get('https://{}{}'.format(instance, nodeinfo_probe), timeout=3, proxies=proxies)
        if r.status_code == 200:
            nodeinfo_endpoint = r.json()['links'][0]['href']
            print(nodeinfo_endpoint)
            r = requests.get(nodeinfo_endpoint, timeout=3, proxies=proxies)
            if r.status_code == 200:
                info = {'nodeinfo': r.json()}
            else:
                info = {'error': r.status_code}
        else:
            info = {'error': r.status_code}
    except Exception as e:
        info = {'error': str(e)}
        #print(e)
    return info


def get_instance_info(instance):
    ## no longer used but keeping around for later maybe
    #instance_info = '/api/v1/instance'
    instance_info = '/.well-known/nodeinfo'
    try:
        r = requests.get('https://'+instance+instance_info, timeout=10, proxies=proxies)
        if r.status_code == 200:
            info = r.json()
            print('info request for {} succeeded'.format(instance))
        elif r.status_code == 400:
            # try to see if it's peertube, probably should use a better method
            pt = requests.get('https://'+instance+'/api/v1/config', timeout=10, proxies=proxies)
            if pt.status_code == 200:
                print('info request for {} succeeded, peertube'.format(instance))
                info = pt.json()
            else:
                info = {'error': r.status_code}
        else:
            # if we get any other http code.. probably needs fixing
            info = {'error': r.status_code}
    except requests.exceptions.ConnectionError as e:
        info = {'error': 'Connection error: '+str(e)}
    except Exception as e:
        info = {'error': str(e)}
    return info


filters = [not_gab, only_netloc] # what to filter out

instances = set()

r = requests.get(start_url+peers_info) # normal brain, initial peer list

if r.status_code == 200:
    start_peers = pool.map(get_peers, r.json()) # expanding brain, get all peers of those initial peers
    for i in start_peers:
        # skip instances whose peer list could not be fetched
        if i:
            pool.map(instances.add, i)
    # for i in r.json():
    #     instances.add(i)

instances = set(multi_filter(filters, instances)) # apply filters before we move to network

network = pool.map(get_peers, instances) # galaxy brain, get all peers of all peers of the initial peers

for peer_list in network:
    if peer_list:
        for instance in peer_list:
            if not_gab(instance): # prevent gab.best subdomain enumeration
                instances.add(instance)

instances = set(multi_filter(filters, instances))

instance_info = pool.map(get_nodeinfo, instances)

scrape = {}
instances_list = list(instances)

# pool.map returned results in the set's iteration order and the set is not
# modified afterwards, so instance_info[count] lines up with instances_list[count]
for count, value in enumerate(instances_list):
    scrape[value] = instance_info[count]

print('found {} instances'.format(len(scrape)))

pool.close()
pool.join()

with open('instance_scrape.json', 'w') as f:
    f.write(json.dumps(scrape, indent=4))
    #f.write('\n'.join(text)