
now based primarily on nodeinfo2, add socksproxy, filtering out weird stuff

master · rra, 5 years ago · commit 1512278240
fedicrawler.py (119)

@@ -3,11 +3,16 @@
 import json, requests
 from multiprocessing.dummy import Pool as ThreadPool
+from urllib.parse import urlparse
 
 start_url = 'https://post.lurk.org'
 activity = ''
 peers_info ='/api/v1/instance/peers'
 
+proxies = {
+    'http':'socks5://localhost:12345',
+    'https':'socks5://localhost:12345'}
 
 pool = ThreadPool(512)
 
 def not_gab(instance):
@@ -15,75 +20,125 @@ def not_gab(instance):
     #example: epa1pu1qcxxyzcxher0u.gab.best
     if instance:
         if 'gab.best'in instance:
+            print('GAB', instance)
+            return False
+        elif 'ngrok.io' in instance:
+            print('NGROK', instance)
+            return False
+        elif 'glitch.me' in instance:
+            print('GLITCH', instance)
             return False
         else:
             return True
     else:
         return False
-#TODO filter ngrok
+
+def only_netloc(instance):
+    #some peerlists return stuff like
+    #mastodon.social/users/blabla or
+    #domain.tld/friendica which are all invalid
+    return urlparse('https://'+instance).netloc
+
+def multi_filter(fs, l):
+    # https://www.reddit.com/r/Python/comments/6xefvp/applying_multiple_filters_to_a_list/
+    if not fs:
+        return l
+    return multi_filter(fs[1:], (x for x in l if fs[0](x)))
+
 def get_peers(instance):
+    #this is really from the assumption that combined Mastodon & Pleroma
+    #instances have enough view of entire fediverse to rely only on those
     try:
-        r = requests.get('https://'+instance+'/api/v1/instance/peers', timeout=1)
+        r = requests.get('https://'+instance+'/api/v1/instance/peers', timeout=3, proxies=proxies)
         if r.status_code == 200:
             peers = r.json()
             print(instance, 'peers with', len(peers))
             return peers
         else:
-            print('fail: got {} on {}'.format(r.status_code, instance))
+            # 404s etc
+            #print('fail: got {} on {}'.format(r.status_code, instance))
             return
     except Exception as e:
-        print('fail on',instance, e)
-        # print(e)
+        #network errors etc
+        #print('fail on',instance, e)
+        #print(e)
         return
-def get_instance_info(instance):
-    instance_info = '/api/v1/instance'
-    if not_gab(instance):
-        print('getting info for', instance)
-        try:
-            r = requests.get('https://'+instance+instance_info, timeout=10)
-            if r.status_code == 200:
-                info = r.json()
-                print('info request for {} succeeded'.format(instance))
-            elif r.status_code == 400:
-                #try to see if its peertube, probably should use something better
-                pt = requests.get('https://'+instance+'/api/v1/config')
-                if r.status_code == 200:
-                    print('info request for {} succeeded, peertube'.format(instance))
-                    info = r.json()
-                else:
-                    info = {'error': r.status_code}
-            else:
-                info = {'error': r.status_code}
-        except Exception as e:
-            print('failed to query instance info')
-            # print(e)
-            info = {'error': str(e)}
-        return info
+def get_nodeinfo(instance):
+    nodeinfo_probe = '/.well-known/nodeinfo'
+    try:
+        r = requests.get('https://{}{}'.format(instance, nodeinfo_probe),timeout=3, proxies=proxies)
+        if r.status_code == 200:
+            nodeinfo_endpoint = r.json()['links'][0]['href']
+            print(nodeinfo_endpoint)
+            r = requests.get(nodeinfo_endpoint, timeout=3, proxies=proxies)
+            if r.status_code == 200:
+                info = {'nodeinfo':r.json()}
+            else:
+                info = {'error': r.status_code}
+        else:
+            info = {'error': r.status_code}
+    except Exception as e:
+        info = {'error': str(e)}
+        #print(e)
+    return info
+
+def get_instance_info(instance):
+    ## no longer used but keeping around for later maybe
+    #instance_info = '/api/v1/instance'
+    instance_info = '/.well-known/nodeinfo'
+    try:
+        r = requests.get('https://'+instance+instance_info, timeout=10, proxies=proxies)
+        if r.status_code == 200:
+            info = r.json()
+            print('info request for {} succeeded'.format(instance))
+        elif r.status_code == 400:
+            #try to see if its peertube, probably should use a better method
+            pt = requests.get('https://'+instance+'/api/v1/config')
+            if pt.status_code == 200:
+                print('info request for {} succeeded, peertube'.format(instance))
+                info = pt.json()
+            else:
+                info = {'error': r.status_code}
+        else:
+            #if we get any other http code.. probably needs fixing
+            info = {'error': r.status_code}
+    except ConnectionError as e:
+        info = {'error': 'Connection error: '+str(e)}
+    except Exception as e:
+        info = {'error': str(e)}
+    return info
+filters = [not_gab, only_netloc] #what to filter out
 
 instances = set([])
-r = requests.get(start_url+peers_info)
+r = requests.get(start_url+peers_info) # normal brain, initial peer list
 if r. status_code == 200:
-    start_peers = pool.map(get_peers, r.json())
+    start_peers = pool.map(get_peers, r.json()) #expanding brain, get all peers of those initial peers
     for i in start_peers:
         if not i:
             start_peers.remove(i)
         else:
             pool.map(instances.add, i)
-    # for i in r.json():
-    #     instances.add(i)
-    network = pool.map(get_peers, instances)
+    instances = set(multi_filter(filters,instances)) # apply filters before we move to network
+    network = pool.map(get_peers, instances) #galaxy brain, get all peers of all peers of the initial peers
     for peer_list in network:
         if peer_list:
             for instance in peer_list:
-                if not_gab(instance):
+                if not_gab(instance): #prevent gab.best subdomain enumeration
                     instances.add(instance)
+    instances = set(multi_filter(filters,instances))
 
-instance_info = pool.map(get_instance_info, instances)
+instance_info = pool.map(get_nodeinfo, instances)
 
 scrape = {}
@@ -97,8 +152,6 @@ print('found {} instances'.format(len(scrape)))
 pool.close()
 pool.join()
 
 with open('instance_scrape.json','w') as f:
     f.write(json.dumps(scrape,indent=4))
     #f.write('\n'.join(text)
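
As a note on the new approach: get_nodeinfo discovers an instance's metadata endpoint via /.well-known/nodeinfo and then fetches that endpoint, with every request routed through the SOCKS proxy. Below is a minimal standalone sketch of the same flow, assuming a SOCKS5 proxy listening on localhost:12345 as in the diff and that requests has SOCKS support installed (the requests[socks] extra); probe_nodeinfo is just an illustrative name, and post.lurk.org is simply the crawler's start instance.

    import requests

    # same proxy settings as the crawler; needs 'pip install requests[socks]'
    # (assumption: a SOCKS5 proxy is actually listening on localhost:12345)
    proxies = {
        'http': 'socks5://localhost:12345',
        'https': 'socks5://localhost:12345'}

    def probe_nodeinfo(instance):
        # step 1: the well-known document lists the real nodeinfo endpoint(s)
        r = requests.get('https://{}/.well-known/nodeinfo'.format(instance),
                         timeout=3, proxies=proxies)
        r.raise_for_status()
        endpoint = r.json()['links'][0]['href']
        # step 2: fetch the nodeinfo document itself (software, version, user counts)
        r = requests.get(endpoint, timeout=3, proxies=proxies)
        r.raise_for_status()
        return r.json()

    # e.g. probe the crawler's start instance
    print(probe_nodeinfo('post.lurk.org'))

One detail worth knowing: with the socks5:// scheme requests resolves hostnames locally, while socks5h:// resolves them through the proxy, which matters if the proxy is meant to hide DNS lookups as well.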
