now based primarily on nodeinfo2, add socksproxy, filtering out weird stuff
This commit is contained in:
parent 2ab9879118
commit 1512278240

fedicrawler.py (119 lines changed)
@@ -3,11 +3,16 @@
import json, requests
from multiprocessing.dummy import Pool as ThreadPool
from urllib.parse import urlparse

start_url = 'https://post.lurk.org'
activity = ''
peers_info = '/api/v1/instance/peers'

proxies = {
    'http': 'socks5://localhost:12345',
    'https': 'socks5://localhost:12345'}
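
# Editor's sketch, not part of this commit: requests only speaks SOCKS when
# the PySocks extra is installed (pip install "requests[socks]"), and the
# socks5h:// scheme resolves DNS on the proxy side instead of locally.
# A hypothetical self-test of the proxy config above:
def _check_proxy(url='https://post.lurk.org/api/v1/instance/peers'):
    # uses the module-level proxies defined just above
    r = requests.get(url, proxies=proxies, timeout=3)
    print('proxy check:', r.status_code)
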
pool = ThreadPool(512)

def not_gab(instance):
@@ -15,75 +20,125 @@ def not_gab(instance):
    #example: epa1pu1qcxxyzcxher0u.gab.best
    if instance:
        if 'gab.best' in instance:
            print('GAB', instance)
            return False
        elif 'ngrok.io' in instance:
            print('NGROK', instance)
            return False
        elif 'glitch.me' in instance:
            print('GLITCH', instance)
            return False
        else:
            return True
    else:
        return False
    #TODO filter ngrok

def only_netloc(instance):
    #some peerlists return stuff like
    #mastodon.social/users/blabla or
    #domain.tld/friendica which are all invalid
    return urlparse('https://'+instance).netloc


def multi_filter(fs, l):
    # https://www.reddit.com/r/Python/comments/6xefvp/applying_multiple_filters_to_a_list/
    if not fs:
        return l
    return multi_filter(fs[1:], (x for x in l if fs[0](x)))
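
# Editor's illustration, not part of this commit: multi_filter threads a
# sequence through each predicate in turn and ultimately returns a generator,
# which is why the callers below wrap it in set(). With made-up peer entries:
def _demo_filters():
    peers = ['post.lurk.org', 'epa1pu1qcxxyzcxher0u.gab.best', '', None]
    kept = list(multi_filter([not_gab, only_netloc], peers))
    print(kept)  # ['post.lurk.org'] -- the gab.best host, '' and None are dropped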

def get_peers(instance):
    #this is really from the assumption that combined Mastodon & Pleroma
    #instances have enough view of entire fediverse to rely only on those
    try:
        r = requests.get('https://'+instance+'/api/v1/instance/peers', timeout=1)
        r = requests.get('https://'+instance+'/api/v1/instance/peers', timeout=3, proxies=proxies)
        if r.status_code == 200:
            peers = r.json()
            print(instance, 'peers with', len(peers))
            return peers
        else:
            print('fail: got {} on {}'.format(r.status_code, instance))
            # 404s etc
            #print('fail: got {} on {}'.format(r.status_code, instance))
            return
    except Exception as e:
        print('fail on',instance, e)
        # print(e)
        #network errors etc
        #print('fail on',instance, e)
        #print(e)
        return

def get_instance_info(instance):
    instance_info = '/api/v1/instance'
    if not_gab(instance):
        print('getting info for', instance)
        try:
            r = requests.get('https://'+instance+instance_info, timeout=10)
def get_nodeinfo(instance):
    nodeinfo_probe = '/.well-known/nodeinfo'
    try:
        r = requests.get('https://{}{}'.format(instance, nodeinfo_probe),timeout=3, proxies=proxies)
        if r.status_code == 200:
            nodeinfo_endpoint = r.json()['links'][0]['href']
            print(nodeinfo_endpoint)
            r = requests.get(nodeinfo_endpoint, timeout=3, proxies=proxies)
            if r.status_code == 200:
                info = r.json()
                print('info request for {} succeeded'.format(instance))
            elif r.status_code == 400:
                #try to see if its peertube, probably should use something better
                pt = requests.get('https://'+instance+'/api/v1/config')
                if r.status_code == 200:
                    print('info request for {} succeeded, peertube'.format(instance))
                    info = r.json()
                else:
                    info = {'error': r.status_code}
                info = {'nodeinfo':r.json()}
            else:
                info = {'error': r.status_code}
        else:
            info = {'error': r.status_code}
    except Exception as e:
        info = {'error': str(e)}
        #print(e)
    return info

        except Exception as e:
            print('failed to query instance info')
            # print(e)
            info = {'error': str(e)}
        return info


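# Editor's sketch, not part of this commit: the /.well-known/nodeinfo probe
# used by get_nodeinfo() returns a small JSON document shaped roughly like
#   {"links": [{"rel": "http://nodeinfo.diaspora.software/ns/schema/2.0",
#               "href": "https://example.instance/nodeinfo/2.0"}]}
# links[0] usually works, but an instance can advertise several schema
# versions; a hypothetical helper that prefers a 2.x document:
def _pick_nodeinfo_href(wellknown):
    links = wellknown.get('links', [])
    for link in links:
        if '/ns/schema/2' in link.get('rel', ''):
            return link.get('href')
    return links[0]['href'] if links else None
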
def get_instance_info(instance):
    ## no longer used but keeping around for later maybe
    #instance_info = '/api/v1/instance'
    instance_info = '/.well-known/nodeinfo'
    try:
        r = requests.get('https://'+instance+instance_info, timeout=10, proxies=proxies)
        if r.status_code == 200:
            info = r.json()
            print('info request for {} succeeded'.format(instance))
        elif r.status_code == 400:
            #try to see if its peertube, probably should use a better method
            pt = requests.get('https://'+instance+'/api/v1/config')
            if pt.status_code == 200:
                print('info request for {} succeeded, peertube'.format(instance))
                info = pt.json()
            else:
                info = {'error': r.status_code}
        else:
            #if we get any other http code.. probably needs fixing
            info = {'error': r.status_code}

    except ConnectionError as e:
        info = {'error': 'Connection error: '+str(e)}
    except Exception as e:
        info = {'error': str(e)}
    return info

filters = [not_gab, only_netloc] #what to filter out

instances = set([])
r = requests.get(start_url+peers_info)
r = requests.get(start_url+peers_info) # normal brain, initial peer list
if r.status_code == 200:
    start_peers = pool.map(get_peers, r.json())
    start_peers = pool.map(get_peers, r.json()) #expanding brain, get all peers of those initial peers
    for i in start_peers:
        if not i:
            start_peers.remove(i)
        else:
            pool.map(instances.add, i)
    # for i in r.json():
    #     instances.add(i)
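    # Editor's note, not part of this commit: list.remove() inside the loop
    # above mutates start_peers while it is being iterated, so the entry after
    # each removed one is skipped. A non-mutating way to express the same step:
    #     for i in (p for p in start_peers if p):
    #         pool.map(instances.add, i)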

instances = set(multi_filter(filters,instances)) # apply filters before we move to network

network = pool.map(get_peers, instances)
network = pool.map(get_peers, instances) #galaxy brain, get all peers of all peers of the initial peers
for peer_list in network:
    if peer_list:
        for instance in peer_list:
            if not_gab(instance):
            if not_gab(instance): #prevent gab.best subdomain enumeration
                instances.add(instance)

instances = set(multi_filter(filters,instances))

instance_info = pool.map(get_instance_info, instances)
instance_info = pool.map(get_nodeinfo, instances)

scrape = {}

@@ -97,8 +152,6 @@ print('found {} instances'.format(len(scrape)))
pool.close()
pool.join()



with open('instance_scrape.json','w') as f:
    f.write(json.dumps(scrape,indent=4))
    #f.write('\n'.join(text)
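
# Editor's note, not part of this commit: assuming scrape maps each instance
# domain to the dict returned by get_nodeinfo() (that assignment happens in
# code outside this diff), instance_scrape.json would look roughly like:
#   {
#       "post.lurk.org": {"nodeinfo": {"version": "2.0", "software": {...}, ...}},
#       "unreachable.example": {"error": "..."}
#   }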