Updated with a workaround for gab.best subdomain enumeration, and better error logging

This commit is contained in:
rra 2020-04-28 14:30:18 +02:00
parent ce613426d0
commit b4c5d50a77

View File

@ -1,5 +1,5 @@
#!/bin/env python3 #!/bin/env python3
# fediscraper v2 # fediscraper v3
import json, requests import json, requests
from multiprocessing.dummy import Pool as ThreadPool from multiprocessing.dummy import Pool as ThreadPool
@ -8,7 +8,19 @@ start_url = 'https://post.lurk.org'
activity = '' activity = ''
peers_info ='/api/v1/instance/peers' peers_info ='/api/v1/instance/peers'
pool = ThreadPool(256) pool = ThreadPool(512)
def not_gab(instance):
    """Return True when *instance* is a host name worth scraping.

    gab.best generates an effectively endless series of random subdomains
    (example: epa1pu1qcxxyzcxher0u.gab.best), so that domain and all of its
    subdomains are rejected.

    Returns False for empty, None or non-string values, and for gab.best
    hosts; True for every other host name.
    """
    # Peer lists are decoded from remote JSON, so entries may be None or
    # something other than a string; such entries cannot be queried.
    if not instance or not isinstance(instance, str):
        return False
    host = instance.strip().lower().rstrip('.')
    if not host:
        return False
    # Compare on the domain boundary so unrelated hosts that merely contain
    # the text 'gab.best' (e.g. 'notgab.best') are not filtered out.
    return not (host == 'gab.best' or host.endswith('.gab.best'))
#TODO filter ngrok
def get_peers(instance): def get_peers(instance):
@ -18,7 +30,6 @@ def get_peers(instance):
peers = r.json() peers = r.json()
print(instance, 'peers with', len(peers)) print(instance, 'peers with', len(peers))
return peers return peers
else: else:
print('fail: got {} on {}'.format(r.status_code, instance)) print('fail: got {} on {}'.format(r.status_code, instance))
return return
@ -29,16 +40,29 @@ def get_peers(instance):
def get_instance_info(instance):
    """Fetch the instance metadata of *instance* over HTTPS.

    Queries ``/api/v1/instance`` first. When that answers with status 400 the
    host is probed at ``/api/v1/config`` instead, on the guess that it is a
    PeerTube server.

    Returns the decoded JSON body on success, ``{'error': <status code>}``
    when the instance endpoint does not yield usable data,
    ``{'error': <exception text>}`` when a request or the JSON decoding
    raises, and ``None`` for hosts rejected by ``not_gab``.
    """
    if not not_gab(instance):
        # Filtered hosts are never contacted.
        return None

    instance_info = '/api/v1/instance'
    print('getting info for', instance)
    try:
        r = requests.get('https://'+instance+instance_info, timeout=10)
        if r.status_code == 200:
            info = r.json()
            print('info request for {} succeeded'.format(instance))
        elif r.status_code == 400:
            #try to see if its peertube, probably should use something better
            # Same timeout as the first request so a silent host cannot
            # block a pool thread indefinitely.
            pt = requests.get('https://'+instance+'/api/v1/config', timeout=10)
            # Inspect the fallback response itself; r is known to be 400 here.
            if pt.status_code == 200:
                info = pt.json()
                print('info request for {} succeeded, peertube'.format(instance))
            else:
                info = {'error': r.status_code}
        else:
            info = {'error': r.status_code}
    except Exception as e:
        # Deliberately broad: one unreachable or misbehaving host must not
        # abort the whole scrape, so the failure is recorded instead.
        print('failed to query instance info')
        info = {'error': str(e)}
    return info
instances = set([]) instances = set([])
@ -55,7 +79,8 @@ if r. status_code == 200:
for peer_list in network: for peer_list in network:
if peer_list: if peer_list:
for instance in peer_list: for instance in peer_list:
instances.add(instance) if not_gab(instance):
instances.add(instance)
instance_info = pool.map(get_instance_info, instances) instance_info = pool.map(get_instance_info, instances)
@ -76,4 +101,4 @@ pool.join()
with open('instance_scrape.json','w') as f: with open('instance_scrape.json','w') as f:
f.write(json.dumps(scrape,indent=4)) f.write(json.dumps(scrape,indent=4))
#f.write('\n'.join(text) #f.write('\n'.join(text)