updated with a way to get around gab.best enumeration and better error logging
This commit is contained in:
parent
ce613426d0
commit
b4c5d50a77
@ -1,5 +1,5 @@
|
|||||||
#!/bin/env python3
|
#!/bin/env python3
|
||||||
# fediscraper v2
|
# fediscraper v3
|
||||||
|
|
||||||
import json, requests
|
import json, requests
|
||||||
from multiprocessing.dummy import Pool as ThreadPool
|
from multiprocessing.dummy import Pool as ThreadPool
|
||||||
@ -8,7 +8,19 @@ start_url = 'https://post.lurk.org'
|
|||||||
activity = ''
|
activity = ''
|
||||||
peers_info ='/api/v1/instance/peers'
|
peers_info ='/api/v1/instance/peers'
|
||||||
|
|
||||||
pool = ThreadPool(256)
|
pool = ThreadPool(512)
|
||||||
|
|
||||||
|
def not_gab(instance):
    """Return True when *instance* is a usable hostname outside gab.best.

    gab does some weird stuff wrt enumerating subdomains (example:
    epa1pu1qcxxyzcxher0u.gab.best), so every host under that domain is
    filtered out of the crawl.

    Empty, None, or non-string values return False as well, so callers can
    use this as a single "should I keep this peer" predicate.
    """
    if not instance or not isinstance(instance, str):
        return False

    # Hostnames are case-insensitive and may carry a port or a trailing dot;
    # normalise before comparing so those variants are still filtered.
    host = instance.lower().split(':', 1)[0].rstrip('.')

    # Match on a label boundary so unrelated domains that merely contain the
    # text "gab.best" (e.g. gab.bestiary.example) are not discarded.
    # TODO filter ngrok
    return not (host == 'gab.best' or host.endswith('.gab.best'))
|
||||||
|
|
||||||
|
|
||||||
def get_peers(instance):
|
def get_peers(instance):
|
||||||
@ -18,7 +30,6 @@ def get_peers(instance):
|
|||||||
peers = r.json()
|
peers = r.json()
|
||||||
print(instance, 'peers with', len(peers))
|
print(instance, 'peers with', len(peers))
|
||||||
return peers
|
return peers
|
||||||
|
|
||||||
else:
|
else:
|
||||||
print('fail: got {} on {}'.format(r.status_code, instance))
|
print('fail: got {} on {}'.format(r.status_code, instance))
|
||||||
return
|
return
|
||||||
@ -29,16 +40,29 @@ def get_peers(instance):
|
|||||||
|
|
||||||
def get_instance_info(instance):
    """Fetch the metadata document for one fediverse instance.

    Queries ``https://<instance>/api/v1/instance``. When that endpoint
    answers 400 the host is probed for ``/api/v1/config`` instead, on the
    guess that it is a PeerTube server.

    Returns the decoded JSON document on success, ``{'error': <status code>}``
    for a non-success HTTP status, ``{'error': <message>}`` when the request
    or JSON decoding raised, and ``None`` for hosts rejected by ``not_gab``
    (those are never contacted).
    """
    if not not_gab(instance):
        # Filtered hosts are skipped entirely; no request is made.
        return None

    instance_info = '/api/v1/instance'
    print('getting info for', instance)
    try:
        r = requests.get('https://' + instance + instance_info, timeout=10)
        if r.status_code == 200:
            info = r.json()
            print('info request for {} succeeded'.format(instance))
        elif r.status_code == 400:
            # try to see if its peertube, probably should use something better
            # The timeout keeps one unresponsive host from pinning a pool
            # thread forever.
            pt = requests.get('https://' + instance + '/api/v1/config',
                              timeout=10)
            # Inspect the fallback response (pt), not the original 400 (r).
            if pt.status_code == 200:
                info = pt.json()
                print('info request for {} succeeded, peertube'.format(instance))
            else:
                # Keep recording the status of the primary endpoint.
                info = {'error': r.status_code}
        else:
            info = {'error': r.status_code}
    except Exception as e:
        # Best-effort scrape: any network or decoding failure is recorded in
        # the result instead of aborting the whole pool.map run.
        print('failed to query instance info')
        info = {'error': str(e)}
    return info
|
||||||
|
|
||||||
|
|
||||||
instances = set([])
|
instances = set([])
|
||||||
@ -55,7 +79,8 @@ if r. status_code == 200:
|
|||||||
for peer_list in network:
|
for peer_list in network:
|
||||||
if peer_list:
|
if peer_list:
|
||||||
for instance in peer_list:
|
for instance in peer_list:
|
||||||
instances.add(instance)
|
if not_gab(instance):
|
||||||
|
instances.add(instance)
|
||||||
|
|
||||||
|
|
||||||
instance_info = pool.map(get_instance_info, instances)
|
instance_info = pool.map(get_instance_info, instances)
|
||||||
@ -76,4 +101,4 @@ pool.join()
|
|||||||
|
|
||||||
with open('instance_scrape.json','w') as f:
|
with open('instance_scrape.json','w') as f:
|
||||||
f.write(json.dumps(scrape,indent=4))
|
f.write(json.dumps(scrape,indent=4))
|
||||||
#f.write('\n'.join(text)
|
#f.write('\n'.join(text)
|
||||||
|
Loading…
Reference in New Issue
Block a user