|
@ -1,5 +1,5 @@ |
|
|
#!/bin/env python3 |
|
|
#!/bin/env python3 |
|
|
# fediscraper v2 |
|
|
# fediscraper v3 |
|
|
|
|
|
|
|
|
import json, requests |
|
|
import json, requests |
|
|
from multiprocessing.dummy import Pool as ThreadPool |
|
|
from multiprocessing.dummy import Pool as ThreadPool |
|
@ -8,7 +8,19 @@ start_url = 'https://post.lurk.org' |
|
|
activity = '' |
|
|
activity = '' |
|
|
peers_info ='/api/v1/instance/peers' |
|
|
peers_info ='/api/v1/instance/peers' |
|
|
|
|
|
|
|
|
pool = ThreadPool(256) |
|
|
pool = ThreadPool(512) |
|
|
|
|
|
|
|
|
|
|
|
def not_gab(instance):
    """Return True if ``instance`` is a usable, non-gab hostname.

    gab does some weird stuff wrt enumerating subdomains
    (example: epa1pu1qcxxyzcxher0u.gab.best), so every host under
    ``gab.best`` is rejected.

    Args:
        instance: Hostname of a peer as reported by another instance.
            May be empty or None.

    Returns:
        False for empty, missing or non-string values and for
        ``gab.best`` or any of its subdomains; True otherwise.
    """
    # Empty / None / non-string peers cannot be turned into a URL.
    if not instance or not isinstance(instance, str):
        return False

    # Hostnames are case-insensitive and may carry a trailing root dot.
    host = instance.strip().lower().rstrip('.')

    # Match on the domain suffix rather than a bare substring, so that
    # unrelated hosts such as 'gab.bestiary.example' are not dropped.
    return not (host == 'gab.best' or host.endswith('.gab.best'))
|
|
|
|
|
#TODO filter ngrok |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_peers(instance): |
|
|
def get_peers(instance): |
|
@ -18,7 +30,6 @@ def get_peers(instance): |
|
|
peers = r.json() |
|
|
peers = r.json() |
|
|
print(instance, 'peers with', len(peers)) |
|
|
print(instance, 'peers with', len(peers)) |
|
|
return peers |
|
|
return peers |
|
|
|
|
|
|
|
|
else: |
|
|
else: |
|
|
print('fail: got {} on {}'.format(r.status_code, instance)) |
|
|
print('fail: got {} on {}'.format(r.status_code, instance)) |
|
|
return |
|
|
return |
|
@ -29,16 +40,29 @@ def get_peers(instance): |
|
|
|
|
|
|
|
|
def get_instance_info(instance):
    """Fetch the public instance metadata for ``instance``.

    Queries ``/api/v1/instance`` over HTTPS. If that endpoint answers
    with HTTP 400 the host is probed for ``/api/v1/config`` instead, to
    see if it is PeerTube (probably should use something better).

    Args:
        instance: Hostname of the instance to query.

    Returns:
        The decoded JSON document on success, otherwise a dict of the
        form ``{'error': <status code or message>}``. Never raises, so
        one bad host cannot abort a ``pool.map`` over many hosts.
    """
    instance_info = '/api/v1/instance'

    # Filtered or unusable hostnames are reported as an error entry
    # instead of leaving ``info`` unbound.
    if not not_gab(instance):
        return {'error': 'skipped'}

    print('getting info for', instance)
    try:
        r = requests.get('https://' + instance + instance_info, timeout=10)
        if r.status_code == 200:
            info = r.json()
            print('info request for {} succeeded'.format(instance))
        elif r.status_code == 400:
            # try to see if its peertube, probably should use something better
            pt = requests.get('https://' + instance + '/api/v1/config',
                              timeout=10)
            # Inspect the PeerTube probe itself; ``r`` is known to be 400
            # in this branch, so testing it again could never succeed.
            if pt.status_code == 200:
                print('info request for {} succeeded, peertube'.format(instance))
                info = pt.json()
            else:
                info = {'error': r.status_code}
        else:
            info = {'error': r.status_code}
    except Exception as e:
        # Deliberately broad: this runs inside a worker thread and any
        # network, TLS or JSON failure should become an error entry.
        print('failed to query instance info')
        info = {'error': str(e)}

    return info
|
|
|
|
|
|
|
|
|
|
|
|
|
|
instances = set([]) |
|
|
instances = set([]) |
|
@ -55,7 +79,8 @@ if r. status_code == 200: |
|
|
for peer_list in network: |
|
|
for peer_list in network: |
|
|
if peer_list: |
|
|
if peer_list: |
|
|
for instance in peer_list: |
|
|
for instance in peer_list: |
|
|
instances.add(instance) |
|
|
if not_gab(instance): |
|
|
|
|
|
instances.add(instance) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
instance_info = pool.map(get_instance_info, instances) |
|
|
instance_info = pool.map(get_instance_info, instances) |
|
@ -76,4 +101,4 @@ pool.join() |
|
|
|
|
|
|
|
|
# Persist the collected scrape as pretty-printed JSON.
with open('instance_scrape.json', 'w') as out_file:
    json.dump(scrape, out_file, indent=4)
|
|
#f.write('\n'.join(text) |
|
|
#f.write('\n'.join(text) |
|
|