From b4c5d50a77f22c1d273b86ce6d3f77ef30906ded Mon Sep 17 00:00:00 2001 From: rra Date: Tue, 28 Apr 2020 14:30:18 +0200 Subject: [PATCH] updated with a way to get around gab.best enumeration and better error logging --- fedicrawler.py | 55 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 15 deletions(-) diff --git a/fedicrawler.py b/fedicrawler.py index 1838d36..710cbc8 100644 --- a/fedicrawler.py +++ b/fedicrawler.py @@ -1,5 +1,5 @@ #!/bin/env python3 -# fediscraper v2 +# fediscraper v3 import json, requests from multiprocessing.dummy import Pool as ThreadPool @@ -8,7 +8,19 @@ start_url = 'https://post.lurk.org' activity = '' peers_info ='/api/v1/instance/peers' -pool = ThreadPool(256) +pool = ThreadPool(512) + +def not_gab(instance): + #gab does some weird stuff wrt enumerating subdomains + #example: epa1pu1qcxxyzcxher0u.gab.best + if instance: + if 'gab.best'in instance: + return False + else: + return True + else: + return False + #TODO filter ngrok def get_peers(instance): @@ -18,7 +30,6 @@ def get_peers(instance): peers = r.json() print(instance, 'peers with', len(peers)) return peers - else: print('fail: got {} on {}'.format(r.status_code, instance)) return @@ -29,16 +40,29 @@ def get_peers(instance): def get_instance_info(instance): instance_info = '/api/v1/instance' - print('getting info for', instance) - try: - ii = requests.get('https://'+instance+instance_info, timeout=10) - info = ii.json() - print('info request for {} succeeded'.format(instance)) - except Exception as e: - print('failed to query instance info') - # print(e) - info = {'error': 'error'} - return info + if not_gab(instance): + print('getting info for', instance) + try: + r = requests.get('https://'+instance+instance_info, timeout=10) + if r.status_code == 200: + info = r.json() + print('info request for {} succeeded'.format(instance)) + elif r.status_code == 400: + #try to see if its peertube, probably should use something better + pt = 
requests.get('https://'+instance+'/api/v1/config') + if pt.status_code == 200: + print('info request for {} succeeded, peertube'.format(instance)) + info = pt.json() + else: + info = {'error': r.status_code} + else: + info = {'error': r.status_code} + + except Exception as e: + print('failed to query instance info') + # print(e) + info = {'error': str(e)} + return info instances = set([]) @@ -55,7 +79,8 @@ if r. status_code == 200: for peer_list in network: if peer_list: for instance in peer_list: - instances.add(instance) + if not_gab(instance): + instances.add(instance) instance_info = pool.map(get_instance_info, instances) @@ -76,4 +101,4 @@ pool.join() with open('instance_scrape.json','w') as f: f.write(json.dumps(scrape,indent=4)) - #f.write('\n'.join(text) \ No newline at end of file + #f.write('\n'.join(text)