From 65ddc4905730a1b3393b658fa33cb7e1d11af251 Mon Sep 17 00:00:00 2001
From: rra
Date: Tue, 5 May 2020 16:28:37 +0200
Subject: [PATCH] working on making the script importable by about_collector

---
 fedicrawler.py | 91 ++++++++++++++++++++++++++------------------------
 1 file changed, 47 insertions(+), 44 deletions(-)

diff --git a/fedicrawler.py b/fedicrawler.py
index 811cd15..87f1511 100644
--- a/fedicrawler.py
+++ b/fedicrawler.py
@@ -1,20 +1,8 @@
 #!/bin/env python3
-# fedicrawler v4
+# fedicrawler v5
 
-import json, requests
-from multiprocessing.dummy import Pool as ThreadPool
 from urllib.parse import urlparse
 
-start_url = 'https://post.lurk.org'
-activity = ''
-peers_info ='/api/v1/instance/peers'
-
-proxies = {
-'http':'socks5://localhost:12345',
-'http':'socks5://localhost:12345'}
-
-pool = ThreadPool(512)
-
 def not_gab(instance):
     #gab does some weird stuff wrt enumerating subdomains
     #example: epa1pu1qcxxyzcxher0u.gab.best
@@ -117,43 +105,58 @@ def get_instance_info(instance):
         info = {'error': str(e)}
     return info
 
-filters = [not_gab, only_netloc] #what to filter out
+def fedicrawler():
 
-instances = set([])
-r = requests.get(start_url+peers_info) # normal brain, initial peer list
-if r. status_code == 200:
-    start_peers = pool.map(get_peers, r.json()) #expanding brain, get all peers of those initial peers
-    for i in start_peers:
-        if not i:
-            start_peers.remove(i)
-        else:
-            pool.map(instances.add, i)
-    # for i in r.json():
-    #     instances.add(i)
-    instances = set(multi_filter(filters,instances)) # apply filters before we move to network
+    start_url = 'https://post.lurk.org'
+    activity = ''
+    peers_info ='/api/v1/instance/peers'
+
+    proxies = {
+    'http':'socks5://localhost:12345',
+    'http':'socks5://localhost:12345'}
+
+    pool = ThreadPool(512)
+
+    filters = [not_gab, only_netloc] #what to filter out
+
+    instances = set([])
+    r = requests.get(start_url+peers_info) # normal brain, initial peer list
+    if r. status_code == 200:
+        start_peers = pool.map(get_peers, r.json()) #expanding brain, get all peers of those initial peers
+        for i in start_peers:
+            if not i:
+                start_peers.remove(i)
+            else:
+                pool.map(instances.add, i)
+        # for i in r.json():
+        #     instances.add(i)
+        instances = set(multi_filter(filters,instances)) # apply filters before we move to network
+
+        network = pool.map(get_peers, instances) #galaxy brain, get all peers of all peers of the initial peers
+        for peer_list in network:
+            if peer_list:
+                for instance in peer_list:
+                    if not_gab(instance): #prevent gab.best subdomain enumeration
+                        instances.add(instance)
 
-    network = pool.map(get_peers, instances) #galaxy brain, get all peers of all peers of the initial peers
-    for peer_list in network:
-        if peer_list:
-            for instance in peer_list:
-                if not_gab(instance): #prevent gab.best subdomain enumeration
-                    instances.add(instance)
+        instances = set(multi_filter(filters,instances))
 
-    instances = set(multi_filter(filters,instances))
+        instance_info = pool.map(get_nodeinfo, instances)
 
-instance_info = pool.map(get_nodeinfo, instances)
+        scrape = {}
+        instances_list = list(instances)
 
-scrape = {}
-instances_list = list(instances)
+        for count, value in enumerate(instances_list):
+            scrape[value] = instance_info[count]
 
-for count, value in enumerate(instances_list):
-    scrape[value] = instance_info[count]
+        print('found {} instances'.format(len(scrape)))
 
-print('found {} instances'.format(len(scrape)))
+        pool.close()
+        pool.join()
 
-pool.close()
-pool.join()
+        with open('instance_scrape.json','w') as f:
+            f.write(json.dumps(scrape,indent=4))
+            #f.write('\n'.join(text)
 
-with open('instance_scrape.json','w') as f:
-    f.write(json.dumps(scrape,indent=4))
-    #f.write('\n'.join(text)
+if __name__ == '__main__':
+    fedicrawler()
\ No newline at end of file
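
As a rough usage sketch (not part of the patch): with the crawl wrapped in fedicrawler() behind an if __name__ == '__main__' guard, about_collector can import the module without triggering a crawl on import. This assumes fedicrawler.py keeps working imports for json, requests and ThreadPool; the read-back of instance_scrape.json below is only illustrative of how a caller might pick up the results.

    import json
    import fedicrawler

    # run the crawl; fedicrawler() writes its results to instance_scrape.json
    fedicrawler.fedicrawler()

    # load the scrape back in for whatever about_collector needs to do with it
    with open('instance_scrape.json') as f:
        scrape = json.load(f)  # dict mapping instance domain -> instance info / error

    print('{} instances available'.format(len(scrape)))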