From 09d76040ebdd4b633efb35bd367671f4234ceb61 Mon Sep 17 00:00:00 2001
From: rra
Date: Thu, 7 Jun 2018 23:46:36 +0200
Subject: [PATCH] crawler now scrapes in parallel threads

---
 fedicrawler.py | 103 +++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 61 insertions(+), 42 deletions(-)

diff --git a/fedicrawler.py b/fedicrawler.py
index a6ec2a1..e6ab61c 100644
--- a/fedicrawler.py
+++ b/fedicrawler.py
@@ -1,53 +1,72 @@
 #!/bin/env python3
-# fediscraper v1
+# fediscraper v2
 
-import json, requests,threading
+import json, requests
+from multiprocessing.dummy import Pool as ThreadPool
 
 start_url = 'https://post.lurk.org'
 activity = ''
 peers_info = '/api/v1/instance/peers'
-instance_info = '/api/v1/instance'
+
+# crawling is network-bound, so a large pool of threads is cheap parallelism
+pool = ThreadPool(128)
 
 def get_peers(instance):
-    r = requests.get('https://'+instance+'/api/v1/instance/peers', timeout=10)
-    if r.status_code == 200:
-        peers = r.json()
-        print(instance, 'peers with', len(peers))
-        return(peers)
-    else:
-        print('fail: got {} on {}'.format(r.status_code, instance))
-
-instances = {}
+    # short timeout so one unreachable instance cannot stall a worker thread
+    try:
+        r = requests.get('https://'+instance+'/api/v1/instance/peers', timeout=1)
+        if r.status_code == 200:
+            peers = r.json()
+            print(instance, 'peers with', len(peers))
+            return peers
+        else:
+            print('fail: got {} on {}'.format(r.status_code, instance))
+            return
+    except Exception as e:
+        print('fail on', instance, e)
+        return
+
+def get_instance_info(instance):
+    # fetch the instance metadata from the Mastodon API
+    instance_info = '/api/v1/instance'
+    print('getting info for', instance)
+    try:
+        ii = requests.get('https://'+instance+instance_info, timeout=10)
+        info = ii.json()
+        print('info request for {} succeeded'.format(instance))
+    except Exception as e:
+        print('failed to query instance info', e)
+        info = {'error': 'error'}
+    return info
+
+
+instances = set()
 r = requests.get(start_url+peers_info)
-if r. status_code == 200:
-    print('200 for '+start_url)
-    peers = r.json()
-    print('{} has {} peers'.format(start_url, len(peers)))
-    for count, peer in enumerate(peers):
-        #instances.add(peer)
-        try:
-            peer_list = get_peers(peer)
-            if peer_list:
-                for i in peer_list:
-                    if i not in instances:
-                        try:
-                            ii = requests.get('https://'+i+instance_info, timeout=10)
-                            info = ii.json()
-                        except Exception as e:
-                            print('failed to query instance info')
-                            print(e)
-                            info = 'error'
-                            pass
-                        instances[i] = info
-                        print('added {}, n={}'.format(i,len(instances)))
-            else:
-                print(i.status_code, 'on', peer)
-        except Exception as e:
-            print('failure for', peer)
-            # instances[peer] = {'error':e}
-            print(e)
-
-#text = list(filter(None.__ne__, instances))
+if r.status_code == 200:
+    # first pass: fetch the peer lists of the start instance's peers in parallel
+    start_peers = pool.map(get_peers, r.json())
+    for peer_list in start_peers:
+        if peer_list:
+            instances.update(peer_list)
+
+    # second pass: fetch the peers of everything found so far
+    network = pool.map(get_peers, instances)
+    for peer_list in network:
+        if peer_list:
+            for instance in peer_list:
+                instances.add(instance)
+
+
+# fix the iteration order so results can be zipped back to instance names
+instances_list = list(instances)
+instance_info = pool.map(get_instance_info, instances_list)
+
+scrape = dict(zip(instances_list, instance_info))
+
+print('found {} instances'.format(len(scrape)))
+
+pool.close()
+pool.join()
 
 with open('instance_scrape.json','w') as f:
-    f.write(json.dumps(instances,indent=4))
+    f.write(json.dumps(scrape,indent=4))
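
Note: the core pattern this patch adopts is multiprocessing.dummy.Pool, which
exposes the multiprocessing.Pool API but backs it with threads rather than
processes, a good fit for network-bound work like these HTTP requests. Below is
a minimal standalone sketch of the same fan-out-and-merge pattern, not the
patch itself; apart from post.lurk.org, the seed host list is hypothetical.

#!/bin/env python3
# minimal sketch of the thread-pool fan-out used above: map a network-bound
# function over many hosts with worker threads and merge the results.
import requests
from multiprocessing.dummy import Pool as ThreadPool  # threads, not processes

def fetch_peers(host):
    # return the peer list for one instance, or None on any failure
    try:
        r = requests.get('https://' + host + '/api/v1/instance/peers', timeout=1)
        return r.json() if r.status_code == 200 else None
    except Exception:
        return None

if __name__ == '__main__':
    hosts = ['post.lurk.org', 'example.social']  # example.social is hypothetical
    pool = ThreadPool(16)
    results = pool.map(fetch_peers, hosts)  # blocks until every worker returns
    pool.close()
    pool.join()
    found = set()
    for peers in results:
        if peers:
            found.update(peers)
    print('found', len(found), 'peers')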