From 4cac59e445b976b12908037eb643386f09662268 Mon Sep 17 00:00:00 2001
From: rra
Date: Wed, 30 May 2018 10:27:02 +0200
Subject: [PATCH] crawler now looks for instance metadata, started to abstract collection into functions

---
Review notes (this section is ignored when the patch is applied):
 * The added call `json.dumps(instances,f)` was replaced with
   `json.dump(instances, f)`. `json.dumps` accepts a single positional
   argument and returns a string, so the original call raised TypeError
   and never wrote anything to instance_scrape.json.
 * NOTE(review): the unchanged `else:` branch still prints
   `i.status_code`, but `i` is a hostname string (or unbound) at that
   point; the resulting error is swallowed by the surrounding
   `except Exception`. Left as context here; needs a follow-up change.

 fedicrawler.py | 42 +++++++++++++++++++++++++++++-------------
 1 file changed, 29 insertions(+), 13 deletions(-)

diff --git a/fedicrawler.py b/fedicrawler.py
index 8c311e0..983cb89 100644
--- a/fedicrawler.py
+++ b/fedicrawler.py
@@ -1,30 +1,45 @@
 #!/bin/env python3
 # fediscraper v1
 
-import json, requests
-
+import json, requests,threading
 
 start_url = 'https://post.lurk.org'
 activity = ''
 peers_info = '/api/v1/instance/peers'
 instance_info = '/api/v1/instance'
 
-instances = {set([])}
+def get_peers(instance):
+	r = requests.get('https://'+instance+'/api/v1/instance/peers', timeout=10)
+	if r.status_code == 200:
+		peers = r.json()
+		print(instance, 'peers with', len(peers))
+		return(peers)
+	else:
+		print('fail: got {} on {}'.format(r.status_code, instance))
+
+instances = {}
 r = requests.get(start_url+peers_info)
 if r. status_code == 200:
 	print('200 for '+start_url)
 	peers = r.json()
 	print('{} has {} peers'.format(start_url, len(peers)))
 	for count, peer in enumerate(peers):
-		instances.add(peer)
+		#instances.add(peer)
 		try:
-			r = requests.get('https://'+peer+peers_info, timeout=10)
-			if r.status_code == 200:
-				print(peer, 'peers with', len(r.json()))
-				for i in r.json():
+			peer_list = get_peers(peer)
+			if peer_list:
+				for i in peer_list:
 					if i not in instances:
-						instances.add(i)
-						print('added {}, n={}'.format(i,len(instances)))
+						try:
+							ii = requests.get('https://'+i+instance_info, timeout=10)
+							info = ii.json()
+						except Exception as e:
+							print('failed to query instance info')
+							print(e)
+							info = 'error'
+							pass
+						instances[i] = info
+						print('added {}, n={}'.format(i,len(instances)))
 			else:
 				print(i.status_code, 'on', peer)
 		except Exception as e:
@@ -32,7 +47,8 @@ if r. status_code == 200:
 		#	instances[peer] = {'error':e}
 			print(e)
 
-text = list(filter(None.__ne__, instances))
+#text = list(filter(None.__ne__, instances))
 
-with open('instance_scrape.txt','w') as f:
-	f.write('\n'.join(text))
\ No newline at end of file
+with open('instance_scrape.json','w') as f:
+	json.dump(instances, f)
+	#f.write('\n'.join(text)
\ No newline at end of file