rra
7 years ago
1 changed files with 65 additions and 41 deletions
@ -1,53 +1,77 @@ |
|||
#!/bin/env python3 |
|||
# fediscraper v1 |
|||
# fediscraper v2 |
|||
|
|||
import json, requests,threading |
|||
import json, requests |
|||
from multiprocessing.dummy import Pool as ThreadPool |
|||
|
|||
# Base URL (scheme included) of the instance the crawl starts from.
start_url = 'https://post.lurk.org'

# Not referenced by the visible part of the script; kept in case other code reads it.
activity = ''

# API paths appended to an instance's base URL.
peers_info = '/api/v1/instance/peers'
instance_info = '/api/v1/instance'

# Thread-backed pool (multiprocessing.dummy) with 128 workers, used to run
# the per-instance HTTP requests concurrently.
pool = ThreadPool(128)
|||
|
|||
|
|||
def get_peers(instance, timeout=1):
    """Return the peers reported by `instance`, or None on any failure.

    Requests ``https://<instance>/api/v1/instance/peers`` and decodes the
    JSON body.

    Args:
        instance: Host name of the instance to query (no scheme).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of the string entries found in the response, or None when
        the request fails, the status code is not 200, the body is not
        valid JSON, or the decoded payload is not a list.
    """
    try:
        # Built inside the try so a non-string `instance` is reported as a
        # failure instead of aborting the whole pool.map() call.
        url = 'https://' + instance + '/api/v1/instance/peers'
        r = requests.get(url, timeout=timeout)
        if r.status_code != 200:
            print('fail: got {} on {}'.format(r.status_code, instance))
            return None
        peers = r.json()
    except Exception as e:
        # Deliberately broad: this runs as a best-effort worker, and one
        # misbehaving host must not take down the crawl.
        print('fail on', instance, e)
        return None

    if not isinstance(peers, list):
        print('fail: unexpected peers payload on', instance)
        return None

    # Keep only string entries so callers can safely hash them and build
    # URLs from them.
    peers = [peer for peer in peers if isinstance(peer, str)]
    print(instance, 'peers with', len(peers))
    return peers
|||
|
|||
def get_instance_info(instance, timeout=10):
    """Fetch the decoded ``/api/v1/instance`` JSON document for `instance`.

    Args:
        instance: Host name of the instance to query (no scheme).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON body of the response, or ``{'error': 'error'}``
        when the request or the JSON decoding fails for any reason.
    """
    # Local path constant, named so it does not shadow the module-level
    # `instance_info`.
    info_path = '/api/v1/instance'
    print('getting info for', instance)
    try:
        # URL construction is inside the try so a malformed `instance`
        # yields the error placeholder rather than raising.
        response = requests.get('https://' + instance + info_path, timeout=timeout)
        info = response.json()
        print('info request for {} succeeded'.format(instance))
    except Exception:
        # Deliberately broad: best-effort worker, a single bad host must
        # not abort the whole pool.map() call.
        print('failed to query instance info')
        info = {'error': 'error'}
    return info
|||
|
|||
|
|||
# Host names discovered during the crawl.
instances = set()

# Ask the start instance for its peers. The timeout keeps an unresponsive
# start host from hanging the script forever.
r = requests.get(start_url + peers_info, timeout=10)
if r.status_code == 200:
    print('200 for ' + start_url)
    peers = r.json()
    print('{} has {} peers'.format(start_url, len(peers)))

    # First hop: query every peer of the start instance for its own peers.
    # Results are merged in the main thread; get_peers returns None on
    # failure, and those entries are skipped rather than removed from the
    # list while it is being iterated.
    start_peers = pool.map(get_peers, peers)
    for peer_list in start_peers:
        if peer_list:
            instances.update(peer_list)

    # Second hop: query everything found so far. A snapshot list is passed
    # so the set can be extended afterwards.
    network = pool.map(get_peers, list(instances))
    for peer_list in network:
        if peer_list:
            instances.update(peer_list)
else:
    print('fail: got {} on {}'.format(r.status_code, start_url))

# Freeze the iteration order once so each info result is paired with the
# instance it was requested for.
instances_list = list(instances)
info_results = pool.map(get_instance_info, instances_list)

# Mapping of instance host name -> instance info (or error placeholder).
scrape = dict(zip(instances_list, info_results))

print('found {} instances'.format(len(scrape)))

pool.close()
pool.join()

# Write the mapping, not the `instances` set: a set is not JSON
# serialisable and carries none of the collected info.
with open('instance_scrape.json', 'w') as f:
    f.write(json.dumps(scrape, indent=4))
|||
|
Loading…
Reference in new issue