rra
7 years ago
1 changed files with 65 additions and 41 deletions
@ -1,53 +1,77 @@ |
|||||
#!/bin/env python3 |
#!/bin/env python3 |
||||
# fediscraper v1 |
# fediscraper v2 |
||||
|
|
||||
import json, requests,threading |
import json, requests |
||||
|
from multiprocessing.dummy import Pool as ThreadPool |
||||
|
|
||||
# --- Crawl configuration -------------------------------------------------
# Seed instance that the crawl starts from.
start_url = 'https://post.lurk.org'

# Mastodon API endpoint returning the list of peer hostnames.
peers_info = '/api/v1/instance/peers'

activity = ''

# Thread pool (multiprocessing.dummy = threads, suited to I/O-bound HTTP
# requests) used to fan out API calls across many instances at once.
pool = ThreadPool(128)
||||
def get_peers(instance):
    """Fetch the peer list of a fediverse instance.

    Queries the Mastodon peers endpoint of *instance* and returns the
    decoded JSON list of peer hostnames. Returns None when the server
    answers with a non-200 status, or when the request / JSON decoding
    raises (unreachable host, timeout, non-JSON body, ...).
    """
    try:
        resp = requests.get(
            'https://' + instance + '/api/v1/instance/peers', timeout=1)
        if resp.status_code != 200:
            print('fail: got {} on {}'.format(resp.status_code, instance))
            return
        peers = resp.json()
        print(instance, 'peers with', len(peers))
        return peers
    except Exception as err:
        # Failures are routine while crawling; report and carry on.
        print('fail on', instance, err)
        return
|
def get_instance_info(instance):
    """Fetch the /api/v1/instance metadata document for *instance*.

    Returns the decoded JSON payload, or the placeholder dict
    {'error': 'error'} when the request or JSON decoding fails.
    """
    print('getting info for', instance)
    endpoint = 'https://' + instance + '/api/v1/instance'
    try:
        response = requests.get(endpoint, timeout=10)
        payload = response.json()
        print('info request for {} succeeded'.format(instance))
        return payload
    except Exception:
        # Unreachable hosts and non-JSON error pages both land here.
        print('failed to query instance info')
        return {'error': 'error'}
|
|
||||
|
|
||||
|
# --- Main crawl ----------------------------------------------------------
# Discovered instance hostnames (deduplicated).
instances = set([])

# Seed the crawl: ask the start instance for its peer list.
r = requests.get(start_url + peers_info)
if r.status_code == 200:
    # First hop: fetch the peer lists of every peer of the seed instance,
    # in parallel. Failed fetches come back as None from get_peers.
    start_peers = pool.map(get_peers, r.json())

    # BUG FIX: the previous version called start_peers.remove(i) while
    # iterating start_peers, which mutates the list mid-iteration and
    # silently skips entries. Simply skip falsy results instead.
    for peer_list in start_peers:
        if peer_list:
            pool.map(instances.add, peer_list)

    # Second hop: fetch the peers of everything discovered so far and
    # fold any new hostnames into the set.
    network = pool.map(get_peers, instances)
    for peer_list in network:
        if peer_list:
            for instance in peer_list:
                instances.add(instance)

# Query instance metadata for every discovered host, in parallel.
instance_info = pool.map(get_instance_info, instances)

# pool.map preserves input order, so zipping the (unmodified) set with its
# mapped results pairs each hostname with its own info dict — clearer than
# indexing two parallel lists with enumerate.
scrape = dict(zip(instances, instance_info))

print('found {} instances'.format(len(scrape)))

pool.close()
pool.join()

# BUG FIX: the previous version dumped `instances`, which in v2 is a set
# and not JSON-serializable — json.dumps raised TypeError and the scraped
# data was lost. Write the assembled hostname -> info mapping instead.
with open('instance_scrape.json', 'w') as f:
    f.write(json.dumps(scrape, indent=4))
||||
|
Loading…
Reference in new issue