
crawler now scrapes in parallel threads

master
rra committed 7 years ago
commit 09d76040eb
106  fedicrawler.py
@@ -1,53 +1,77 @@
 #!/bin/env python3
-# fediscraper v1
-import json, requests,threading
+# fediscraper v2
+import json, requests
+from multiprocessing.dummy import Pool as ThreadPool
 
 start_url = 'https://post.lurk.org'
 activity = ''
 peers_info = '/api/v1/instance/peers'
-instance_info = '/api/v1/instance'
+
+# a pool of worker threads; the requests are I/O-bound, so threads overlap the waiting
+pool = ThreadPool(128)
 
 def get_peers(instance):
-    r = requests.get('https://'+instance+'/api/v1/instance/peers', timeout=10)
-    if r.status_code == 200:
-        peers = r.json()
-        print(instance, 'peers with', len(peers))
-        return(peers)
-    else:
-        print('fail: got {} on {}'.format(r.status_code, instance))
-
-instances = {}
+    # return the instance's peer list, or None on any failure
+    try:
+        r = requests.get('https://'+instance+'/api/v1/instance/peers', timeout=1)
+        if r.status_code == 200:
+            peers = r.json()
+            print(instance, 'peers with', len(peers))
+            return peers
+        else:
+            print('fail: got {} on {}'.format(r.status_code, instance))
+            return
+    except Exception as e:
+        print('fail on', instance, e)
+        return
+
+def get_instance_info(instance):
+    # query the instance metadata endpoint
+    instance_info = '/api/v1/instance'
+    print('getting info for', instance)
+    try:
+        ii = requests.get('https://'+instance+instance_info, timeout=10)
+        info = ii.json()
+        print('info request for {} succeeded'.format(instance))
+    except Exception as e:
+        print('failed to query instance info')
+        info = {'error': 'error'}
+    return info
+
+instances = set()
 r = requests.get(start_url+peers_info)
 if r.status_code == 200:
-    print('200 for '+start_url)
-    peers = r.json()
-    print('{} has {} peers'.format(start_url, len(peers)))
-    for count, peer in enumerate(peers):
-        #instances.add(peer)
-        try:
-            peer_list = get_peers(peer)
-            if peer_list:
-                for i in peer_list:
-                    if i not in instances:
-                        try:
-                            ii = requests.get('https://'+i+instance_info, timeout=10)
-                            info = ii.json()
-                        except Exception as e:
-                            print('failed to query instance info')
-                            print(e)
-                            info = 'error'
-                            pass
-                        instances[i] = info
-                        print('added {}, n={}'.format(i,len(instances)))
-            else:
-                print(i.status_code, 'on', peer)
-        except Exception as e:
-            print('failure for', peer)
-            # instances[peer] = {'error':e}
-            print(e)
+    # first hop: fetch the peer lists of the start instance's peers in parallel
+    start_peers = pool.map(get_peers, r.json())
+    for i in start_peers:
+        if i:
+            instances.update(i)
+
+    # second hop: fetch the peer lists of everything found so far
+    network = pool.map(get_peers, instances)
+    for peer_list in network:
+        if peer_list:
+            for instance in peer_list:
+                instances.add(instance)
+
+    # query instance metadata in parallel; map() preserves input order
+    instance_info = pool.map(get_instance_info, instances)
+
+    scrape = {}
+    instances_list = list(instances)
+    for count, value in enumerate(instances_list):
+        scrape[value] = instance_info[count]
+
+    print('found {} instances'.format(len(scrape)))
+
+    pool.close()
+    pool.join()
 
-#text = list(filter(None.__ne__, instances))
 with open('instance_scrape.json','w') as f:
-    f.write(json.dumps(instances,indent=4))
+    # json can't serialize a set, so dump the host -> info dict instead
+    f.write(json.dumps(scrape, indent=4))
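The change above rests on one pattern: multiprocessing.dummy exposes the Pool API backed by threads rather than processes, which suits these I/O-bound HTTP calls. Below is a minimal standalone sketch of that pattern; the fetch_peers helper and the seed hostnames are illustrative, not taken from fedicrawler.py.

# Minimal sketch of the thread-pool pattern the commit adopts
# (illustrative names; not part of fedicrawler.py itself).
import requests
from multiprocessing.dummy import Pool as ThreadPool  # threads, not processes

def fetch_peers(host):
    # one request per host; return None on failure so callers can filter
    try:
        r = requests.get('https://' + host + '/api/v1/instance/peers', timeout=1)
        return r.json() if r.status_code == 200 else None
    except Exception:
        return None

hosts = ['post.lurk.org', 'example.social']  # hypothetical seed list
with ThreadPool(16) as pool:
    # map() blocks until all workers finish and preserves input order,
    # so results[i] is the peer list (or None) for hosts[i]
    results = pool.map(fetch_peers, hosts)

peers = set()
for peer_list in filter(None, results):
    peers.update(peer_list)
print('collected', len(peers), 'unique peers')

The order-preserving behaviour of map() is also what lets the script pair list(instances) with the instance_info results by index when it builds the scrape dict.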
