crawler now scrapes in parallel threads
parent 776ac11b52
commit 09d76040eb

fedicrawler.py (102 lines changed)
@@ -1,53 +1,77 @@
 #!/bin/env python3
-# fediscraper v1
+# fediscraper v2
 
-import json, requests,threading
+import json, requests
+from multiprocessing.dummy import Pool as ThreadPool
 
 start_url = 'https://post.lurk.org'
 activity = ''
 peers_info = '/api/v1/instance/peers'
-instance_info = '/api/v1/instance'
+pool = ThreadPool(128)
 
 
 def get_peers(instance):
-    r = requests.get('https://'+instance+'/api/v1/instance/peers', timeout=10)
-    if r.status_code == 200:
-        peers = r.json()
-        print(instance, 'peers with', len(peers))
-        return(peers)
-    else:
-        print('fail: got {} on {}'.format(r.status_code, instance))
+    try:
+        r = requests.get('https://'+instance+'/api/v1/instance/peers', timeout=1)
+        if r.status_code == 200:
+            peers = r.json()
+            print(instance, 'peers with', len(peers))
+            return peers
+        else:
+            print('fail: got {} on {}'.format(r.status_code, instance))
+            return
+    except Exception as e:
+        print('fail on',instance, e)
+        # print(e)
+        return
 
-instances = {}
+def get_instance_info(instance):
+    instance_info = '/api/v1/instance'
+    print('getting info for', instance)
+    try:
+        ii = requests.get('https://'+instance+instance_info, timeout=10)
+        info = ii.json()
+        print('info request for {} succeeded'.format(instance))
+    except Exception as e:
+        print('failed to query instance info')
+        # print(e)
+        info = {'error': 'error'}
+    return info
+
+
+instances = set([])
 
 r = requests.get(start_url+peers_info)
 if r. status_code == 200:
-    print('200 for '+start_url)
-    peers = r.json()
-    print('{} has {} peers'.format(start_url, len(peers)))
-    for count, peer in enumerate(peers):
-        #instances.add(peer)
-        try:
-            peer_list = get_peers(peer)
-            if peer_list:
-                for i in peer_list:
-                    if i not in instances:
-                        try:
-                            ii = requests.get('https://'+i+instance_info, timeout=10)
-                            info = ii.json()
-                        except Exception as e:
-                            print('failed to query instance info')
-                            print(e)
-                            info = 'error'
-                            pass
-                        instances[i] = info
-                        print('added {}, n={}'.format(i,len(instances)))
-            else:
-                print(i.status_code, 'on', peer)
-        except Exception as e:
-            print('failure for', peer)
-            # instances[peer] = {'error':e}
-            print(e)
+    start_peers = pool.map(get_peers, r.json())
+    for i in start_peers:
+        if not i:
+            start_peers.remove(i)
+        else:
+            pool.map(instances.add, i)
 
-#text = list(filter(None.__ne__, instances))
+    network = pool.map(get_peers, instances)
+    for peer_list in network:
+        if peer_list:
+            for instance in peer_list:
+                instances.add(instance)
+
+    instance_info = pool.map(get_instance_info, instances)
+
+    scrape = {}
+
+    instances_list = list(instances)
+
+    for count, value in enumerate(instances_list):
+        scrape[value] = instance_info[count]
+
+    print('found {} instances'.format(len(scrape)))
+
+    pool.close()
+    pool.join()
 
 with open('instance_scrape.json','w') as f:
     f.write(json.dumps(instances,indent=4))
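
What the commit does, in short: instead of walking peers one request at a time, v2 builds a multiprocessing.dummy thread pool (128 workers) and maps get_peers / get_instance_info over the whole host list at once. Below is a minimal, self-contained sketch of that fan-out pattern, separate from the committed file; the seed host, pool size and timeout are illustrative assumptions, not values the commit prescribes.

#!/bin/env python3
# Sketch of the thread-pool fan-out used above (not the committed file):
# map a network-bound function over many hosts instead of looping serially.
# Seed host, pool size and timeout here are assumptions for illustration.
import requests
from multiprocessing.dummy import Pool as ThreadPool  # threads, multiprocessing API

SEED = 'post.lurk.org'
PEERS_ENDPOINT = '/api/v1/instance/peers'

def get_peers(host):
    # Return the peer list for one instance, or None on any failure,
    # so a dead host costs one timeout instead of stalling the crawl.
    try:
        r = requests.get('https://' + host + PEERS_ENDPOINT, timeout=5)
        if r.status_code == 200:
            return r.json()
    except Exception as e:
        print('fail on', host, e)
    return None

if __name__ == '__main__':
    pool = ThreadPool(16)
    seed_peers = get_peers(SEED) or []
    # pool.map blocks until every worker has returned; results keep the
    # same order as the input list.
    results = pool.map(get_peers, seed_peers)
    pool.close()
    pool.join()
    known = set(seed_peers)
    for peer_list in results:
        if peer_list:
            known.update(peer_list)
    print('found', len(known), 'instances')

Because pool.map blocks until the slowest worker finishes, the short timeout=1 that v2 puts on get_peers is what keeps one unresponsive host from holding up an entire batch of 128 requests.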
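The final write still dumps instances, which v2 now keeps as a set; json.dumps cannot encode a set, and the per-host details are collected in the scrape dict. A small sketch of the write step, assuming scrape is the intended output (the sample data is a placeholder):

import json

# Placeholder standing in for the crawl results gathered above.
scrape = {'post.lurk.org': {'uri': 'post.lurk.org', 'title': 'lurk'}}

with open('instance_scrape.json', 'w') as f:
    # Dump the dict of instance info; a bare set would first need
    # list()/sorted(), since json.dumps raises TypeError on sets.
    f.write(json.dumps(scrape, indent=4))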