directorate for applied fediverse research

fedicrawler.py 1.9KB

#!/usr/bin/env python3
# fediscraper v2
import json
import requests
from multiprocessing.dummy import Pool as ThreadPool

start_url = 'https://post.lurk.org'
peers_info = '/api/v1/instance/peers'

# thread pool (threads, not processes) for concurrent HTTP requests
pool = ThreadPool(256)

def get_peers(instance):
    # ask an instance for the list of domains it federates with
    try:
        r = requests.get('https://' + instance + peers_info, timeout=1)
        if r.status_code == 200:
            peers = r.json()
            print(instance, 'peers with', len(peers))
            return peers
        print('fail: got {} on {}'.format(r.status_code, instance))
    except Exception as e:
        print('fail on', instance, e)
    return None

def get_instance_info(instance):
    # ask an instance for its metadata (title, version, stats, ...)
    instance_info = '/api/v1/instance'
    print('getting info for', instance)
    try:
        ii = requests.get('https://' + instance + instance_info, timeout=10)
        info = ii.json()
        print('info request for {} succeeded'.format(instance))
    except Exception as e:
        print('failed to query instance info for', instance, e)
        info = {'error': 'error'}
    return info

instances = set()
r = requests.get(start_url + peers_info)
if r.status_code == 200:
    # first hop: the peer lists of the starting instance's peers
    start_peers = pool.map(get_peers, r.json())
    for peers in start_peers:
        if peers:
            instances.update(peers)
    # second hop: the peer lists of everything found so far
    network = pool.map(get_peers, instances)
    for peer_list in network:
        if peer_list:
            instances.update(peer_list)
    # query every discovered instance for its metadata, keeping list
    # order fixed so results can be zipped back to their domains
    instances_list = list(instances)
    instance_info = pool.map(get_instance_info, instances_list)
    scrape = dict(zip(instances_list, instance_info))
    print('found {} instances'.format(len(scrape)))
    pool.close()
    pool.join()
    with open('instance_scrape.json', 'w') as f:
        f.write(json.dumps(scrape, indent=4))
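
Once the crawl finishes, instance_scrape.json maps every discovered domain to the JSON returned by its /api/v1/instance endpoint, or to {'error': 'error'} where the query failed. A minimal sketch of consuming that file, assuming Mastodon-style metadata (the 'stats' object and its 'user_count' field are Mastodon API conventions; hosts running other fediverse software may not provide them):

#!/usr/bin/env python3
# summarise instance_scrape.json produced by fedicrawler.py
import json

with open('instance_scrape.json') as f:
    scrape = json.load(f)

# drop entries recorded as {'error': 'error'}; also guard against
# non-dict payloads, since the crawler stores whatever the host returned
reachable = {domain: info for domain, info in scrape.items()
             if isinstance(info, dict) and 'error' not in info}
print('{} instances total, {} reachable'.format(len(scrape), len(reachable)))

# ten largest instances by user count, where that field exists
def users(info):
    return (info.get('stats') or {}).get('user_count') or 0

for domain, info in sorted(reachable.items(), key=lambda kv: users(kv[1]),
                           reverse=True)[:10]:
    print(domain, users(info))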