@@ -1,20 +1,8 @@
 #!/bin/env python3
-# fedicrawler v4
+# fedicrawler v5

 import json, requests
 from multiprocessing.dummy import Pool as ThreadPool
 from urllib.parse import urlparse

-start_url = 'https://post.lurk.org'
-activity = ''
-peers_info ='/api/v1/instance/peers'
-
-proxies = {
-    'http':'socks5://localhost:12345',
-    'http':'socks5://localhost:12345'}
-
-pool = ThreadPool(512)
-
 def not_gab(instance):
     #gab does some weird stuff wrt enumerating subdomains
     #example: epa1pu1qcxxyzcxher0u.gab.best
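The first hunk drops the crawler's module-level configuration; it reappears inside fedicrawler() in the second hunk. Two details of the proxies mapping are easy to miss: the dict literal repeats the 'http' key, so the second entry silently overwrites the first (one of them was presumably meant to be 'https'), and no line shown in either hunk actually passes the mapping to requests. A minimal sketch of how it would be wired up, assuming a local SOCKS5 proxy and the requests[socks] extra; the 'https' key and the explicit proxies= argument are illustrative assumptions, not part of the diff:

import requests

# assumed proxy mapping: route plain and TLS traffic through the local
# SOCKS5 proxy (socks5:// URLs need the requests[socks] extra installed)
proxies = {
    'http': 'socks5://localhost:12345',
    'https': 'socks5://localhost:12345'}

# hypothetical call showing where the mapping would be applied; the hunks
# in this diff call requests.get() without a proxies argument
r = requests.get('https://post.lurk.org/api/v1/instance/peers',
                 proxies=proxies, timeout=10)
print(r.status_code, len(r.json()))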
@@ -117,43 +105,58 @@ def get_instance_info(instance):
         info = {'error': str(e)}
     return info

-filters = [not_gab, only_netloc] #what to filter out
+def fedicrawler():

-instances = set([])
-r = requests.get(start_url+peers_info) # normal brain, initial peer list
-if r. status_code == 200:
-    start_peers = pool.map(get_peers, r.json()) #expanding brain, get all peers of those initial peers
-    for i in start_peers:
-        if not i:
-            start_peers.remove(i)
-        else:
-            pool.map(instances.add, i)
-# for i in r.json():
-#     instances.add(i)
-    instances = set(multi_filter(filters,instances)) # apply filters before we move to network
+    start_url = 'https://post.lurk.org'
+    activity = ''
+    peers_info ='/api/v1/instance/peers'
+
+    proxies = {
+        'http':'socks5://localhost:12345',
+        'http':'socks5://localhost:12345'}
+
+    pool = ThreadPool(512)
+
+    filters = [not_gab, only_netloc] #what to filter out
+
+    instances = set([])
+    r = requests.get(start_url+peers_info) # normal brain, initial peer list
+    if r. status_code == 200:
+        start_peers = pool.map(get_peers, r.json()) #expanding brain, get all peers of those initial peers
+        for i in start_peers:
+            if not i:
+                start_peers.remove(i)
+            else:
+                pool.map(instances.add, i)
+    # for i in r.json():
+    #     instances.add(i)
+        instances = set(multi_filter(filters,instances)) # apply filters before we move to network

-network = pool.map(get_peers, instances) #galaxy brain, get all peers of all peers of the initial peers
-for peer_list in network:
-    if peer_list:
-        for instance in peer_list:
-            if not_gab(instance): #prevent gab.best subdomain enumeration
-                instances.add(instance)
+    network = pool.map(get_peers, instances) #galaxy brain, get all peers of all peers of the initial peers
+    for peer_list in network:
+        if peer_list:
+            for instance in peer_list:
+                if not_gab(instance): #prevent gab.best subdomain enumeration
+                    instances.add(instance)

-instances = set(multi_filter(filters,instances))
+    instances = set(multi_filter(filters,instances))

-instance_info = pool.map(get_nodeinfo, instances)
+    instance_info = pool.map(get_nodeinfo, instances)

-scrape = {}
-instances_list = list(instances)
+    scrape = {}
+    instances_list = list(instances)

-for count, value in enumerate(instances_list):
-    scrape[value] = instance_info[count]
+    for count, value in enumerate(instances_list):
+        scrape[value] = instance_info[count]

-print('found {} instances'.format(len(scrape)))
+    print('found {} instances'.format(len(scrape)))

-pool.close()
-pool.join()
+    pool.close()
+    pool.join()

-with open('instance_scrape.json','w') as f:
-    f.write(json.dumps(scrape,indent=4))
-    #f.write('\n'.join(text)
+    with open('instance_scrape.json','w') as f:
+        f.write(json.dumps(scrape,indent=4))
+        #f.write('\n'.join(text)
+
+if __name__ == '__main__':
+    fedicrawler()
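The new fedicrawler() body calls several helpers that sit outside both hunks: not_gab() and only_netloc() (used through the filters list and multi_filter()), get_peers(), and get_nodeinfo(), alongside the get_instance_info() visible in the second hunk's header. Their implementations are not shown in this diff; the following is a minimal sketch of how they could look, assuming the behaviour implied by the call sites: the filters are per-hostname predicates, get_peers() fetches /api/v1/instance/peers for one host and returns None on failure, and get_nodeinfo() returns per-instance metadata or an error marker. Names, endpoints and timeouts here are reconstructions, not the project's actual code.

import requests
from urllib.parse import urlparse

def not_gab(instance):
    # gab does some weird stuff wrt enumerating subdomains
    # example: epa1pu1qcxxyzcxher0u.gab.best
    # assumed behaviour: reject anything under gab.best or gab.com
    return not instance.lower().endswith(('gab.best', 'gab.com'))

def only_netloc(instance):
    # assumed behaviour: keep entries that are a bare hostname,
    # i.e. no path component once a scheme is prepended
    return not urlparse('https://' + instance).path.strip('/')

def multi_filter(filters, instances):
    # keep an instance only if every predicate in filters accepts it
    return [i for i in instances if all(f(i) for f in filters)]

def get_peers(instance):
    # fetch the peer list of a single instance; None signals failure,
    # matching the `if not i` check in fedicrawler()
    try:
        r = requests.get('https://{}/api/v1/instance/peers'.format(instance), timeout=10)
        return r.json() if r.status_code == 200 else None
    except (requests.RequestException, ValueError):
        return None

def get_nodeinfo(instance):
    # fetch per-instance metadata; on failure return an error marker,
    # mirroring the info = {'error': str(e)} context line above
    try:
        r = requests.get('https://{}/api/v1/instance'.format(instance), timeout=10)
        return r.json() if r.status_code == 200 else {'error': r.status_code}
    except (requests.RequestException, ValueError) as e:
        return {'error': str(e)}

With stand-ins like these filled in, the v5 script should run end to end: it walks the peer graph outward from post.lurk.org and writes the collected metadata to instance_scrape.json.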