
working on making the script importable by about_collector

master · rra committed 5 years ago · parent commit 65ddc49057
1 changed file: fedicrawler.py (91 lines changed)

@@ -1,20 +1,8 @@
 #!/bin/env python3
-# fedicrawler v4
-
+# fedicrawler v5
 import json, requests
 from multiprocessing.dummy import Pool as ThreadPool
 from urllib.parse import urlparse
-
-start_url = 'https://post.lurk.org'
-activity = ''
-peers_info ='/api/v1/instance/peers'
-
-proxies = {
- 'http':'socks5://localhost:12345',
- 'http':'socks5://localhost:12345'}
-
-pool = ThreadPool(512)
-
 def not_gab(instance):
     #gab does some weird stuff wrt enumerating subdomains
     #example: epa1pu1qcxxyzcxher0u.gab.best
@@ -117,43 +105,58 @@ def get_instance_info(instance):
         info = {'error': str(e)}
     return info
 
-filters = [not_gab, only_netloc] #what to filter out
-
-instances = set([])
-r = requests.get(start_url+peers_info) # normal brain, initial peer list
-if r. status_code == 200:
-    start_peers = pool.map(get_peers, r.json()) #expanding brain, get all peers of those initial peers
-    for i in start_peers:
-        if not i:
-            start_peers.remove(i)
-        else:
-            pool.map(instances.add, i)
-    # for i in r.json():
-    #     instances.add(i)
-    instances = set(multi_filter(filters,instances)) # apply filters before we move to network
-
-    network = pool.map(get_peers, instances) #galaxy brain, get all peers of all peers of the initial peers
-    for peer_list in network:
-        if peer_list:
-            for instance in peer_list:
-                if not_gab(instance): #prevent gab.best subdomain enumeration
-                    instances.add(instance)
-
-instances = set(multi_filter(filters,instances))
-
-instance_info = pool.map(get_nodeinfo, instances)
-
-scrape = {}
-instances_list = list(instances)
-
-for count, value in enumerate(instances_list):
-    scrape[value] = instance_info[count]
-
-print('found {} instances'.format(len(scrape)))
-
-pool.close()
-pool.join()
-
-with open('instance_scrape.json','w') as f:
-    f.write(json.dumps(scrape,indent=4))
-    #f.write('\n'.join(text)
+def fedicrawler():
+
+    start_url = 'https://post.lurk.org'
+    activity = ''
+    peers_info ='/api/v1/instance/peers'
+
+    proxies = {
+     'http':'socks5://localhost:12345',
+     'http':'socks5://localhost:12345'}
+
+    pool = ThreadPool(512)
+
+    filters = [not_gab, only_netloc] #what to filter out
+
+    instances = set([])
+    r = requests.get(start_url+peers_info) # normal brain, initial peer list
+    if r. status_code == 200:
+        start_peers = pool.map(get_peers, r.json()) #expanding brain, get all peers of those initial peers
+        for i in start_peers:
+            if not i:
+                start_peers.remove(i)
+            else:
+                pool.map(instances.add, i)
+        # for i in r.json():
+        #     instances.add(i)
+        instances = set(multi_filter(filters,instances)) # apply filters before we move to network
+
+        network = pool.map(get_peers, instances) #galaxy brain, get all peers of all peers of the initial peers
+        for peer_list in network:
+            if peer_list:
+                for instance in peer_list:
+                    if not_gab(instance): #prevent gab.best subdomain enumeration
+                        instances.add(instance)
+
+    instances = set(multi_filter(filters,instances))
+
+    instance_info = pool.map(get_nodeinfo, instances)
+
+    scrape = {}
+    instances_list = list(instances)
+
+    for count, value in enumerate(instances_list):
+        scrape[value] = instance_info[count]
+
+    print('found {} instances'.format(len(scrape)))
+
+    pool.close()
+    pool.join()
+
+    with open('instance_scrape.json','w') as f:
+        f.write(json.dumps(scrape,indent=4))
+        #f.write('\n'.join(text)
+
+if __name__ == '__main__':
+    fedicrawler()
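
With the crawl wrapped in fedicrawler() and the call placed behind the if __name__ == '__main__': guard, importing the module no longer starts a crawl or opens a thread pool, which is what makes it usable from about_collector. A minimal sketch of how a caller might use it; the caller filename and the step of reading instance_scrape.json back in are assumptions for illustration, not code from this repository:

# hypothetical caller, e.g. about_collector.py (not part of this commit)
import json

import fedicrawler  # importing the module alone no longer triggers any network activity

if __name__ == '__main__':
    fedicrawler.fedicrawler()  # run the crawl explicitly
    # per the diff, the crawl writes its results to instance_scrape.json in the working directory
    with open('instance_scrape.json') as f:
        scrape = json.load(f)
    print('loaded info for {} instances'.format(len(scrape)))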