@@ -1,20 +1,8 @@
 #!/bin/env python3
-# fedicrawler v4
+# fedicrawler v5
 
 import json, requests
 from multiprocessing.dummy import Pool as ThreadPool
 from urllib.parse import urlparse
 
-start_url = 'https://post.lurk.org'
-activity = ''
-peers_info ='/api/v1/instance/peers'
-
-proxies = {
-    'http':'socks5://localhost:12345',
-    'https':'socks5://localhost:12345'}
-
-pool = ThreadPool(512)
-
 def not_gab(instance):
     #gab does some weird stuff wrt enumerating subdomains
     #example: epa1pu1qcxxyzcxher0u.gab.best
@@ -117,6 +105,18 @@ def get_instance_info(instance):
         info = {'error': str(e)}
     return info
 
 def fedicrawler():
+
+    start_url = 'https://post.lurk.org'
+    activity = ''
+    peers_info ='/api/v1/instance/peers'
+
+    proxies = {
+        'http':'socks5://localhost:12345',
+        'https':'socks5://localhost:12345'}
+
+    pool = ThreadPool(512)
+
+
     filters = [not_gab, only_netloc] #what to filter out
     instances = set([])
@@ -157,3 +157,6 @@ pool.join()
     with open('instance_scrape.json','w') as f:
         f.write(json.dumps(scrape,indent=4))
         #f.write('\n'.join(text)
+
+if __name__ == '__main__':
+    fedicrawler()
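
Taken together, the v5 change is a restructuring rather than new behaviour: the module-level configuration (start URL, peers endpoint, SOCKS proxy settings, the 512-thread pool) moves inside fedicrawler(), and a __main__ guard is added at the bottom, so importing the module no longer spins up a thread pool or any other setup as a side effect. A minimal sketch of the two ways to run it after this change, assuming the file is saved as fedicrawler.py somewhere on the import path:

    # run as a script, as before:
    #   python3 fedicrawler.py
    # or import it without side effects and start the crawl explicitly:
    import fedicrawler
    fedicrawler.fedicrawler()  # writes instance_scrape.json to the working directory

One practical note on the proxies dict: requests only supports SOCKS proxies when PySocks is installed (pip install "requests[socks]"), and with the socks5:// scheme hostnames are resolved locally; socks5h://localhost:12345 would route DNS resolution through the proxy as well.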
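
The first hunk cuts off at the top of not_gab(), so only the function name and its two comments are visible here; later it is collected into filters next to only_netloc. As an illustration only (the real body is not part of this diff), a filter matching those comments could look something like:

    def not_gab(instance):
        #gab does some weird stuff wrt enumerating subdomains
        #example: epa1pu1qcxxyzcxher0u.gab.best
        host = instance.lower().rstrip('.')
        blocked = ('gab.com', 'gab.best')  # illustrative list, not taken from the script
        return not any(host == d or host.endswith('.' + d) for d in blocked)

i.e. instead of comparing full hostnames it checks the registered domain, which also catches the enumerated throwaway subdomains.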