Browse Source

working on making the script importable by about_collector

master
rra 5 years ago
parent
commit
65ddc49057
  1. 29
      fedicrawler.py

29
fedicrawler.py

@@ -1,20 +1,8 @@
#!/bin/env python3 #!/bin/env python3
# fedicrawler v4 # fedicrawler v5
import json, requests
from multiprocessing.dummy import Pool as ThreadPool
from urllib.parse import urlparse from urllib.parse import urlparse
start_url = 'https://post.lurk.org'
activity = ''
peers_info ='/api/v1/instance/peers'
proxies = {
'http':'socks5://localhost:12345',
'http':'socks5://localhost:12345'}
pool = ThreadPool(512)
def not_gab(instance): def not_gab(instance):
#gab does some weird stuff wrt enumerating subdomains #gab does some weird stuff wrt enumerating subdomains
#example: epa1pu1qcxxyzcxher0u.gab.best #example: epa1pu1qcxxyzcxher0u.gab.best
@@ -117,6 +105,18 @@ def get_instance_info(instance):
info = {'error': str(e)} info = {'error': str(e)}
return info return info
def fedicrawler():
start_url = 'https://post.lurk.org'
activity = ''
peers_info ='/api/v1/instance/peers'
proxies = {
'http':'socks5://localhost:12345',
'http':'socks5://localhost:12345'}
pool = ThreadPool(512)
filters = [not_gab, only_netloc] #what to filter out filters = [not_gab, only_netloc] #what to filter out
instances = set([]) instances = set([])
@@ -157,3 +157,6 @@ pool.join()
with open('instance_scrape.json','w') as f: with open('instance_scrape.json','w') as f:
f.write(json.dumps(scrape,indent=4)) f.write(json.dumps(scrape,indent=4))
#f.write('\n'.join(text) #f.write('\n'.join(text)
if __name__ == '__main__':
fedicrawler()
Loading…
Cancel
Save