doing applied fediverse research
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 

75 lines
2.5 KiB

#!/usr/bin/env python3
# a tool for collecting mastodon /about/ & ToS and CoC pages.
# (c) roel roscam abbing 2020
# gplv3
import os, time, json
from fedicrawler import only_netloc, multi_filter
# Filters handed to multi_filter() when the list of mastodon instances is built.
# NOTE(review): only_netloc comes from fedicrawler; presumably it keeps only
# bare host names -- confirm against that module.
filters = [only_netloc]

# Load the crawl results. The context manager closes the file handle right
# away, and the encoding is pinned so parsing does not depend on the locale.
with open('instance_scrape.json', encoding='utf-8') as scrape_file:
    scrape_data = json.load(scrape_file)

# Base directory for captured pages, and the sub-directory used for instances
# whose about page contains terms-of-service text.
output_dir = 'about_pages'
tos_dir = os.path.join(output_dir, 'with_tos')

# exist_ok avoids the check-then-create race of os.path.exists() + os.mkdir().
os.makedirs(output_dir, exist_ok=True)
os.makedirs(tos_dir, exist_ok=True)
def find_mastodon_instances(scrape_data):
    """Return the keys of scrape_data whose nodeinfo reports mastodon.

    scrape_data maps an instance identifier to a record. A record is kept
    when it has a 'nodeinfo' entry containing a 'software' entry whose
    'name' equals 'mastodon', compared case-insensitively. The collected
    keys are then passed through multi_filter() together with the
    module-level filters, and the result is returned as a list.
    """
    found = []
    for host in scrape_data:
        record = scrape_data[host]
        # Records without nodeinfo cannot be classified; skip them.
        if 'nodeinfo' not in record.keys():
            continue
        nodeinfo = record['nodeinfo']
        if 'software' not in nodeinfo.keys():
            continue
        software_name = nodeinfo['software']['name']
        if software_name.lower() == 'mastodon':
            found.append(host)
    return list(multi_filter(filters, found))
from selenium import webdriver # you need to set up a driver
from selenium.webdriver.firefox.options import Options
# Start a headless Firefox session (selenium must be able to find a driver).
options = Options()
options.add_argument('-headless')
browser = webdriver.Firefox(options=options)

# For every mastodon instance not captured yet: load /about/more/, save the
# terms-of-service HTML when present, then save a screenshot and the page
# source. The finally clause shuts the browser down even when the run is
# interrupted or an unexpected error escapes the loop.
try:
    for mi in find_mastodon_instances(scrape_data):
        # A screenshot in either directory means an earlier run handled it.
        if (os.path.exists(os.path.join(output_dir, mi + '.png'))
                or os.path.exists(os.path.join(tos_dir, mi + '.png'))):
            print(mi, '')
            continue

        try:
            print('🡓', mi)
            browser.get('https://{}/about/more/'.format(mi))
            # Locator strategies are given as plain strings (the values of
            # selenium's By constants) because the find_element_by_* helpers
            # were removed in Selenium 4.3; this form needs no extra import.
            page = browser.find_element('tag name', 'html')
            height = page.size['height']
            # Resize the window to the page height so the screenshot shows
            # the whole page rather than only the first viewport.
            browser.set_window_size(1400, height + 100)

            # Chosen afresh for every instance. Rebinding the global
            # output_dir here would leak one instance's choice into the next
            # iteration and into the "already captured" check above.
            target_dir = output_dir
            try:
                bw = browser.find_element('css selector', 'div .box-widget')
                about_text = bw.find_element('css selector', 'div .rich-formatting')
                html = browser.execute_script('return arguments[0].innerHTML;', about_text)
                if html:
                    with open(os.path.join(tos_dir, mi + '_tos.txt'), 'w', encoding='utf-8') as f:
                        f.write(html)
                    # Only file under with_tos once the text is really saved.
                    target_dir = tos_dir
            except Exception as e:
                # Best effort: a missing ToS box is expected on many
                # instances, so report it and carry on with the screenshot.
                print(e)

            time.sleep(0.1)
            print('📷', mi)
            browser.save_screenshot(os.path.join(target_dir, mi + '.png'))
            with open(os.path.join(target_dir, mi + '_about.html'), 'w', encoding='utf-8') as f:
                f.write(browser.page_source)
            print(mi, '')
        except Exception as e:
            # One unreachable or broken instance must not stop the crawl.
            print(e)
finally:
    browser.quit()