#!/usr/bin/env python3
# a tool for collecting mastodon /about/ & ToS and CoC pages.
# (c) roel roscam abbing 2020
# gplv3
"""Save the ``/about/more/`` page of every Mastodon instance in a scrape file.

For each instance listed in ``instance_scrape.json`` whose nodeinfo reports
the software name ``mastodon``, the script opens
``https://<instance>/about/more/`` in headless Firefox and stores:

* ``<instance>.png``        -- a screenshot of the page,
* ``<instance>_about.html`` -- the page source,
* ``<instance>_tos.txt``    -- the inner HTML of the ``.rich-formatting``
  element inside ``.box-widget``, when that element exists and is non-empty.

Instances that yielded a ``_tos.txt`` file are stored under
``about_pages/with_tos``; all others under ``about_pages``.
"""

import json
import os
import time

SCRAPE_FILE = 'instance_scrape.json'
output_dir = 'about_pages'
tos_dir = os.path.join(output_dir, 'with_tos')


def find_mastodon_instances(scrape_data):
    """Return the keys of *scrape_data* whose nodeinfo names Mastodon.

    Args:
        scrape_data: mapping of instance name to a scrape record. A record
            counts as Mastodon when ``record['nodeinfo']['software']['name']``
            equals ``'mastodon'`` case-insensitively.

    Returns:
        A list of matching instance names, in the mapping's iteration order.
        Records with a missing or malformed ``nodeinfo``/``software``/``name``
        entry are skipped rather than raising.
    """
    mastodon_instances = []
    for instance, record in scrape_data.items():
        nodeinfo = record.get('nodeinfo') if isinstance(record, dict) else None
        software = nodeinfo.get('software') if isinstance(nodeinfo, dict) else None
        name = software.get('name') if isinstance(software, dict) else None
        if isinstance(name, str) and name.lower() == 'mastodon':
            mastodon_instances.append(instance)
    return mastodon_instances


def clear_cache(driver, timeout=60):
    """Clear the cookies and cache for the ChromeDriver instance.

    Not called by this script (the call in ``main`` is commented out). It
    navigates to a ``chrome://`` settings URL, whereas ``main`` starts
    Firefox.

    NOTE(review): ``get_clear_browsing_button`` is not defined in this file;
    it has to be supplied before this function can run.

    Args:
        driver: the WebDriver whose browsing data should be cleared.
        timeout: seconds to wait for the clear button to appear/disappear.
    """
    # Imported here so the module can be imported without selenium installed.
    from selenium.webdriver.support.ui import WebDriverWait

    # navigate to the settings page
    driver.get('chrome://settings/clearBrowserData')

    # wait for the button to appear
    wait = WebDriverWait(driver, timeout)
    wait.until(get_clear_browsing_button)

    # click the button to clear the cache
    get_clear_browsing_button(driver).click()

    # wait for the button to be gone before returning
    wait.until_not(get_clear_browsing_button)


def _capture_instance(browser, instance):
    """Visit one instance's about page and write its artefacts to disk."""
    from selenium.webdriver.common.by import By

    browser.get('https://{}/about/more/'.format(instance))

    # Grow the window to the full page height so the screenshot is not cut.
    page = browser.find_element(By.TAG_NAME, 'html')
    browser.set_window_size(1400, page.size['height'] + 100)

    # Start from the plain directory for EVERY instance; otherwise a previous
    # instance's "with_tos" choice would leak into this one.
    target_dir = output_dir
    try:
        box = browser.find_element(By.CSS_SELECTOR, 'div .box-widget')
        about_text = box.find_element(By.CSS_SELECTOR, 'div .rich-formatting')
        html = browser.execute_script(
            'return arguments[0].innerHTML;', about_text)
        if html:
            target_dir = tos_dir
            tos_path = os.path.join(target_dir, instance + '_tos.txt')
            with open(tos_path, 'w', encoding='utf-8') as f:
                f.write(html)
    except Exception as e:
        # Best effort: a page without the expected elements (or a failed
        # write) still gets a screenshot and HTML dump in the plain directory.
        target_dir = output_dir
        print(e)

    time.sleep(0.1)
    print('taking screenshot of', instance)
    browser.save_screenshot(os.path.join(target_dir, instance + '.png'))
    about_path = os.path.join(target_dir, instance + '_about.html')
    with open(about_path, 'w', encoding='utf-8') as f:
        f.write(browser.page_source)


def main():
    """Load the scrape file and capture every Mastodon instance found in it."""
    from selenium import webdriver  # you need to set up a driver
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.firefox.options import Options

    with open(SCRAPE_FILE, encoding='utf-8') as f:
        scrape_data = json.load(f)

    # tos_dir lives inside output_dir, so this creates both.
    os.makedirs(tos_dir, exist_ok=True)

    options = Options()
    options.add_argument('-headless')
    browser = webdriver.Firefox(options=options)
    try:
        for instance in find_mastodon_instances(scrape_data):
            #clear_cache(browser)
            try:
                _capture_instance(browser, instance)
            except WebDriverException as e:
                # One unreachable instance must not abort the whole run.
                print('skipping', instance, e)
    finally:
        # Always shut the browser down, even if the run is interrupted.
        browser.quit()


if __name__ == '__main__':
    main()