#!/usr/bin/env python3
# a tool for collecting mastodon /about/ & ToS and CoC pages.
# (c) roel roscam abbing 2020
# gplv3
#
# NOTE: the output directory (about_pages/) and *.log files are listed in
# .gitignore, so nothing this script produces is meant to be committed.
"""Collect the ``/about/more`` page of every Mastodon instance in a scrape.

For each instance found in ``instance_scrape.json`` the script stores a
full-height screenshot and the page source. When the page contains a
rich-text "about" section (terms of service / code of conduct), its HTML is
stored too and all files for that instance go to ``about_pages/with_tos``
instead of ``about_pages``.
"""
import json
import os
import time

SCRAPE_FILE = 'instance_scrape.json'

output_dir = 'about_pages'
tos_dir = os.path.join(output_dir, 'with_tos')


def find_mastodon_instances(scrape_data):
    """Return the keys of *scrape_data* whose software is Mastodon.

    Args:
        scrape_data: dict keyed by instance host name. A record counts as a
            Mastodon instance when ``record['nodeinfo']['software']['name']``
            equals ``'mastodon'`` (case-insensitive).

    Returns:
        list: matching keys, in the iteration order of *scrape_data*.

    Records that lack any of the nested keys, or that hold a non-dict /
    non-string value at any level (e.g. ``"nodeinfo": null``), are skipped
    instead of raising.
    """
    mastodon_instances = []

    for host, record in scrape_data.items():
        nodeinfo = record.get('nodeinfo') if isinstance(record, dict) else None
        software = (
            nodeinfo.get('software') if isinstance(nodeinfo, dict) else None
        )
        name = software.get('name') if isinstance(software, dict) else None
        if isinstance(name, str) and name.lower() == 'mastodon':
            mastodon_instances.append(host)

    return mastodon_instances


def clear_cache(driver, timeout=60):
    """Clear the cookies and cache for the ChromeDriver instance.

    Args:
        driver: a Selenium Chrome driver.
        timeout: seconds to wait for the "clear data" button to appear and,
            after clicking it, to disappear again.

    NOTE(review): this is Chrome-only (it navigates to ``chrome://settings``)
    while the script drives Firefox, and nothing calls it at the moment. It
    also depends on a ``get_clear_browsing_button(driver)`` helper that is
    not defined in this file, so it cannot run until that helper is supplied.
    """
    from selenium.webdriver.support.ui import WebDriverWait

    # navigate to the settings page
    driver.get('chrome://settings/clearBrowserData')

    # wait for the button to appear
    wait = WebDriverWait(driver, timeout)
    wait.until(get_clear_browsing_button)

    # click the button to clear the cache
    get_clear_browsing_button(driver).click()

    # wait for the button to be gone before returning
    wait.until_not(get_clear_browsing_button)


def main():
    """Visit every Mastodon instance and save screenshot, source and ToS."""
    # Imported here rather than at module level so that the helpers above
    # can be imported without selenium or a browser driver being available.
    from selenium import webdriver  # you need to set up a driver
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.firefox.options import Options

    with open(SCRAPE_FILE, encoding='utf-8') as f:
        scrape_data = json.load(f)

    # Creates about_pages/ and about_pages/with_tos/ in one go and is a no-op
    # when they already exist (e.g. when re-running after a partial scrape).
    os.makedirs(tos_dir, exist_ok=True)

    options = Options()
    options.add_argument('-headless')
    browser = webdriver.Firefox(options=options)

    try:
        for mi in find_mastodon_instances(scrape_data):
            try:
                browser.get('https://{}/about/more/'.format(mi))
                # Grow the window to the full page height so the screenshot
                # captures the whole page and not only the first viewport.
                page = browser.find_element(By.TAG_NAME, 'html')
                height = page.size['height']
                browser.set_window_size(1400, height + 100)
            except WebDriverException as e:
                # An unreachable or broken instance must not end the run.
                print('could not load', mi, e)
                continue

            # Decided afresh for every instance so that the outcome of the
            # previous instance never leaks into this one.
            target_dir = output_dir
            html = None
            try:
                bw = browser.find_element(By.CSS_SELECTOR, 'div .box-widget')
                about_text = bw.find_element(
                    By.CSS_SELECTOR, 'div .rich-formatting'
                )
                html = browser.execute_script(
                    'return arguments[0].innerHTML;', about_text
                )
            except WebDriverException as e:
                # No about/ToS section on this page: keep the default dir.
                print(e)

            if html:
                target_dir = tos_dir
                tos_path = os.path.join(target_dir, mi + '_tos.txt')
                with open(tos_path, 'w', encoding='utf-8') as f:
                    f.write(html)

            time.sleep(0.1)
            print('taking screenshot of', mi)
            browser.save_screenshot(os.path.join(target_dir, mi + '.png'))
            about_path = os.path.join(target_dir, mi + '_about.html')
            with open(about_path, 'w', encoding='utf-8') as f:
                f.write(browser.page_source)
    finally:
        # Always shut the headless browser down, even after a crash.
        browser.quit()


if __name__ == '__main__':
    main()