added new script to document mastodon about pages

5 years ago · c003a6ae96
2 changed files with 84 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1 +1,3 @@
 *.pyc
+about_pages/
+*.log
--- a/about_collector.py
+++ b/about_collector.py
@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+# a tool for collecting mastodon /about/ & ToS and CoC pages.
+# (c) roel roscam abbing 2020
+# gplv3
+import os, time, json
+
+# Load the scrape results; the context manager closes the file handle
+# instead of leaving it open for the lifetime of the script.
+with open('instance_scrape.json', encoding='utf-8') as scrape_file:
+    scrape_data = json.load(scrape_file)
+
+output_dir = 'about_pages'
+tos_dir = os.path.join(output_dir,'with_tos')
+
+# makedirs creates output_dir as the parent when needed, and still creates
+# tos_dir when output_dir is already there from an earlier run.
+os.makedirs(tos_dir, exist_ok=True)
+
+
+
+def find_mastodon_instances(scrape_data):
+    """Return the instances whose nodeinfo reports Mastodon as software.
+
+    Args:
+        scrape_data: mapping of instance key -> scrape record. Only
+            record['nodeinfo']['software']['name'] is inspected.
+
+    Returns:
+        A list with the keys of scrape_data, in iteration order, whose
+        software name equals 'mastodon' (compared case-insensitively).
+        Records where any level of that path is missing, or is not of the
+        expected type, are skipped instead of raising.
+    """
+    mastodon_instances = []
+
+    for instance, record in scrape_data.items():
+        try:
+            name = record['nodeinfo']['software']['name']
+        except (KeyError, TypeError):
+            # a key is missing, or one level is not a mapping (e.g. None)
+            continue
+        if isinstance(name, str) and name.lower() == 'mastodon':
+            mastodon_instances.append(instance)
+    return mastodon_instances
+
+
+def clear_cache(driver, timeout=60):
+    """Clear the cookies and cache for the ChromeDriver instance.
+
+    Loads Chrome's clear-browsing-data settings page, waits for the
+    clear-browsing button, clicks it and then waits until the button can
+    no longer be found.
+
+    Args:
+        driver: the webdriver instance to operate on.
+        timeout: value handed to WebDriverWait and shared by both waits
+            (presumably seconds -- confirm against the Selenium docs).
+
+    NOTE(review): WebDriverWait and get_clear_browsing_button are neither
+    imported nor defined in this script as committed, so calling this
+    function would raise NameError. Its only call site is commented out.
+    NOTE(review): the chrome:// URL looks Chrome-only while this script
+    drives Firefox -- confirm before re-enabling the call.
+    """
+    # navigate to the settings page
+    driver.get('chrome://settings/clearBrowserData')
+
+    # wait for the button to appear
+    wait = WebDriverWait(driver, timeout)
+    # the helper is passed uncalled and serves as the wait condition
+    wait.until(get_clear_browsing_button)
+
+    # click the button to clear the cache
+    get_clear_browsing_button(driver).click()
+
+    # wait for the button to be gone before returning
+    wait.until_not(get_clear_browsing_button)
+
+
+from selenium import webdriver # you need to set up a driver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.firefox.options import Options
+
+# Visit /about/more/ on every Mastodon instance found in the scrape with a
+# headless Firefox and save, per instance: the text of the rich-formatting
+# box (when present), a screenshot and the page source.
+options = Options()
+options.add_argument('-headless')
+
+browser = webdriver.Firefox(options=options)
+#browser.set_window_size(1024, 768) # set the window size that you need
+
+try:
+    for mi in find_mastodon_instances(scrape_data):
+        #clear_cache(browser)
+
+        # Chosen afresh for every instance so that one instance's ToS hit
+        # cannot redirect the files of the instances that follow it.
+        target_dir = output_dir
+
+        try:
+            browser.get('https://{}/about/more/'.format(mi))
+            # find_element(By..., ...) exists in both Selenium 3 and 4; the
+            # find_element_by_* helpers were removed in Selenium 4.3.
+            page = browser.find_element(By.TAG_NAME, 'html')
+            # grow the window to the document height so the screenshot can
+            # cover the whole page rather than only the first screen
+            height = page.size['height']
+            browser.set_window_size(1400, height+100)
+        except Exception as e:
+            # an instance that is offline or fails to load must not abort
+            # the whole run; nothing of it can be saved, so skip it
+            print(e)
+            continue
+
+        try:
+            bw = browser.find_element(By.CSS_SELECTOR, 'div .box-widget')
+            about_text = bw.find_element(By.CSS_SELECTOR, 'div .rich-formatting')
+            html = browser.execute_script('return arguments[0].innerHTML;', about_text)
+            if html:
+                target_dir = tos_dir
+            # explicit utf-8 so non-ASCII pages do not depend on the locale
+            with open(os.path.join(target_dir, mi+'_tos.txt'), 'w', encoding='utf-8') as f:
+                f.write(html)
+        except Exception as e:
+            # the box is missing on this page (or the write failed): file
+            # this instance under the plain output directory
+            target_dir = output_dir
+            print(e)
+
+        time.sleep(0.1)
+        print('taking screenshot of', mi)
+        browser.save_screenshot(os.path.join(target_dir, mi+'.png'))
+        with open(os.path.join(target_dir, mi+'_about.html'), 'w', encoding='utf-8') as f:
+            f.write(browser.page_source)
+finally:
+    # always shut the browser down, even when an instance raised, so no
+    # headless Firefox process is left behind
+    browser.quit()