Now continues where it left off last time.

This commit is contained in:
rra 2020-05-05 16:28:04 +02:00
parent 1b1e5b1e52
commit 27a6fb1a0a

View File

@ -3,6 +3,9 @@
# (c) roel roscam abbing 2020 # (c) roel roscam abbing 2020
# gplv3 # gplv3
import os, time, json import os, time, json
import fedicrawler
filters = [fedicrawler.only_netloc]
scrape_data = json.loads(open('instance_scrape.json').read()) scrape_data = json.loads(open('instance_scrape.json').read())
@ -14,8 +17,6 @@ if not os.path.exists(output_dir):
if not os.path.exists(tos_dir): if not os.path.exists(tos_dir):
os.mkdir(tos_dir) os.mkdir(tos_dir)
def find_mastodon_instances(scrape_data): def find_mastodon_instances(scrape_data):
mastodon_instances = [] mastodon_instances = []
@ -24,25 +25,11 @@ def find_mastodon_instances(scrape_data):
if 'software' in scrape_data[i]['nodeinfo'].keys(): if 'software' in scrape_data[i]['nodeinfo'].keys():
if scrape_data[i]['nodeinfo']['software']['name'].lower() == 'mastodon': if scrape_data[i]['nodeinfo']['software']['name'].lower() == 'mastodon':
mastodon_instances.append(i) mastodon_instances.append(i)
#mastodon_instances = fedicrawler.multi_filter(filters,mastodon_instances)
return mastodon_instances return mastodon_instances
def clear_cache(driver, timeout=60):
    """Wipe cookies and cached data in a ChromeDriver session.

    Opens Chrome's clear-browsing-data settings page, waits up to
    *timeout* seconds for the confirm button, clicks it, and returns
    only once the button has disappeared again.
    """
    # Chrome only exposes cache clearing through its settings UI.
    driver.get('chrome://settings/clearBrowserData')
    waiter = WebDriverWait(driver, timeout)
    # The confirm button must be present before it can be pressed.
    waiter.until(get_clear_browsing_button)
    get_clear_browsing_button(driver).click()
    # The button vanishing signals that the clear operation finished.
    waiter.until_not(get_clear_browsing_button)
from selenium import webdriver # you need to set up a driver from selenium import webdriver # you need to set up a driver
from selenium.webdriver.firefox.options import Options from selenium.webdriver.firefox.options import Options
@ -53,30 +40,33 @@ browser = webdriver.Firefox(options=options)
#browser.set_window_size(1024, 768) # set the window size that you need #browser.set_window_size(1024, 768) # set the window size that you need
# Screenshot the /about/more page of every Mastodon instance found in the
# scrape data.  Instances that already have a screenshot (in either the
# default output dir or the ToS dir) are skipped, so an interrupted run
# continues where it left off.
for mi in find_mastodon_instances(scrape_data):
    # Reset the destination every iteration: a previous instance may have
    # switched output_dir to tos_dir, which would otherwise leak into this
    # one and also break the skip-check below.
    # NOTE(review): 'about_pages' matches the except-branch fallback below;
    # confirm it matches the module-level default set before this loop.
    output_dir = 'about_pages'
    if (not os.path.exists(os.path.join(output_dir, mi + '.png'))
            and not os.path.exists(os.path.join(tos_dir, mi + '.png'))):
        browser.get('https://{}/about/more/'.format(mi))
        # Size the window to the full document height so the screenshot
        # captures the whole page, not just the viewport.
        page = browser.find_element_by_tag_name('html')
        height = page.size['height']
        browser.set_window_size(1400, height + 100)
        try:
            # Instances with an extended about text get filed under tos_dir.
            bw = browser.find_element_by_css_selector('div .box-widget')
            about_text = bw.find_element_by_css_selector('div .rich-formatting')
            html = browser.execute_script('return arguments[0].innerHTML;', about_text)
            if html:
                output_dir = tos_dir
                with open(os.path.join(output_dir, mi + '_tos.txt'), 'w') as f:
                    f.write(html)
        except Exception as e:
            # No extended about section found: keep the default output dir.
            output_dir = 'about_pages'
            print(e)
        time.sleep(0.1)
        print('taking screenshot of', mi)
        browser.save_screenshot(os.path.join(output_dir, mi + '.png'))
        with open(os.path.join(output_dir, mi + '_about.html'), 'w') as f:
            f.write(browser.page_source)
    else:
        # Already captured on a previous run — skip.
        print(mi, '')
#browser.save_screenshot()
browser.quit()