# Patch summary: git diff for about_collector.py (index b9cabda..03ceb2b), four hunks.
#
# Hunk 1 (-3,6 +3,9): adds `import fedicrawler` and a module-level
#   `filters = [fedicrawler.only_netloc]`.
# Hunk 2 (-14,8 +17,6): drops two blank lines ahead of find_mastodon_instances().
# Hunk 3 (-24,23 +25,9): deletes the clear_cache(driver, timeout=60) helper and
#   re-adds `return mastodon_instances`, now preceded by a commented-out
#   `fedicrawler.multi_filter(filters, mastodon_instances)` call. Within the
#   hunks shown here, `filters` is referenced only from that commented-out line.
# Hunk 4 (-53,30 +40,33): removes the commented-out `#clear_cache(browser)` call
#   and nests the per-instance scrape (page load, window resize, ToS extraction,
#   screenshot, HTML dump) under a check that no `<mi>.png` exists yet in either
#   output_dir or tos_dir; instances that fail the check are only printed with
#   a check mark.
#
# NOTE(review): line breaks and indentation look collapsed to single spaces in
#   this copy, so it is not in the line-oriented layout patch tools expect and
#   the Python nesting cannot be read off the text -- presumably an extraction
#   artifact; restore from the original commit before applying. TODO confirm.
# NOTE(review): the scrape body rebinds `output_dir` (to tos_dir when ToS HTML
#   is found, to 'about_pages' in the except branch), and the new skip check
#   reads that same name. If the loop runs at module scope, as the hunk context
#   suggests, then after one iteration sets output_dir = tos_dir the next
#   check tests tos_dir twice and no longer looks in the directory output_dir
#   originally named. The initial value of output_dir is outside this diff --
#   verify against the full file.
#
diff --git a/about_collector.py b/about_collector.py index b9cabda..03ceb2b 100644 --- a/about_collector.py +++ b/about_collector.py @@ -3,6 +3,9 @@ # (c) roel roscam abbing 2020 # gplv3 import os, time, json +import fedicrawler + +filters = [fedicrawler.only_netloc] scrape_data = json.loads(open('instance_scrape.json').read()) @@ -14,8 +17,6 @@ if not os.path.exists(output_dir): if not os.path.exists(tos_dir): os.mkdir(tos_dir) - - def find_mastodon_instances(scrape_data): mastodon_instances = [] @@ -24,23 +25,9 @@ def find_mastodon_instances(scrape_data): if 'software' in scrape_data[i]['nodeinfo'].keys(): if scrape_data[i]['nodeinfo']['software']['name'].lower() == 'mastodon': mastodon_instances.append(i) - return mastodon_instances - -def clear_cache(driver, timeout=60): - """Clear the cookies and cache for the ChromeDriver instance.""" - # navigate to the settings page - driver.get('chrome://settings/clearBrowserData') - - # wait for the button to appear - wait = WebDriverWait(driver, timeout) - wait.until(get_clear_browsing_button) - - # click the button to clear the cache - get_clear_browsing_button(driver).click() - - # wait for the button to be gone before returning - wait.until_not(get_clear_browsing_button) + #mastodon_instances = fedicrawler.multi_filter(filters,mastodon_instances) + return mastodon_instances from selenium import webdriver # you need to set up a driver @@ -53,30 +40,33 @@ browser = webdriver.Firefox(options=options) #browser.set_window_size(1024, 768) # set the window size that you need for mi in find_mastodon_instances(scrape_data): - #clear_cache(browser) - - browser.get('https://{}/about/more/'.format(mi)) - page = browser.find_element_by_tag_name('html') - height=page.size['height'] - browser.set_window_size(1400,height+100) - - - try: - bw = browser.find_element_by_css_selector('div .box-widget') - about_text = bw.find_element_by_css_selector('div .rich-formatting') - html = browser.execute_script('return 
arguments[0].innerHTML;',about_text) - if html: - output_dir = tos_dir - with open(os.path.join(output_dir,mi+'_tos.txt'),'w') as f: - f.write(html) - except Exception as e: - output_dir = 'about_pages' - print(e) - - time.sleep(0.1) - print('taking screenshot of', mi) - browser.save_screenshot(os.path.join(output_dir,mi+'.png')) - with open(os.path.join(output_dir,mi+'_about.html'),'w') as f: - f.write(browser.page_source) + + if not os.path.exists(os.path.join(output_dir,mi+'.png')) and not os.path.exists(os.path.join(tos_dir,mi+'.png')): + + browser.get('https://{}/about/more/'.format(mi)) + page = browser.find_element_by_tag_name('html') + height=page.size['height'] + browser.set_window_size(1400,height+100) + + + try: + bw = browser.find_element_by_css_selector('div .box-widget') + about_text = bw.find_element_by_css_selector('div .rich-formatting') + html = browser.execute_script('return arguments[0].innerHTML;',about_text) + if html: + output_dir = tos_dir + with open(os.path.join(output_dir,mi+'_tos.txt'),'w') as f: + f.write(html) + except Exception as e: + output_dir = 'about_pages' + print(e) + + time.sleep(0.1) + print('taking screenshot of', mi) + browser.save_screenshot(os.path.join(output_dir,mi+'.png')) + with open(os.path.join(output_dir,mi+'_about.html'),'w') as f: + f.write(browser.page_source) + else: + print(mi, '✔') #browser.save_screenshot() browser.quit()