|
|
@ -3,6 +3,9 @@ |
|
|
|
# (c) roel roscam abbing 2020 |
|
|
|
# gplv3 |
|
|
|
import os, time, json |
|
|
|
import fedicrawler |
|
|
|
|
|
|
|
# Filter callables taken from the fedicrawler module.
filters = [fedicrawler.only_netloc]

# Load the crawler output. It is indexed per instance further down
# (scrape_data[i]['nodeinfo']), so presumably it maps instance host -> record.
# The context manager closes the file handle deterministically, and the
# explicit encoding keeps the read independent of the platform default.
with open('instance_scrape.json', encoding='utf-8') as scrape_file:
    scrape_data = json.load(scrape_file)
|
|
|
|
|
|
@ -14,8 +17,6 @@ if not os.path.exists(output_dir): |
|
|
|
# Make sure the directory for extracted terms-of-service text exists.
# exist_ok avoids the check-then-create race of exists() + mkdir(), and
# makedirs also creates any missing parent directories.
os.makedirs(tos_dir, exist_ok=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def find_mastodon_instances(scrape_data): |
|
|
|
mastodon_instances = [] |
|
|
|
|
|
|
@ -24,23 +25,9 @@ def find_mastodon_instances(scrape_data): |
|
|
|
if 'software' in scrape_data[i]['nodeinfo'].keys(): |
|
|
|
if scrape_data[i]['nodeinfo']['software']['name'].lower() == 'mastodon': |
|
|
|
mastodon_instances.append(i) |
|
|
|
return mastodon_instances |
|
|
|
|
|
|
|
|
|
|
|
def clear_cache(driver, timeout=60):
    """Clear cookies and cached data of a ChromeDriver session.

    Opens Chrome's "clear browsing data" settings page, waits for the
    confirmation button, clicks it, and returns once the button has
    disappeared again.

    driver  -- the webdriver instance to operate on
    timeout -- value handed to WebDriverWait for each wait (default 60);
               presumably seconds, as in Selenium's WebDriverWait -- confirm
    """
    # The dialog lives on Chrome's internal settings page.
    driver.get('chrome://settings/clearBrowserData')

    # Block until the confirmation button can be located.
    waiter = WebDriverWait(driver, timeout)
    waiter.until(get_clear_browsing_button)

    # Look the button up again and press it to start clearing.
    confirm_button = get_clear_browsing_button(driver)
    confirm_button.click()

    # Do not return before the button is gone.
    waiter.until_not(get_clear_browsing_button)
|
|
|
#mastodon_instances = fedicrawler.multi_filter(filters,mastodon_instances) |
|
|
|
return mastodon_instances |
|
|
|
|
|
|
|
|
|
|
|
from selenium import webdriver # you need to set up a driver |
|
|
@ -53,30 +40,33 @@ browser = webdriver.Firefox(options=options) |
|
|
|
#browser.set_window_size(1024, 768) # set the window size that you need |
|
|
|
|
|
|
|
# Locator constants; find_element(By.X, ...) works on Selenium 3 and 4,
# whereas the find_element_by_* helpers were removed in Selenium 4.3.
from selenium.webdriver.common.by import By

# Fallback directory for instances whose ToS text could not be extracted.
# NOTE(review): this is the literal the original except-branch used;
# presumably it is the same directory as the module-level output_dir -- confirm.
about_dir = 'about_pages'

try:
    for mi in find_mastodon_instances(scrape_data):
        #clear_cache(browser)

        # Skip instances that already have a screenshot in either directory.
        # The check runs BEFORE any page load so finished instances cost nothing.
        already_captured = (
            os.path.exists(os.path.join(about_dir, mi + '.png'))
            or os.path.exists(os.path.join(tos_dir, mi + '.png'))
        )
        if already_captured:
            print(mi, '✔')
            continue

        browser.get('https://{}/about/more/'.format(mi))

        # Grow the window to the full page height so the screenshot
        # captures the whole page rather than just the first viewport.
        page = browser.find_element(By.TAG_NAME, 'html')
        height = page.size['height']
        browser.set_window_size(1400, height + 100)

        # Per-iteration destination: it defaults to the about-pages directory
        # and switches to tos_dir only when ToS markup was actually found.
        # Using a local avoids leaking the previous iteration's choice.
        dest_dir = about_dir
        try:
            bw = browser.find_element(By.CSS_SELECTOR, 'div .box-widget')
            about_text = bw.find_element(By.CSS_SELECTOR, 'div .rich-formatting')
            html = browser.execute_script('return arguments[0].innerHTML;', about_text)
            if html:
                dest_dir = tos_dir
                tos_path = os.path.join(dest_dir, mi + '_tos.txt')
                with open(tos_path, 'w', encoding='utf-8') as f:
                    f.write(html)
        except Exception as e:
            # Best effort: a missing element or failed write just means the
            # page is archived as a plain about page instead.
            dest_dir = about_dir
            print(e)

        time.sleep(0.1)
        print('taking screenshot of', mi)
        browser.save_screenshot(os.path.join(dest_dir, mi + '.png'))

        # Keep the raw page next to the screenshot; explicit UTF-8 so pages
        # with non-ASCII text do not fail on platforms with another default.
        about_path = os.path.join(dest_dir, mi + '_about.html')
        with open(about_path, 'w', encoding='utf-8') as f:
            f.write(browser.page_source)
finally:
    # Always shut the browser down, even if an instance aborts the crawl.
    browser.quit()
|
|
|