rra
5 years ago
2 changed files with 84 additions and 0 deletions
@ -1 +1,3 @@ |
|||
*.pyc |
|||
about_pages/ |
|||
*.log |
|||
|
@ -0,0 +1,82 @@ |
|||
#!/bin/env python3 |
|||
# a tool for collecting mastodon /about/ & ToS and CoC pages. |
|||
# (c) roel roscam abbing 2020 |
|||
# gplv3 |
|||
import os, time, json |
|||
|
|||
# Load the instance scrape results. The context manager closes the file
# handle deterministically, and JSON is read as UTF-8 rather than whatever
# the locale default happens to be.
with open('instance_scrape.json', encoding='utf-8') as scrape_file:
    scrape_data = json.load(scrape_file)

# Screenshots and raw about pages go to output_dir; instances for which a
# ToS / CoC text could be extracted go to the nested tos_dir instead.
output_dir = 'about_pages'
tos_dir = os.path.join(output_dir, 'with_tos')

# makedirs creates output_dir on the way to tos_dir, and exist_ok avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(tos_dir, exist_ok=True)
|||
|
|||
|
|||
|
|||
def find_mastodon_instances(scrape_data):
    """Return the keys of *scrape_data* whose nodeinfo reports Mastodon.

    scrape_data is a mapping from an instance identifier to that instance's
    scrape record. A record counts as Mastodon when
    record['nodeinfo']['software']['name'] equals 'mastodon', compared
    case-insensitively.

    Records that lack any level of that path, or where a level is null or of
    an unexpected type, are skipped instead of raising, so a single
    malformed record cannot abort the whole scan.

    Returns a list of matching keys in the mapping's iteration order.
    """
    mastodon_instances = []

    for instance, record in scrape_data.items():
        # Walk the nested path defensively: every level may be missing or
        # null in the scraped data.
        nodeinfo = record.get('nodeinfo') if isinstance(record, dict) else None
        software = nodeinfo.get('software') if isinstance(nodeinfo, dict) else None
        name = software.get('name') if isinstance(software, dict) else None

        if isinstance(name, str) and name.lower() == 'mastodon':
            mastodon_instances.append(instance)

    return mastodon_instances
|||
|
|||
|
|||
def clear_cache(driver, timeout=60):
    """Clear the cookies and cache of a ChromeDriver session.

    Opens Chrome's "clear browsing data" settings page, waits up to
    *timeout* seconds for the confirmation button to be found, clicks it,
    and then waits (same timeout) until the button can no longer be found.

    NOTE(review): WebDriverWait and get_clear_browsing_button are not
    imported or defined in the visible part of this file, so confirm they
    are in scope before calling this. The chrome:// URL also means this
    only applies to a Chrome driver.
    """
    # Open the settings dialog that holds the "clear data" button.
    driver.get('chrome://settings/clearBrowserData')

    # Block until the button can be located.
    button_wait = WebDriverWait(driver, timeout)
    button_wait.until(get_clear_browsing_button)

    # Trigger the actual clearing.
    confirm_button = get_clear_browsing_button(driver)
    confirm_button.click()

    # Only return once the button has disappeared again.
    button_wait.until_not(get_clear_browsing_button)
|||
|
|||
|
|||
from selenium import webdriver  # you need to set up a driver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options


def _extract_tos_html(browser):
    """Return the inner HTML of the about page's rich-text box, or None.

    Looks for a '.rich-formatting' element inside a '.box-widget' element on
    the page currently loaded in *browser*. Any WebDriver error (most
    commonly the elements not being present) is printed and reported as
    None, so the caller can still archive the page itself.
    """
    try:
        box = browser.find_element(By.CSS_SELECTOR, 'div .box-widget')
        about_text = box.find_element(By.CSS_SELECTOR, 'div .rich-formatting')
        return browser.execute_script('return arguments[0].innerHTML;', about_text)
    except WebDriverException as e:
        print(e)
        return None


def _capture_instance(browser, instance):
    """Archive the /about/more/ page of one instance.

    Writes '<instance>.png' and '<instance>_about.html'. When a ToS text
    could be extracted, both go to tos_dir together with
    '<instance>_tos.txt'; otherwise they go to output_dir.

    Raises WebDriverException if the page cannot be loaded or captured and
    OSError if an output file cannot be written.
    """
    browser.get('https://{}/about/more/'.format(instance))

    # Grow the window to the page height (plus a margin) before taking the
    # screenshot -- presumably so the whole page is captured.
    page = browser.find_element(By.TAG_NAME, 'html')
    browser.set_window_size(1400, page.size['height'] + 100)

    html = _extract_tos_html(browser)

    # Decide the target directory per instance. Reassigning the module-level
    # output_dir here would leak one instance's choice into the next one.
    target_dir = tos_dir if html else output_dir
    if html:
        tos_path = os.path.join(target_dir, instance + '_tos.txt')
        with open(tos_path, 'w', encoding='utf-8') as f:
            f.write(html)

    # Short pause before the screenshot -- presumably to let the resize settle.
    time.sleep(0.1)
    print('taking screenshot of', instance)
    browser.save_screenshot(os.path.join(target_dir, instance + '.png'))

    # Explicit UTF-8 so non-ASCII page content cannot fail on a non-UTF-8 locale.
    about_path = os.path.join(target_dir, instance + '_about.html')
    with open(about_path, 'w', encoding='utf-8') as f:
        f.write(browser.page_source)


options = Options()
options.add_argument('-headless')

browser = webdriver.Firefox(options=options)

# clear_cache() is deliberately not called here: it drives Chrome's settings
# page, while this script uses Firefox.
try:
    for mi in find_mastodon_instances(scrape_data):
        try:
            _capture_instance(browser, mi)
        except (WebDriverException, OSError) as e:
            # Best effort: one unreachable or broken instance must not end
            # the whole run.
            print('skipping', mi, e)
finally:
    # Always shut the browser down, even if the run is interrupted, so no
    # headless Firefox process is left behind.
    browser.quit()
Loading…
Reference in new issue