
now continues where it left off last time

Branch: master
Author: rra, 5 years ago
Commit: 27a6fb1a0a
1 changed file: about_collector.py (76 lines changed)
@@ -3,6 +3,9 @@
 # (c) roel roscam abbing 2020
 # gplv3
 
 import os, time, json
+import fedicrawler
+
+filters = [fedicrawler.only_netloc]
 
 scrape_data = json.loads(open('instance_scrape.json').read())
@ -14,8 +17,6 @@ if not os.path.exists(output_dir):
if not os.path.exists(tos_dir): if not os.path.exists(tos_dir):
os.mkdir(tos_dir) os.mkdir(tos_dir)
def find_mastodon_instances(scrape_data): def find_mastodon_instances(scrape_data):
mastodon_instances = [] mastodon_instances = []
@@ -24,23 +25,9 @@ def find_mastodon_instances(scrape_data):
         if 'software' in scrape_data[i]['nodeinfo'].keys():
             if scrape_data[i]['nodeinfo']['software']['name'].lower() == 'mastodon':
                 mastodon_instances.append(i)
-    return mastodon_instances
-
-def clear_cache(driver, timeout=60):
-    """Clear the cookies and cache for the ChromeDriver instance."""
-    # navigate to the settings page
-    driver.get('chrome://settings/clearBrowserData')
-    # wait for the button to appear
-    wait = WebDriverWait(driver, timeout)
-    wait.until(get_clear_browsing_button)
-    # click the button to clear the cache
-    get_clear_browsing_button(driver).click()
-    # wait for the button to be gone before returning
-    wait.until_not(get_clear_browsing_button)
-
+    #mastodon_instances = fedicrawler.multi_filter(filters,mastodon_instances)
+    return mastodon_instances
 
 from selenium import webdriver # you need to set up a driver
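
The hunk above imports fedicrawler and defines filters = [fedicrawler.only_netloc], but the call that would apply them, fedicrawler.multi_filter(filters, mastodon_instances), stays commented out before the return. As a sketch only, this is how find_mastodon_instances would look with that step enabled; the loop structure and the .get('nodeinfo') guard are assumptions, and multi_filter is assumed to return the filtered list, as the commented call implies.

import fedicrawler

filters = [fedicrawler.only_netloc]

def find_mastodon_instances(scrape_data):
    # collect instances whose nodeinfo reports Mastodon as the server software
    mastodon_instances = []
    for i in scrape_data:
        nodeinfo = scrape_data[i].get('nodeinfo')  # this guard is an assumption
        if nodeinfo and 'software' in nodeinfo.keys():
            if nodeinfo['software']['name'].lower() == 'mastodon':
                mastodon_instances.append(i)
    # the commit keeps this filter step commented out; shown enabled here
    mastodon_instances = fedicrawler.multi_filter(filters, mastodon_instances)
    return mastodon_instances
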
@@ -53,30 +40,33 @@ browser = webdriver.Firefox(options=options)
 #browser.set_window_size(1024, 768) # set the window size that you need
 
 for mi in find_mastodon_instances(scrape_data):
-    #clear_cache(browser)
-    browser.get('https://{}/about/more/'.format(mi))
-    page = browser.find_element_by_tag_name('html')
-    height=page.size['height']
-    browser.set_window_size(1400,height+100)
-
-    try:
-        bw = browser.find_element_by_css_selector('div .box-widget')
-        about_text = bw.find_element_by_css_selector('div .rich-formatting')
-        html = browser.execute_script('return arguments[0].innerHTML;',about_text)
-        if html:
-            output_dir = tos_dir
-            with open(os.path.join(output_dir,mi+'_tos.txt'),'w') as f:
-                f.write(html)
-    except Exception as e:
-        output_dir = 'about_pages'
-        print(e)
-
-    time.sleep(0.1)
-    print('taking screenshot of', mi)
-    browser.save_screenshot(os.path.join(output_dir,mi+'.png'))
-    with open(os.path.join(output_dir,mi+'_about.html'),'w') as f:
-        f.write(browser.page_source)
+    if not os.path.exists(os.path.join(output_dir,mi+'.png')) and not os.path.exists(os.path.join(tos_dir,mi+'.png')):
+        browser.get('https://{}/about/more/'.format(mi))
+        page = browser.find_element_by_tag_name('html')
+        height=page.size['height']
+        browser.set_window_size(1400,height+100)
+
+        try:
+            bw = browser.find_element_by_css_selector('div .box-widget')
+            about_text = bw.find_element_by_css_selector('div .rich-formatting')
+            html = browser.execute_script('return arguments[0].innerHTML;',about_text)
+            if html:
+                output_dir = tos_dir
+                with open(os.path.join(output_dir,mi+'_tos.txt'),'w') as f:
+                    f.write(html)
+        except Exception as e:
+            output_dir = 'about_pages'
+            print(e)
+
+        time.sleep(0.1)
+        print('taking screenshot of', mi)
+        browser.save_screenshot(os.path.join(output_dir,mi+'.png'))
+        with open(os.path.join(output_dir,mi+'_about.html'),'w') as f:
+            f.write(browser.page_source)
+    else:
+        print(mi, '')
 
 #browser.save_screenshot()
 browser.quit()
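
The commit message ("now continues where it left off last time") refers to the new os.path.exists guard around the loop: an instance is only visited if neither output folder already holds its screenshot, so an interrupted run resumes instead of re-fetching everything. A minimal sketch of that check, with the helper name and the 'tos_pages' default made up for illustration:

import os

def already_captured(instance, about_dir='about_pages', tos_dir='tos_pages'):
    # True if a screenshot for this instance exists in either output folder
    return (os.path.exists(os.path.join(about_dir, instance + '.png'))
            or os.path.exists(os.path.join(tos_dir, instance + '.png')))

# inside the loop, already-captured instances are skipped, which is what makes
# a restarted run pick up where the previous one stopped:
# for mi in find_mastodon_instances(scrape_data):
#     if already_captured(mi):
#         continue
#     ...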
