now continues where it left off last time
This commit is contained in:
parent
1b1e5b1e52
commit
27a6fb1a0a
@ -3,6 +3,9 @@
|
|||||||
# (c) roel roscam abbing 2020
|
# (c) roel roscam abbing 2020
|
||||||
# gplv3
|
# gplv3
|
||||||
import os, time, json
|
import os, time, json
|
||||||
|
import fedicrawler
|
||||||
|
|
||||||
|
filters = [fedicrawler.only_netloc]
|
||||||
|
|
||||||
scrape_data = json.loads(open('instance_scrape.json').read())
|
scrape_data = json.loads(open('instance_scrape.json').read())
|
||||||
|
|
||||||
@ -14,8 +17,6 @@ if not os.path.exists(output_dir):
|
|||||||
if not os.path.exists(tos_dir):
|
if not os.path.exists(tos_dir):
|
||||||
os.mkdir(tos_dir)
|
os.mkdir(tos_dir)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def find_mastodon_instances(scrape_data):
|
def find_mastodon_instances(scrape_data):
|
||||||
mastodon_instances = []
|
mastodon_instances = []
|
||||||
|
|
||||||
@ -24,25 +25,11 @@ def find_mastodon_instances(scrape_data):
|
|||||||
if 'software' in scrape_data[i]['nodeinfo'].keys():
|
if 'software' in scrape_data[i]['nodeinfo'].keys():
|
||||||
if scrape_data[i]['nodeinfo']['software']['name'].lower() == 'mastodon':
|
if scrape_data[i]['nodeinfo']['software']['name'].lower() == 'mastodon':
|
||||||
mastodon_instances.append(i)
|
mastodon_instances.append(i)
|
||||||
|
|
||||||
|
#mastodon_instances = fedicrawler.multi_filter(filters,mastodon_instances)
|
||||||
return mastodon_instances
|
return mastodon_instances
|
||||||
|
|
||||||
|
|
||||||
def clear_cache(driver, timeout=60):
|
|
||||||
"""Clear the cookies and cache for the ChromeDriver instance."""
|
|
||||||
# navigate to the settings page
|
|
||||||
driver.get('chrome://settings/clearBrowserData')
|
|
||||||
|
|
||||||
# wait for the button to appear
|
|
||||||
wait = WebDriverWait(driver, timeout)
|
|
||||||
wait.until(get_clear_browsing_button)
|
|
||||||
|
|
||||||
# click the button to clear the cache
|
|
||||||
get_clear_browsing_button(driver).click()
|
|
||||||
|
|
||||||
# wait for the button to be gone before returning
|
|
||||||
wait.until_not(get_clear_browsing_button)
|
|
||||||
|
|
||||||
|
|
||||||
from selenium import webdriver # you need to set up a driver
|
from selenium import webdriver # you need to set up a driver
|
||||||
from selenium.webdriver.firefox.options import Options
|
from selenium.webdriver.firefox.options import Options
|
||||||
|
|
||||||
@ -53,30 +40,33 @@ browser = webdriver.Firefox(options=options)
|
|||||||
#browser.set_window_size(1024, 768) # set the window size that you need
|
#browser.set_window_size(1024, 768) # set the window size that you need
|
||||||
|
|
||||||
for mi in find_mastodon_instances(scrape_data):
|
for mi in find_mastodon_instances(scrape_data):
|
||||||
#clear_cache(browser)
|
|
||||||
|
|
||||||
browser.get('https://{}/about/more/'.format(mi))
|
if not os.path.exists(os.path.join(output_dir,mi+'.png')) and not os.path.exists(os.path.join(tos_dir,mi+'.png')):
|
||||||
page = browser.find_element_by_tag_name('html')
|
|
||||||
height=page.size['height']
|
browser.get('https://{}/about/more/'.format(mi))
|
||||||
browser.set_window_size(1400,height+100)
|
page = browser.find_element_by_tag_name('html')
|
||||||
|
height=page.size['height']
|
||||||
|
browser.set_window_size(1400,height+100)
|
||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
bw = browser.find_element_by_css_selector('div .box-widget')
|
bw = browser.find_element_by_css_selector('div .box-widget')
|
||||||
about_text = bw.find_element_by_css_selector('div .rich-formatting')
|
about_text = bw.find_element_by_css_selector('div .rich-formatting')
|
||||||
html = browser.execute_script('return arguments[0].innerHTML;',about_text)
|
html = browser.execute_script('return arguments[0].innerHTML;',about_text)
|
||||||
if html:
|
if html:
|
||||||
output_dir = tos_dir
|
output_dir = tos_dir
|
||||||
with open(os.path.join(output_dir,mi+'_tos.txt'),'w') as f:
|
with open(os.path.join(output_dir,mi+'_tos.txt'),'w') as f:
|
||||||
f.write(html)
|
f.write(html)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
output_dir = 'about_pages'
|
output_dir = 'about_pages'
|
||||||
print(e)
|
print(e)
|
||||||
|
|
||||||
time.sleep(0.1)
|
time.sleep(0.1)
|
||||||
print('taking screenshot of', mi)
|
print('taking screenshot of', mi)
|
||||||
browser.save_screenshot(os.path.join(output_dir,mi+'.png'))
|
browser.save_screenshot(os.path.join(output_dir,mi+'.png'))
|
||||||
with open(os.path.join(output_dir,mi+'_about.html'),'w') as f:
|
with open(os.path.join(output_dir,mi+'_about.html'),'w') as f:
|
||||||
f.write(browser.page_source)
|
f.write(browser.page_source)
|
||||||
|
else:
|
||||||
|
print(mi, '✔')
|
||||||
#browser.save_screenshot()
|
#browser.save_screenshot()
|
||||||
browser.quit()
|
browser.quit()
|
||||||
|
Loading…
Reference in New Issue
Block a user