From 27a5c9b1d75ff87196fab913b0ca0401fe31f88c Mon Sep 17 00:00:00 2001 From: rra Date: Wed, 6 May 2020 09:52:38 +0200 Subject: [PATCH] minor tweaks --- about_collector.py | 8 ++++++-- fedicrawler.py | 4 +++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/about_collector.py b/about_collector.py index cf6c021..f324bf9 100644 --- a/about_collector.py +++ b/about_collector.py @@ -41,7 +41,8 @@ browser = webdriver.Firefox(options=options) for mi in find_mastodon_instances(scrape_data): if not os.path.exists(os.path.join(output_dir,mi+'.png')) and not os.path.exists(os.path.join(tos_dir,mi+'.png')): - + try: + print('🡓', mi) browser.get('https://{}/about/more/'.format(mi)) page = browser.find_element_by_tag_name('html') height=page.size['height'] @@ -61,10 +62,13 @@ for mi in find_mastodon_instances(scrape_data): print(e) time.sleep(0.1) - print('taking screenshot of', mi) + print('📷', mi) browser.save_screenshot(os.path.join(output_dir,mi+'.png')) with open(os.path.join(output_dir,mi+'_about.html'),'w') as f: f.write(browser.page_source) + print(mi,'✔') + except Exception as e: + print(e) else: print(mi, '✔') #browser.save_screenshot() diff --git a/fedicrawler.py b/fedicrawler.py index de719ae..15816d3 100644 --- a/fedicrawler.py +++ b/fedicrawler.py @@ -35,7 +35,9 @@ def only_netloc(instance): #some peerlists return stuff like #mastodon.social/users/blabla or #domain.tld/friendica which are all invalid - return urlparse('https://'+instance).netloc + netloc = urlparse('https://'+instance).netloc + print(netloc) + return netloc def multi_filter(fs, l):