# NOTE: ``get_printable_size`` is defined elsewhere in page_metadata.py; only a
# fragment of it is visible here, so it is not reproduced. ``BeautifulSoup``,
# ``os`` and ``signals`` are expected to be imported at the top of the module.


def get_assets(soup):
    """Return the de-duplicated hrefs of icon and stylesheet ``<link>`` tags.

    Args:
        soup: Parsed document exposing BeautifulSoup's ``find_all``.

    Returns:
        List of ``href`` values in document order, with everything from the
        first ``?`` removed, each value listed once.
    """
    assets = []
    seen = set()
    rels = ['apple-touch-icon', 'icon', 'stylesheet']
    for link in soup.find_all('link', {'rel': rels}):
        href = link.get('href')
        if not href:
            # A <link> without href has nothing to measure; skip it instead
            # of raising KeyError.
            continue
        href = href.split('?')[0]
        if href not in seen:
            seen.add(href)
            assets.append(href)
    return assets


def _extract_css_url(style):
    """Return the target of the first ``url('...')`` in *style*, or None."""
    opener = "url('"
    start = style.find(opener)
    if start == -1:
        return None
    start += len(opener)
    # Search for the closing quote after the opener and do not require a
    # trailing ';', which is optional on the last declaration.
    end = style.find("')", start)
    if end == -1:
        return None
    return style[start:end]


def get_media(html_file):
    """Collect every media/asset URL referenced by an HTML file.

    Gathers the sources of ``<img>`` and ``<object>`` tags, the
    ``url('...')`` found in the inline style of ``div.featured-img`` elements,
    and the link assets returned by :func:`get_assets`.

    Args:
        html_file: Path of the HTML file to parse.

    Returns:
        Tuple ``(media, soup)``: the de-duplicated list of URLs (order is not
        guaranteed) and the parsed soup, so callers can reuse it.
    """
    with open(html_file) as handle:
        soup = BeautifulSoup(handle.read(), 'html.parser')

    media = []
    for tag in soup(['img', 'object']):
        # <object> carries its resource in ``data`` rather than ``src``.
        source = tag.get('src') or tag.get('data')
        if source:
            media.append(source)

    for div in soup.find_all('div', {'class': 'featured-img'}):
        url = _extract_css_url(div.get('style') or '')
        if url:
            media.append(url)

    # duplicate media don't increase page size
    media = list(set(media + get_assets(soup)))
    return media, soup


def _strip_lang_suffix(location, lang):
    """Drop a trailing ``/<lang>`` component from a path or URL.

    Only a final component equal to *lang* is removed, so language codes that
    merely occur inside other components are left untouched, and a leading
    slash (absolute path) is preserved.
    """
    trimmed = location.rstrip('/' + os.sep) or location
    if lang and trimmed.endswith(('/' + lang, os.sep + lang)):
        return trimmed[:-(len(lang) + 1)]
    return trimmed


def _media_to_local_path(url, siteurl, base_dir):
    """Map a media URL onto the file it was written to under *base_dir*."""
    if siteurl:
        prefix = siteurl.rstrip('/') + '/'
        # Only strip the site prefix; with an empty SITEURL nothing must be
        # removed (a blind replace of '/' would delete every slash).
        if url.startswith(prefix):
            url = url[len(prefix):]
    # os.path.join discards base_dir when the second part is absolute.
    return os.path.join(base_dir, url.strip('/'))


def generate_metadata(path, context):
    """Compute the page weight and insert it into the written page.

    The weight is the size of the generated file plus the size of every
    referenced media/asset file that exists under the output directory.

    Args:
        path: Path of the written HTML file; it is parsed and rewritten.
        context: Mapping providing ``OUTPUT_PATH``, ``output_file``,
            ``SITEURL``, ``PLUGINS`` and, when ``i18n_subsites`` is listed in
            ``PLUGINS``, ``DEFAULT_LANG``.

    Raises:
        OSError: If the page or the output file cannot be read.
    """
    output_path = context['OUTPUT_PATH']
    output_file = context['output_file']
    siteurl = context['SITEURL']

    media_root = output_path
    if 'i18n_subsites' in context['PLUGINS']:
        # Presumably sub-sites are written to <output>/<lang> and served from
        # <siteurl>/<lang> while media lives under the main site — verify
        # against the i18n_subsites configuration in use.
        lang = context['DEFAULT_LANG']
        media_root = _strip_lang_suffix(output_path, lang)
        siteurl = _strip_lang_suffix(siteurl, lang)

    # reuse the same soup to limit calculation
    media, soup = get_media(path)

    media_size = 0
    for url in media:
        local_file = _media_to_local_path(url, siteurl, media_root)
        # isfile (not exists) so directories never count as media.
        if os.path.isfile(local_file):
            media_size += os.path.getsize(local_file)

    current_file = os.path.join(output_path, output_file)
    total_size = os.path.getsize(current_file) + media_size

    # The printed size is itself added to the page, so its own length is
    # included in the final figure.
    label = get_printable_size(total_size)
    metadata = get_printable_size(total_size + len(label))

    insert_metadata(path, metadata, soup)


def insert_metadata(output_file, metadata, soup):
    """Write *metadata* into the ``div#page-size`` element and save the page.

    Does nothing when the document has no ``<div id="page-size">``.

    Args:
        output_file: Path of the HTML file to overwrite.
        metadata: Value to display; converted with ``str``.
        soup: Parsed document of that file.
    """
    tag = soup.find('div', {'id': 'page-size'})
    if tag is None:
        return
    tag.string = str(metadata)
    # Serialize before opening the file: opening with 'w' truncates it, so a
    # failure while rendering must not leave an empty page behind.
    rendered = str(soup)
    with open(output_file, 'w') as f:
        f.write(rendered)


def register():
    """Connect :func:`generate_metadata` to the ``content_written`` signal."""
    signals.content_written.connect(generate_metadata)