626 lines
23 KiB
Python
626 lines
23 KiB
Python
import copy
|
|
import datetime
|
|
import locale
|
|
import logging
|
|
import os
|
|
import re
|
|
from html import unescape
|
|
from urllib.parse import unquote, urljoin, urlparse, urlunparse
|
|
|
|
import pytz
|
|
|
|
from pelican.plugins import signals
|
|
from pelican.settings import DEFAULT_CONFIG
|
|
from pelican.utils import (deprecated_attribute, memoized, path_to_url,
|
|
posixize_path, sanitised_join, set_date_tzinfo,
|
|
slugify, truncate_html_words)
|
|
|
|
# Import these so that they're available when you import from pelican.contents.
|
|
from pelican.urlwrappers import (Author, Category, Tag, URLWrapper) # NOQA
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class Content:
|
|
"""Represents a content.
|
|
|
|
:param content: the string to parse, containing the original content.
|
|
:param metadata: the metadata associated to this page (optional).
|
|
:param settings: the settings dictionary (optional).
|
|
:param source_path: The location of the source of this content (if any).
|
|
:param context: The shared context between generators.
|
|
|
|
"""
|
|
# Deprecated alias: ``filename`` was renamed to ``source_path`` in 3.2.0.
# The function body is only a placeholder; presumably the decorator swaps it
# for an attribute that forwards to ``source_path`` -- confirm in
# pelican.utils.deprecated_attribute.
@deprecated_attribute(old='filename', new='source_path', since=(3, 2, 0))
def filename():
    return None
|
|
|
|
def __init__(self, content, metadata=None, settings=None,
             source_path=None, context=None):
    """Build the content object and derive its attributes from metadata.

    Every metadata entry is exposed as an attribute, then author(s),
    language, slug, date format, timezone, dates and status are filled in
    from the settings when the metadata did not provide them. The
    ``content_object_init`` signal is sent once everything is set.

    :param content: the string to parse, containing the original content.
    :param metadata: mapping of metadata for this content (optional).
    :param settings: the settings dictionary; a deep copy of
        ``DEFAULT_CONFIG`` is used when omitted.
    :param source_path: location of the source of this content (if any).
    :param context: the shared context between generators (optional).
    """
    if metadata is None:
        metadata = {}
    if settings is None:
        # Deep copy so this instance never shares the module-level defaults.
        settings = copy.deepcopy(DEFAULT_CONFIG)

    self.settings = settings
    self._content = content
    if context is None:
        context = {}
    self._context = context
    self.translations = []

    # Work on a private copy so the caller's mapping is left untouched.
    local_metadata = dict()
    local_metadata.update(metadata)

    # Set metadata as attributes (attribute names are lower-cased).
    # 'save_as' and 'url' are stored as 'override_save_as'/'override_url',
    # which get_url_setting() checks before expanding the settings.
    # Keys matching a property of this class ('status', 'summary') go
    # through that property's setter.
    for key, value in local_metadata.items():
        if key in ('save_as', 'url'):
            key = 'override_' + key
        setattr(self, key.lower(), value)

    # also keep track of the metadata attributes available
    self.metadata = local_metadata

    # default template if it's not defined in page
    self.template = self._get_template()

    # First, read the authors from "authors", if not, fallback to "author"
    # and if not use the settings defined one, if any.
    if not hasattr(self, 'author'):
        if hasattr(self, 'authors'):
            self.author = self.authors[0]
        elif 'AUTHOR' in settings:
            self.author = Author(settings['AUTHOR'], settings)

    if not hasattr(self, 'authors') and hasattr(self, 'author'):
        self.authors = [self.author]

    # XXX Split all the following code into pieces, there is too much here.

    # manage languages: without a DEFAULT_LANG setting the content is
    # always considered to be in the default language.
    self.in_default_lang = True
    if 'DEFAULT_LANG' in settings:
        default_lang = settings['DEFAULT_LANG'].lower()
        if not hasattr(self, 'lang'):
            self.lang = default_lang

        self.in_default_lang = (self.lang == default_lang)

    # create the slug if not existing, generate slug according to
    # the SLUGIFY_SOURCE setting ('title' or 'basename'); no slug is set
    # when the chosen source is unavailable.
    if not hasattr(self, 'slug'):
        if (settings['SLUGIFY_SOURCE'] == 'title' and
                hasattr(self, 'title')):
            value = self.title
        elif (settings['SLUGIFY_SOURCE'] == 'basename' and
                source_path is not None):
            value = os.path.basename(os.path.splitext(source_path)[0])
        else:
            value = None
        if value is not None:
            self.slug = slugify(
                value,
                regex_subs=settings.get('SLUG_REGEX_SUBSTITUTIONS', []),
                preserve_case=settings.get('SLUGIFY_PRESERVE_CASE', False),
                use_unicode=settings.get('SLUGIFY_USE_UNICODE', False))

    self.source_path = source_path
    self.relative_source_path = self.get_relative_source_path()

    # manage the date format: per-language format first, then the default
    if not hasattr(self, 'date_format'):
        if hasattr(self, 'lang') and self.lang in settings['DATE_FORMATS']:
            self.date_format = settings['DATE_FORMATS'][self.lang]
        else:
            self.date_format = settings['DEFAULT_DATE_FORMAT']

    # A tuple date format is (locale, format): the locale is applied via
    # locale.setlocale(LC_ALL, ...) and only the format string is kept.
    if isinstance(self.date_format, tuple):
        locale_string = self.date_format[0]
        locale.setlocale(locale.LC_ALL, locale_string)
        self.date_format = self.date_format[1]

    # manage timezone: metadata value wins over the TIMEZONE setting
    default_timezone = settings.get('TIMEZONE', 'UTC')
    timezone = getattr(self, 'timezone', default_timezone)
    self.timezone = pytz.timezone(timezone)

    if hasattr(self, 'date'):
        self.date = set_date_tzinfo(self.date, timezone)
        self.locale_date = self.date.strftime(self.date_format)

    if hasattr(self, 'modified'):
        self.modified = set_date_tzinfo(self.modified, timezone)
        self.locale_modified = self.modified.strftime(self.date_format)

    # manage status
    if not hasattr(self, 'status'):
        # Previous default of None broke comment plugins and perhaps others
        self.status = getattr(self, 'default_status', '')

    # store the summary metadata if it is set
    if 'summary' in metadata:
        self._summary = metadata['summary']

    signals.content_object_init.send(self)
|
|
|
|
def __str__(self):
    """Return the source path when there is one, else the object's repr."""
    location = self.source_path
    if location:
        return location
    return repr(self)
|
|
|
|
def _has_valid_mandatory_properties(self):
    """Check that every name in ``mandatory_properties`` is an attribute.

    Only the first missing property is reported (as an error log entry);
    the method then returns False. Returns True when nothing is missing.
    """
    first_missing = next(
        (name for name in self.mandatory_properties
         if not hasattr(self, name)),
        None)
    if first_missing is None:
        return True

    logger.error(
        "Skipping %s: could not find information about '%s'",
        self, first_missing)
    return False
|
|
|
|
def _has_valid_save_as(self):
    """Check that ``save_as`` does not point outside the output path.

    Returns True when the target stays inside ``OUTPUT_PATH`` or when that
    setting is absent (the check cannot be made); otherwise logs an error
    and returns False.
    """
    try:
        output_root = self.settings["OUTPUT_PATH"]
    except KeyError:
        # No output path configured: we cannot check.
        return True

    inside_output = True
    try:
        sanitised_join(output_root, self.save_as)
    except RuntimeError:
        # Raised when the joined path lands outside the output directory.
        inside_output = False
        logger.error(
            "Skipping %s: file %r would be written outside output path",
            self,
            self.save_as,
        )
    return inside_output
|
|
|
|
def _has_valid_status(self):
    """Check the status against ``allowed_statuses`` when that is defined.

    Classes without an ``allowed_statuses`` attribute accept any status.
    An unknown status is logged as an error and yields False.
    """
    # If undefined we allow all.
    if not hasattr(self, 'allowed_statuses'):
        return True
    if self.status in self.allowed_statuses:
        return True

    logger.error(
        "Unknown status '%s' for file %s, skipping it.",
        self.status,
        self
    )
    return False
|
|
|
|
def is_valid(self):
    """Run every validation and report whether all of them passed.

    Each check is executed unconditionally (no short-circuit) so that
    every problem gets reported, not just the first one.
    """
    properties_ok = self._has_valid_mandatory_properties()
    save_as_ok = self._has_valid_save_as()
    status_ok = self._has_valid_status()
    return all((properties_ok, save_as_ok, status_ok))
|
|
|
|
@property
def url_format(self):
    """Return the mapping of values used to format URL settings.

    It is a shallow copy of the metadata in which ``path``, ``slug``,
    ``lang``, ``date``, ``author`` and ``category`` are overwritten with
    values suitable for URL patterns.
    """
    fields = copy.copy(self.metadata)
    source = self.metadata.get('path', self.get_relative_source_path())

    fields['path'] = path_to_url(source)
    fields['slug'] = getattr(self, 'slug', '')
    fields['lang'] = getattr(self, 'lang', 'en')
    fields['date'] = getattr(self, 'date', datetime.datetime.now())
    if hasattr(self, 'author'):
        fields['author'] = self.author.slug
    else:
        fields['author'] = ''
    if hasattr(self, 'category'):
        fields['category'] = self.category.slug
    else:
        fields['category'] = ''
    return fields
|
|
|
|
def _expand_settings(self, key, klass=None):
    """Format the ``<KLASS>_<KEY>`` setting with the ``url_format`` values.

    :param key: setting suffix, e.g. ``'url'`` or ``'save_as'``.
    :param klass: setting prefix; the class name is used when falsy.
    """
    prefix = klass or self.__class__.__name__
    setting_name = '{}_{}'.format(prefix, key).upper()
    pattern = self.settings[setting_name]
    return pattern.format(**self.url_format)
|
|
|
|
def get_url_setting(self, key):
    """Return the value for ``key`` ('url' or 'save_as').

    An ``override_<key>`` attribute takes precedence. Otherwise the
    matching setting is expanded, using the ``lang_<key>`` variant for
    content that is not in the default language.
    """
    override_name = 'override_' + key
    if hasattr(self, override_name):
        return getattr(self, override_name)

    if not self.in_default_lang:
        key = 'lang_%s' % key
    return self._expand_settings(key)
|
|
|
|
def _link_replacer(self, siteurl, m):
|
|
what = m.group('what')
|
|
value = urlparse(m.group('value'))
|
|
path = value.path
|
|
origin = m.group('path')
|
|
|
|
# urllib.parse.urljoin() produces `a.html` for urljoin("..", "a.html")
|
|
# so if RELATIVE_URLS are enabled, we fall back to os.path.join() to
|
|
# properly get `../a.html`. However, os.path.join() produces
|
|
# `baz/http://foo/bar.html` for join("baz", "http://foo/bar.html")
|
|
# instead of correct "http://foo/bar.html", so one has to pick a side
|
|
# as there is no silver bullet.
|
|
if self.settings['RELATIVE_URLS']:
|
|
joiner = os.path.join
|
|
else:
|
|
joiner = urljoin
|
|
|
|
# However, it's not *that* simple: urljoin("blog", "index.html")
|
|
# produces just `index.html` instead of `blog/index.html` (unlike
|
|
# os.path.join()), so in order to get a correct answer one needs to
|
|
# append a trailing slash to siteurl in that case. This also makes
|
|
# the new behavior fully compatible with Pelican 3.7.1.
|
|
if not siteurl.endswith('/'):
|
|
siteurl += '/'
|
|
|
|
# XXX Put this in a different location.
|
|
if what in {'filename', 'static', 'attach'}:
|
|
def _get_linked_content(key, url):
|
|
nonlocal value
|
|
|
|
def _find_path(path):
|
|
if path.startswith('/'):
|
|
path = path[1:]
|
|
else:
|
|
# relative to the source path of this content
|
|
path = self.get_relative_source_path(
|
|
os.path.join(self.relative_dir, path)
|
|
)
|
|
return self._context[key].get(path, None)
|
|
|
|
# try path
|
|
result = _find_path(url.path)
|
|
if result is not None:
|
|
return result
|
|
|
|
# try unquoted path
|
|
result = _find_path(unquote(url.path))
|
|
if result is not None:
|
|
return result
|
|
|
|
# try html unescaped url
|
|
unescaped_url = urlparse(unescape(url.geturl()))
|
|
result = _find_path(unescaped_url.path)
|
|
if result is not None:
|
|
value = unescaped_url
|
|
return result
|
|
|
|
# check if a static file is linked with {filename}
|
|
if what == 'filename' and key == 'generated_content':
|
|
linked_content = _get_linked_content('static_content', value)
|
|
if linked_content:
|
|
logger.warning(
|
|
'{filename} used for linking to static'
|
|
' content %s in %s. Use {static} instead',
|
|
value.path,
|
|
self.get_relative_source_path())
|
|
return linked_content
|
|
|
|
return None
|
|
|
|
if what == 'filename':
|
|
key = 'generated_content'
|
|
else:
|
|
key = 'static_content'
|
|
|
|
linked_content = _get_linked_content(key, value)
|
|
if linked_content:
|
|
if what == 'attach':
|
|
linked_content.attach_to(self)
|
|
origin = joiner(siteurl, linked_content.url)
|
|
origin = origin.replace('\\', '/') # for Windows paths.
|
|
else:
|
|
logger.warning(
|
|
"Unable to find '%s', skipping url replacement.",
|
|
value.geturl(), extra={
|
|
'limit_msg': ("Other resources were not found "
|
|
"and their urls not replaced")})
|
|
elif what == 'category':
|
|
origin = joiner(siteurl, Category(path, self.settings).url)
|
|
elif what == 'tag':
|
|
origin = joiner(siteurl, Tag(path, self.settings).url)
|
|
elif what == 'index':
|
|
origin = joiner(siteurl, self.settings['INDEX_SAVE_AS'])
|
|
elif what == 'author':
|
|
origin = joiner(siteurl, Author(path, self.settings).url)
|
|
else:
|
|
logger.warning(
|
|
"Replacement Indicator '%s' not recognized, "
|
|
"skipping replacement",
|
|
what)
|
|
|
|
# keep all other parts, such as query, fragment, etc.
|
|
parts = list(value)
|
|
parts[2] = origin
|
|
origin = urlunparse(parts)
|
|
|
|
return ''.join((m.group('markup'), m.group('quote'), origin,
|
|
m.group('quote')))
|
|
|
|
def _get_intrasite_link_regex(self):
    """Compile the pattern that finds intrasite links in HTML attributes.

    The ``INTRASITE_LINK_REGEX`` setting is spliced in front of the
    ``value`` group. The resulting pattern exposes the named groups
    ``markup``, ``quote``, ``path`` and ``value``; callers in this class
    also read a ``what`` group, which therefore has to be supplied by the
    setting itself.

    :return: the compiled pattern (verbose mode).
    """
    intrasite_link_regex = self.settings['INTRASITE_LINK_REGEX']
    # ``\2`` is a back-reference to the second group (``quote``), so the
    # closing quote must be the same character as the opening one.
    regex = r"""
        (?P<markup><[^\>]+ # match tag with all url-value attributes
            (?:href|src|poster|data|cite|formaction|action)\s*=\s*)

        (?P<quote>["\']) # require value to be quoted
        (?P<path>{}(?P<value>.*?)) # the url value
        \2""".format(intrasite_link_regex)
    return re.compile(regex, re.X)
|
|
|
|
def _update_content(self, content, siteurl):
    """Rewrite the intrasite links found in ``content``.

    Every link matched by the intrasite link regex is passed through
    ``_link_replacer`` so that it points at the generated output.

    :param content: content resource that will be passed to the templates.
    :param siteurl: siteurl which is locally generated by the writer in
        case of RELATIVE_URLS.
    :return: the updated content; falsy input is returned unchanged.
    """
    if not content:
        return content

    link_pattern = self._get_intrasite_link_regex()

    def substitute(match):
        return self._link_replacer(siteurl, match)

    return link_pattern.sub(substitute, content)
|
|
|
|
def get_static_links(self):
    """Collect the paths targeted by ``static`` and ``attach`` links.

    Paths starting with ``/`` have the slash stripped; other paths are
    resolved relative to the directory of this content's source. ``%20``
    sequences are turned back into spaces.

    :return: a set of path strings.
    """
    link_pattern = self._get_intrasite_link_regex()
    collected = set()
    for match in link_pattern.finditer(self._content):
        kind = match.group('what')
        target = urlparse(match.group('value')).path
        if kind in {'static', 'attach'}:
            if target.startswith('/'):
                target = target[1:]
            else:
                # relative to the source path of this content
                target = self.get_relative_source_path(
                    os.path.join(self.relative_dir, target)
                )
            collected.add(target.replace('%20', ' '))
    return collected
|
|
|
|
def get_siteurl(self):
    """Return the ``localsiteurl`` entry of the context ('' when unset)."""
    shared_context = self._context
    return shared_context.get('localsiteurl', '')
|
|
|
|
@memoized
def get_content(self, siteurl):
    """Return the content with intrasite links rewritten for ``siteurl``.

    A ``_get_content`` hook on the instance, when present, supplies the
    raw content instead of the stored ``_content``.
    """
    raw = self._get_content() if hasattr(self, '_get_content') \
        else self._content
    return self._update_content(raw, siteurl)
|
|
|
|
@property
def content(self):
    """The content rendered against the current site URL."""
    siteurl = self.get_siteurl()
    return self.get_content(siteurl)
|
|
|
|
@memoized
def get_summary(self, siteurl):
    """Return the summary of this content.

    The ``summary`` metadata wins when present. Otherwise the content is
    returned whole when ``SUMMARY_MAX_LENGTH`` is None, or truncated to
    that many words with ``SUMMARY_END_SUFFIX`` appended.
    """
    if 'summary' in self.metadata:
        return self.metadata['summary']

    word_limit = self.settings['SUMMARY_MAX_LENGTH']
    if word_limit is None:
        return self.content

    return truncate_html_words(
        self.content, word_limit, self.settings['SUMMARY_END_SUFFIX'])
|
|
|
|
@property
def summary(self):
    siteurl = self.get_siteurl()
    return self.get_summary(siteurl)

def _get_summary(self):
    """Deprecated accessor for the summary; logs a warning on every call."""

    logger.warning('_get_summary() has been deprecated since 3.6.4. '
                   'Use the summary decorator instead')
    return self.summary

@summary.setter
def summary(self, value):
    """Accept and discard assignments to ``summary``."""
    return None
|
|
|
|
@property
def status(self):
    current = self._status
    return current

@status.setter
def status(self, value):
    # TODO maybe typecheck
    # The status is always stored lower-cased.
    normalised = value.lower()
    self._status = normalised
|
|
|
|
@property
def url(self):
    """The value of the 'url' setting (or its override) for this content."""
    setting_key = 'url'
    return self.get_url_setting(setting_key)
|
|
|
|
@property
def save_as(self):
    """The value of the 'save_as' setting (or its override)."""
    setting_key = 'save_as'
    return self.get_url_setting(setting_key)
|
|
|
|
def _get_template(self):
    """Return the explicitly set template, else ``default_template``."""
    explicit = getattr(self, 'template', None)
    if explicit is None:
        return self.default_template
    return explicit
|
|
|
|
def get_relative_source_path(self, source_path=None):
    """Return ``source_path`` relative to the content path (``PATH``).

    When ``source_path`` is falsy the source path of this content object
    is used; None is returned if that is None as well.
    """
    target = source_path or self.source_path
    if target is None:
        return None

    absolute_target = os.path.abspath(
        os.path.join(self.settings['PATH'], target))
    content_root = os.path.abspath(self.settings['PATH'])
    return posixize_path(os.path.relpath(absolute_target, content_root))
|
|
|
|
@property
def relative_dir(self):
    """Directory of the source file, relative to the content path."""
    source_file = os.path.abspath(self.source_path)
    content_root = os.path.abspath(self.settings['PATH'])
    relative_file = os.path.relpath(source_file, content_root)
    return posixize_path(os.path.dirname(relative_file))
|
|
|
|
def refresh_metadata_intersite_links(self):
    """Re-run link replacement on the metadata listed in FORMATTED_FIELDS.

    Each such field present in the metadata is updated both in
    ``self.metadata`` and as a (lower-cased) attribute. The summary is
    handled separately, going through ``_summary``.
    """
    for field in self.settings['FORMATTED_FIELDS']:
        if field == 'summary' or field not in self.metadata:
            continue
        refreshed = self._update_content(
            self.metadata[field],
            self.get_siteurl()
        )
        self.metadata[field] = refreshed
        setattr(self, field.lower(), refreshed)

    # _summary is an internal variable that some plugins may be writing to,
    # so ensure changes to it are picked up
    summary_is_formatted = 'summary' in self.settings['FORMATTED_FIELDS']
    if summary_is_formatted and 'summary' in self.metadata:
        self._summary = self._update_content(
            self._summary,
            self.get_siteurl()
        )
        self.metadata['summary'] = self._summary
|
|
|
|
|
|
class Page(Content):
    """Content with the 'page' template that only requires a title."""

    mandatory_properties = ('title',)
    allowed_statuses = ('published', 'hidden', 'draft')
    default_status = 'published'
    default_template = 'page'

    def _expand_settings(self, key):
        """Expand ``DRAFT_PAGE_<KEY>`` for drafts, ``PAGE_<KEY>`` otherwise."""
        if self.status == 'draft':
            settings_prefix = 'draft_page'
        else:
            # None lets the base class fall back to the class name.
            settings_prefix = None
        return super()._expand_settings(key, settings_prefix)
|
|
|
|
|
|
class Article(Content):
    """Content with the 'article' template; needs title, date and category."""

    mandatory_properties = ('title', 'date', 'category')
    allowed_statuses = ('published', 'draft')
    default_status = 'published'
    default_template = 'article'

    def __init__(self, *args, **kwargs):
        """Initialise the content, then apply the article date rules.

        Accepts the same arguments as ``Content.__init__``.
        """
        super().__init__(*args, **kwargs)

        # handle WITH_FUTURE_DATES (designate article to draft based on date)
        if not self.settings['WITH_FUTURE_DATES'] and hasattr(self, 'date'):
            # Compare like with like: naive dates against the naive local
            # time, aware dates against the current time in UTC.
            if self.date.tzinfo is None:
                now = datetime.datetime.now()
            else:
                # Aware "now" without the deprecated datetime.utcnow().
                now = datetime.datetime.now(pytz.utc)
            if self.date > now:
                self.status = 'draft'

        # if we are a draft and there is no date provided, set max datetime
        if not hasattr(self, 'date') and self.status == 'draft':
            self.date = datetime.datetime.max.replace(tzinfo=self.timezone)

    def _expand_settings(self, key):
        """Expand ``DRAFT_<KEY>`` for drafts, ``ARTICLE_<KEY>`` otherwise."""
        klass = 'draft' if self.status == 'draft' else 'article'
        return super()._expand_settings(key, klass)
|
|
|
|
|
|
class Static(Content):
    """Content for static files, whose output location may be relocated.

    The instance remembers whether its ``url`` or ``save_as`` has been
    read, so that ``attach_to`` never moves a file that something else
    may already point at.
    """

    mandatory_properties = ('title',)
    default_status = 'published'
    default_template = None

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Flipped to True by the ``url`` and ``save_as`` properties below.
        self._output_location_referenced = False

    @deprecated_attribute(old='filepath', new='source_path', since=(3, 2, 0))
    def filepath():
        return None

    @deprecated_attribute(old='src', new='source_path', since=(3, 2, 0))
    def src():
        return None

    @deprecated_attribute(old='dst', new='save_as', since=(3, 2, 0))
    def dst():
        return None

    @property
    def url(self):
        """The url of this file; reading it marks the location as used."""
        # Note when url has been referenced, so we can avoid overriding it.
        self._output_location_referenced = True
        return super().url

    @property
    def save_as(self):
        """The output path; reading it marks the location as used."""
        # Note when save_as has been referenced, so we can avoid overriding
        # it.
        self._output_location_referenced = True
        return super().save_as

    def attach_to(self, content):
        """Override our output directory with that of the given content object.

        Nothing is changed when an override already exists or when the
        output location has already been referenced; a warning is logged
        in those cases if the location would have been different.
        """

        # Work out our file's new output path relative to the linking
        # document. A file living beneath the linking document's source
        # directory keeps that relationship on output; any other file
        # becomes a sibling of the linking document.
        linker_dir = os.path.dirname(content.source_path)
        relative_tail = os.path.relpath(self.source_path, linker_dir)
        if relative_tail.startswith(os.pardir + os.sep):
            relative_tail = os.path.basename(relative_tail)
        target_save_as = os.path.join(
            os.path.dirname(content.save_as), relative_tail)

        # The new url is derived from the new save_as path rather than from
        # the linking document's url: looking at that url alone cannot tell
        # whether it designates the document itself or its parent directory.
        # ('some/content' may be a file 'content' inside 'some', or a
        # directory 'some/content' holding an 'index.html'.)
        target_url = path_to_url(target_save_as)

        if (hasattr(self, 'override_save_as')
                or hasattr(self, 'override_url')):
            # We never override an override, because we don't want to
            # interfere with user-defined overrides that might be in
            # EXTRA_PATH_METADATA.
            blocker = "its output location was already overridden"
        elif self._output_location_referenced:
            # We never change an output path that has already been
            # referenced, because we don't want to break links that depend
            # on that path.
            blocker = "another link already referenced its location"
        else:
            self.override_save_as = target_save_as
            self.override_url = target_url
            return

        # Relocation refused: only worth a warning if it changed anything.
        if target_save_as != self.save_as or target_url != self.url:
            logger.warning(
                "The {attach} link in %s cannot relocate "
                "%s because %s. Falling back to "
                "{filename} link behavior instead.",
                content.get_relative_source_path(),
                self.get_relative_source_path(), blocker,
                extra={'limit_msg': "More {attach} warnings silenced."})
|