varia.website/venv/lib/python3.11/site-packages/webencodings/__init__.py

# coding: utf-8
"""

    webencodings
    ~~~~~~~~~~~~

    This is a Python implementation of the `WHATWG Encoding standard
    <http://encoding.spec.whatwg.org/>`. See README for details.

    :copyright: Copyright 2012 by Simon Sapin
    :license: BSD, see LICENSE for details.

"""

from __future__ import unicode_literals

import codecs

from .labels import LABELS


VERSION = '0.5.1'


# A handful of canonical names used by the Encoding standard are not known
# to Python's codec registry under that spelling.  Map each of them to the
# codec name that ``codecs.lookup()`` accepts.
PYTHON_NAMES = {
    'iso-8859-8-i': 'iso-8859-8',
    'x-mac-cyrillic': 'mac-cyrillic',
    'macintosh': 'mac-roman',
    'windows-874': 'cp874',
}

# Encoding objects that lookup() has already built, keyed by canonical name.
CACHE = {}


def ascii_lower(string):
    r"""Transform (only) ASCII letters to lower case: A-Z is mapped to a-z.

    :param string: An Unicode string.
    :returns: A new Unicode string.

    This is used for `ASCII case-insensitive
    <http://encoding.spec.whatwg.org/#ascii-case-insensitive>`_
    matching of encoding labels.
    The same matching is also used, among other things,
    for `CSS keywords <http://dev.w3.org/csswg/css-values/#keywords>`_.

    This is different from the :meth:`~py:str.lower` method of Unicode strings
    which also affect non-ASCII characters,
    sometimes mapping them into the ASCII range:

        >>> keyword = u'Bac\N{KELVIN SIGN}ground'
        >>> assert keyword.lower() == u'background'
        >>> assert ascii_lower(keyword) != keyword.lower()
        >>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground'

    Strings containing lone surrogate code points are accepted as well;
    the surrogates are passed through unchanged like any other
    non-ASCII character.

    """
    # Lower-casing the UTF-8 bytes only touches the ASCII range, because
    # every byte of a multi-byte UTF-8 sequence is >= 0x80.
    # This turns out to be faster than unicode.translate()
    try:
        return string.encode('utf8').lower().decode('utf8')
    except UnicodeEncodeError:
        # The strict UTF-8 codec of Python 3 refuses lone surrogates.
        # Round-trip them with 'surrogatepass' so that an odd label is
        # simply treated as unknown by callers instead of crashing here.
        lowered = string.encode('utf8', 'surrogatepass').lower()
        return lowered.decode('utf8', 'surrogatepass')


def lookup(label):
    """
    Find the encoding that a label refers to.
    This implements the `get an encoding
    <http://encoding.spec.whatwg.org/#concept-encoding-get>`_ algorithm
    of the Encoding standard, which also lists the supported labels.

    :param label: A string.
    :returns:
        An :class:`Encoding` object, or :obj:`None` for an unknown label.

    """
    # Strip ASCII whitespace only (U+0009, U+000A, U+000C, U+000D, U+0020),
    # then match the label ASCII case-insensitively.
    normalized = ascii_lower(label.strip('\t\n\f\r '))
    canonical = LABELS.get(normalized)
    if canonical is None:
        return None

    cached = CACHE.get(canonical)
    if cached is not None:
        return cached

    if canonical == 'x-user-defined':
        # Not a stdlib codec: the implementation ships with this package.
        from .x_user_defined import codec_info
    else:
        # Every name that reaches this point is expected to resolve to a
        # valid Python codec, possibly through the PYTHON_NAMES remapping.
        codec_info = codecs.lookup(PYTHON_NAMES.get(canonical, canonical))

    created = Encoding(canonical, codec_info)
    CACHE[canonical] = created
    return created


def _get_encoding(encoding_or_label):
    """
    Normalize an argument that may be an encoding object or a label.

    :param encoding_or_label:
        An :class:`Encoding` object or a label string.
    :returns: An :class:`Encoding` object.
    :raises: :exc:`~exceptions.LookupError` for an unknown label.

    """
    # Duck typing: anything that exposes ``codec_info`` is taken to be an
    # Encoding already; everything else is looked up as a label.
    if not hasattr(encoding_or_label, 'codec_info'):
        found = lookup(encoding_or_label)
        if found is None:
            raise LookupError('Unknown encoding label: %r' % encoding_or_label)
        return found

    return encoding_or_label


class Encoding(object):
    """Represents a character encoding such as UTF-8,
    that can be used for decoding or encoding.

    .. attribute:: name

        Canonical name of the encoding

    .. attribute:: codec_info

        The actual implementation of the encoding,
        a stdlib :class:`~codecs.CodecInfo` object.
        See :func:`codecs.register`.

    """
    def __init__(self, name, codec_info):
        # Plain value holder: both arguments are stored as given.
        self.codec_info = codec_info
        self.name = name

    def __repr__(self):
        template = '<Encoding %s>'
        return template % self.name


#: The UTF-8 encoding. Should be used for new content and formats.
UTF8 = lookup('utf-8')

# The UTF-16 encodings (little- and big-endian) that _detect_bom() returns
# when the input starts with the corresponding byte order mark.
_UTF16LE = lookup('utf-16le')
_UTF16BE = lookup('utf-16be')


def decode(input, fallback_encoding, errors='replace'):
    """
    Decode a single byte string, honouring a leading BOM if there is one.

    :param input: A byte string
    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :return:
        A ``(output, encoding)`` tuple of an Unicode string
        and an :obj:`Encoding`.

    """
    # Resolve the fallback before looking at the data, so that an invalid
    # label is reported even when a BOM would have made it unnecessary.
    fallback = _get_encoding(fallback_encoding)
    detected, payload = _detect_bom(input)
    # A BOM, when present, takes precedence over the fallback encoding.
    chosen = detected if detected else fallback
    decoded = chosen.codec_info.decode(payload, errors)
    return decoded[0], chosen


def _detect_bom(input):
    """Return (bom_encoding, input), with any BOM removed from the input.

    ``bom_encoding`` is :obj:`None` when the input does not start with a
    UTF-8 or UTF-16 byte order mark; the input is then returned unchanged.

    """
    # The three marks cannot be prefixes of one another, so the order in
    # which they are tested does not matter.
    if input.startswith(b'\xEF\xBB\xBF'):
        bom_encoding, bom_length = UTF8, 3
    elif input.startswith(b'\xFF\xFE'):
        bom_encoding, bom_length = _UTF16LE, 2
    elif input.startswith(b'\xFE\xFF'):
        bom_encoding, bom_length = _UTF16BE, 2
    else:
        return None, input
    return bom_encoding, input[bom_length:]


def encode(input, encoding=UTF8, errors='strict'):
    """
    Encode a single Unicode string to bytes.

    :param input: An Unicode string.
    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :return: A byte string.

    """
    codec_info = _get_encoding(encoding).codec_info
    # The codec returns an (output, length consumed) pair; only the
    # output is of interest here.
    encoded = codec_info.encode(input, errors)
    return encoded[0]


def iter_decode(input, fallback_encoding, errors='replace'):
    """
    "Pull"-based decoder.

    :param input:
        An iterable of byte strings.

        The input is first consumed just enough to determine the encoding
        based on the presence of a BOM,
        then consumed on demand when the return value is.
    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :returns:
        An ``(output, encoding)`` tuple.
        :obj:`output` is an iterable of Unicode strings,
        :obj:`encoding` is the :obj:`Encoding` that is being used.

    """
    chunks = _iter_decode_generator(
        input, IncrementalDecoder(fallback_encoding, errors))
    # The first item produced by the generator is the Encoding in use;
    # take it off here so that the caller only sees decoded text.
    detected = next(chunks)
    return chunks, detected


def _iter_decode_generator(input, decoder):
    """Return a generator that first yields the :obj:`Encoding`,
    then yields output chunks as Unicode strings.

    :param input: An iterable of byte strings.
    :param decoder:
        An object with a ``decode(bytes, final=False)`` method and an
        ``encoding`` attribute, such as :class:`IncrementalDecoder`.

    """
    feed = decoder.decode
    chunks = iter(input)

    # Phase 1: feed input until the decoder produces some text, which
    # implies that it has settled on an encoding.
    for chunk in chunks:
        first_text = feed(chunk)
        if first_text:
            break
    else:
        # Input exhausted without any output: flushing the decoder forces
        # it to pick an encoding.
        first_text = feed(b'', final=True)
        assert decoder.encoding is not None
        yield decoder.encoding
        if first_text:
            yield first_text
        return

    assert decoder.encoding is not None
    yield decoder.encoding
    yield first_text

    # Phase 2: stream the rest of the input, skipping empty results.
    for chunk in chunks:
        text = feed(chunk)
        if text:
            yield text
    remainder = feed(b'', final=True)
    if remainder:
        yield remainder


def iter_encode(input, encoding=UTF8, errors='strict'):
    """
    "Pull"-based encoder.

    :param input: An iterable of Unicode strings.
    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :returns: An iterable of byte strings.

    """
    # Build the encoder eagerly, outside of the generator, so that an
    # invalid label fails right away rather than on first iteration.
    incremental = IncrementalEncoder(encoding, errors)
    return _iter_encode_generator(input, incremental.encode)


def _iter_encode_generator(input, encode):
    """Yield the non-empty byte strings obtained by encoding each chunk.

    :param input: An iterable of Unicode strings.
    :param encode:
        A callable with the signature ``encode(string, final=False)``.

    """
    for chunk in input:
        data = encode(chunk)
        if not data:
            continue
        yield data
    # Flush whatever the encoder may still be holding back.
    trailing = encode('', final=True)
    if trailing:
        yield trailing


class IncrementalDecoder(object):
    """
    "Push"-based decoder.

    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.

    """
    def __init__(self, fallback_encoding, errors='replace'):
        # Resolve the label right away so that an invalid one fails early.
        self._fallback_encoding = _get_encoding(fallback_encoding)
        self._errors = errors
        # Bytes held back while it is still undecided whether the input
        # starts with a BOM.
        self._buffer = b''
        # Bound ``decode`` method of the stdlib incremental decoder,
        # set once the encoding has been determined.
        self._decoder = None
        #: The actual :class:`Encoding` that is being used,
        #: or :obj:`None` if that is not determined yet.
        #: (Ie. if there is not enough input yet to determine
        #: if there is a BOM.)
        self.encoding = None  # Not known yet.

    def decode(self, input, final=False):
        """Decode one chunk of the input.

        :param input: A byte string.
        :param final:
            Indicate that no more input is available.
            Must be :obj:`True` if this is the last call.
        :returns: An Unicode string.

        """
        # Fast path: the encoding is settled, delegate directly.
        if self._decoder is not None:
            return self._decoder(input, final)

        pending = self._buffer + input
        encoding, pending = _detect_bom(pending)
        if encoding is None:
            if not final and len(pending) < 3:
                # Too short to rule out a BOM: keep the bytes for later.
                self._buffer = pending
                return ''
            # No BOM: use the encoding given by the caller.
            encoding = self._fallback_encoding

        incremental = encoding.codec_info.incrementaldecoder(self._errors)
        bound_decode = incremental.decode
        self._decoder = bound_decode
        self.encoding = encoding
        return bound_decode(pending, final)


class IncrementalEncoder(object):
    """
    "Push"-based encoder.

    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.

    .. method:: encode(input, final=False)

        :param input: An Unicode string.
        :param final:
            Indicate that no more input is available.
            Must be :obj:`True` if this is the last call.
        :returns: A byte string.

    """
    def __init__(self, encoding=UTF8, errors='strict'):
        codec_info = _get_encoding(encoding).codec_info
        # Expose the bound method of the stdlib incremental encoder as
        # this object's ``encode``; no wrapper method is needed.
        incremental = codec_info.incrementalencoder(errors)
        self.encode = incremental.encode
added declarations 3 days ago			`# coding: utf-8`
			`"""`

			`webencodings`
			`~~~~~~~~~~~~`

			This is a Python implementation of the `WHATWG Encoding standard
			<http://encoding.spec.whatwg.org/>`. See README for details.

			`:copyright: Copyright 2012 by Simon Sapin`
			`:license: BSD, see LICENSE for details.`

			`"""`

			`from __future__ import unicode_literals`

			`import codecs`

			`from .labels import LABELS`


			`VERSION = '0.5.1'`


			`# Some names in Encoding are not valid Python aliases. Remap these.`
			`PYTHON_NAMES = {`
			`'iso-8859-8-i': 'iso-8859-8',`
			`'x-mac-cyrillic': 'mac-cyrillic',`
			`'macintosh': 'mac-roman',`
			`'windows-874': 'cp874'}`

			`CACHE = {}`


			`def ascii_lower(string):`
			`r"""Transform (only) ASCII letters to lower case: A-Z is mapped to a-z.`

			`:param string: An Unicode string.`
			`:returns: A new Unicode string.`

			This is used for `ASCII case-insensitive
			<http://encoding.spec.whatwg.org/#ascii-case-insensitive>`_
			`matching of encoding labels.`
			`The same matching is also used, among other things,`
			for `CSS keywords <http://dev.w3.org/csswg/css-values/#keywords>`_.

			This is different from the :meth:`~py:str.lower` method of Unicode strings
			`which also affect non-ASCII characters,`
			`sometimes mapping them into the ASCII range:`

			`>>> keyword = u'Bac\N{KELVIN SIGN}ground'`
			`>>> assert keyword.lower() == u'background'`
			`>>> assert ascii_lower(keyword) != keyword.lower()`
			`>>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground'`

			`"""`
			`# This turns out to be faster than unicode.translate()`
			`return string.encode('utf8').lower().decode('utf8')`


			`def lookup(label):`
			`"""`
			`Look for an encoding by its label.`
			This is the spec’s `get an encoding
			<http://encoding.spec.whatwg.org/#concept-encoding-get>`_ algorithm.
			`Supported labels are listed there.`

			`:param label: A string.`
			`:returns:`
			An :class:`Encoding` object, or :obj:`None` for an unknown label.

			`"""`
			`# Only strip ASCII whitespace: U+0009, U+000A, U+000C, U+000D, and U+0020.`
			`label = ascii_lower(label.strip('\t\n\f\r '))`
			`name = LABELS.get(label)`
			`if name is None:`
			`return None`
			`encoding = CACHE.get(name)`
			`if encoding is None:`
			`if name == 'x-user-defined':`
			`from .x_user_defined import codec_info`
			`else:`
			`python_name = PYTHON_NAMES.get(name, name)`
			`# Any python_name value that gets to here should be valid.`
			`codec_info = codecs.lookup(python_name)`
			`encoding = Encoding(name, codec_info)`
			`CACHE[name] = encoding`
			`return encoding`


			`def _get_encoding(encoding_or_label):`
			`"""`
			`Accept either an encoding object or label.`

			:param encoding: An :class:`Encoding` object or a label string.
			:returns: An :class:`Encoding` object.
			:raises: :exc:`~exceptions.LookupError` for an unknown label.

			`"""`
			`if hasattr(encoding_or_label, 'codec_info'):`
			`return encoding_or_label`

			`encoding = lookup(encoding_or_label)`
			`if encoding is None:`
			`raise LookupError('Unknown encoding label: %r' % encoding_or_label)`
			`return encoding`


			`class Encoding(object):`
			`"""Reresents a character encoding such as UTF-8,`
			`that can be used for decoding or encoding.`

			`.. attribute:: name`

			`Canonical name of the encoding`

			`.. attribute:: codec_info`

			`The actual implementation of the encoding,`
			a stdlib :class:`~codecs.CodecInfo` object.
			See :func:`codecs.register`.

			`"""`
			`def __init__(self, name, codec_info):`
			`self.name = name`
			`self.codec_info = codec_info`

			`def __repr__(self):`
			`return '<Encoding %s>' % self.name`


			`#: The UTF-8 encoding. Should be used for new content and formats.`
			`UTF8 = lookup('utf-8')`

			`_UTF16LE = lookup('utf-16le')`
			`_UTF16BE = lookup('utf-16be')`


			`def decode(input, fallback_encoding, errors='replace'):`
			`"""`
			`Decode a single string.`

			`:param input: A byte string`
			`:param fallback_encoding:`
			An :class:`Encoding` object or a label string.
			The encoding to use if :obj:`input` does note have a BOM.
			:param errors: Type of error handling. See :func:`codecs.register`.
			:raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
			`:return:`
			A ``(output, encoding)`` tuple of an Unicode string
			and an :obj:`Encoding`.

			`"""`
			# Fail early if `encoding` is an invalid label.
			`fallback_encoding = _get_encoding(fallback_encoding)`
			`bom_encoding, input = _detect_bom(input)`
			`encoding = bom_encoding or fallback_encoding`
			`return encoding.codec_info.decode(input, errors)[0], encoding`


			`def _detect_bom(input):`
			`"""Return (bom_encoding, input), with any BOM removed from the input."""`
			`if input.startswith(b'\xFF\xFE'):`
			`return _UTF16LE, input[2:]`
			`if input.startswith(b'\xFE\xFF'):`
			`return _UTF16BE, input[2:]`
			`if input.startswith(b'\xEF\xBB\xBF'):`
			`return UTF8, input[3:]`
			`return None, input`


			`def encode(input, encoding=UTF8, errors='strict'):`
			`"""`
			`Encode a single string.`

			`:param input: An Unicode string.`
			:param encoding: An :class:`Encoding` object or a label string.
			:param errors: Type of error handling. See :func:`codecs.register`.
			:raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
			`:return: A byte string.`

			`"""`
			`return _get_encoding(encoding).codec_info.encode(input, errors)[0]`


			`def iter_decode(input, fallback_encoding, errors='replace'):`
			`"""`
			`"Pull"-based decoder.`

			`:param input:`
			`An iterable of byte strings.`

			`The input is first consumed just enough to determine the encoding`
			`based on the precense of a BOM,`
			`then consumed on demand when the return value is.`
			`:param fallback_encoding:`
			An :class:`Encoding` object or a label string.
			The encoding to use if :obj:`input` does note have a BOM.
			:param errors: Type of error handling. See :func:`codecs.register`.
			:raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
			`:returns:`
			An ``(output, encoding)`` tuple.
			:obj:`output` is an iterable of Unicode strings,
			:obj:`encoding` is the :obj:`Encoding` that is being used.

			`"""`

			`decoder = IncrementalDecoder(fallback_encoding, errors)`
			`generator = _iter_decode_generator(input, decoder)`
			`encoding = next(generator)`
			`return generator, encoding`


			`def _iter_decode_generator(input, decoder):`
			"""Return a generator that first yields the :obj:`Encoding`,
			`then yields output chukns as Unicode strings.`

			`"""`
			`decode = decoder.decode`
			`input = iter(input)`
			`for chunck in input:`
			`output = decode(chunck)`
			`if output:`
			`assert decoder.encoding is not None`
			`yield decoder.encoding`
			`yield output`
			`break`
			`else:`
			`# Input exhausted without determining the encoding`
			`output = decode(b'', final=True)`
			`assert decoder.encoding is not None`
			`yield decoder.encoding`
			`if output:`
			`yield output`
			`return`

			`for chunck in input:`
			`output = decode(chunck)`
			`if output:`
			`yield output`
			`output = decode(b'', final=True)`
			`if output:`
			`yield output`


			`def iter_encode(input, encoding=UTF8, errors='strict'):`
			`"""`
			`“Pull”-based encoder.`

			`:param input: An iterable of Unicode strings.`
			:param encoding: An :class:`Encoding` object or a label string.
			:param errors: Type of error handling. See :func:`codecs.register`.
			:raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
			`:returns: An iterable of byte strings.`

			`"""`
			# Fail early if `encoding` is an invalid label.
			`encode = IncrementalEncoder(encoding, errors).encode`
			`return _iter_encode_generator(input, encode)`


			`def _iter_encode_generator(input, encode):`
			`for chunck in input:`
			`output = encode(chunck)`
			`if output:`
			`yield output`
			`output = encode('', final=True)`
			`if output:`
			`yield output`


			`class IncrementalDecoder(object):`
			`"""`
			`“Push”-based decoder.`

			`:param fallback_encoding:`
			An :class:`Encoding` object or a label string.
			The encoding to use if :obj:`input` does note have a BOM.
			:param errors: Type of error handling. See :func:`codecs.register`.
			:raises: :exc:`~exceptions.LookupError` for an unknown encoding label.

			`"""`
			`def __init__(self, fallback_encoding, errors='replace'):`
			# Fail early if `encoding` is an invalid label.
			`self._fallback_encoding = _get_encoding(fallback_encoding)`
			`self._errors = errors`
			`self._buffer = b''`
			`self._decoder = None`
			#: The actual :class:`Encoding` that is being used,
			#: or :obj:`None` if that is not determined yet.
			`#: (Ie. if there is not enough input yet to determine`
			`#: if there is a BOM.)`
			`self.encoding = None # Not known yet.`

			`def decode(self, input, final=False):`
			`"""Decode one chunk of the input.`

			`:param input: A byte string.`
			`:param final:`
			`Indicate that no more input is available.`
			Must be :obj:`True` if this is the last call.
			`:returns: An Unicode string.`

			`"""`
			`decoder = self._decoder`
			`if decoder is not None:`
			`return decoder(input, final)`

			`input = self._buffer + input`
			`encoding, input = _detect_bom(input)`
			`if encoding is None:`
			`if len(input) < 3 and not final: # Not enough data yet.`
			`self._buffer = input`
			`return ''`
			`else: # No BOM`
			`encoding = self._fallback_encoding`
			`decoder = encoding.codec_info.incrementaldecoder(self._errors).decode`
			`self._decoder = decoder`
			`self.encoding = encoding`
			`return decoder(input, final)`


			`class IncrementalEncoder(object):`
			`"""`
			`“Push”-based encoder.`

			:param encoding: An :class:`Encoding` object or a label string.
			:param errors: Type of error handling. See :func:`codecs.register`.
			:raises: :exc:`~exceptions.LookupError` for an unknown encoding label.

			`.. method:: encode(input, final=False)`

			`:param input: An Unicode string.`
			`:param final:`
			`Indicate that no more input is available.`
			Must be :obj:`True` if this is the last call.
			`:returns: A byte string.`

			`"""`
			`def __init__(self, encoding=UTF8, errors='strict'):`
			`encoding = _get_encoding(encoding)`
			`self.encode = encoding.codec_info.incrementalencoder(errors).encode`