# coding: utf-8
"""
webencodings
~~~~~~~~~~~~
This is a Python implementation of the `WHATWG Encoding standard
<http://encoding.spec.whatwg.org/>`_. See README for details.
:copyright: Copyright 2012 by Simon Sapin
:license: BSD, see LICENSE for details.
"""
=
# Some names in Encoding are not valid Python aliases. Remap these.
=
=
r"""Transform (only) ASCII letters to lower case: A-Z is mapped to a-z.
:param string: A Unicode string.
:returns: A new Unicode string.
This is used for `ASCII case-insensitive
<http://encoding.spec.whatwg.org/#ascii-case-insensitive>`_
matching of encoding labels.
The same matching is also used, among other things,
for `CSS keywords <http://dev.w3.org/csswg/css-values/#keywords>`_.
This is different from the :meth:`~py:str.lower` method of Unicode strings
which also affects non-ASCII characters,
sometimes mapping them into the ASCII range:
>>> keyword = u'Bac\N{KELVIN SIGN}ground'
>>> assert keyword.lower() == u'background'
>>> assert ascii_lower(keyword) != keyword.lower()
>>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground'
"""
# This turns out to be faster than unicode.translate()
return
"""
Look for an encoding by its label.
This is the spec’s `get an encoding
<http://encoding.spec.whatwg.org/#concept-encoding-get>`_ algorithm.
Supported labels are listed there.
:param label: A string.
:returns:
An :class:`Encoding` object, or :obj:`None` for an unknown label.
"""
# Only strip ASCII whitespace: U+0009, U+000A, U+000C, U+000D, and U+0020.
=
=
return None
=
=
# Any python_name value that gets to here should be valid.
=
=
=
return
"""
Accept either an encoding object or label.
:param encoding: An :class:`Encoding` object or a label string.
:returns: An :class:`Encoding` object.
:raises: :exc:`~exceptions.LookupError` for an unknown label.
"""
return
=
return
"""Represents a character encoding such as UTF-8,
that can be used for decoding or encoding.
.. attribute:: name
Canonical name of the encoding
.. attribute:: codec_info
The actual implementation of the encoding,
a stdlib :class:`~codecs.CodecInfo` object.
See :func:`codecs.register`.
"""
=
=
return %
#: The UTF-8 encoding. Should be used for new content and formats.
=
=
=
"""
Decode a single string.
:param input: A byte string
:param fallback_encoding:
An :class:`Encoding` object or a label string.
The encoding to use if :obj:`input` does not have a BOM.
:param errors: Type of error handling. See :func:`codecs.register`.
:raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
:return:
A ``(output, encoding)`` tuple of a Unicode string
and an :obj:`Encoding`.
"""
# Fail early if `encoding` is an invalid label.
=
, =
= or
return ,
"""Return (bom_encoding, input), with any BOM removed from the input."""
return ,
return ,
return ,
return None,
"""
Encode a single string.
:param input: A Unicode string.
:param encoding: An :class:`Encoding` object or a label string.
:param errors: Type of error handling. See :func:`codecs.register`.
:raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
:return: A byte string.
"""
return
"""
"Pull"-based decoder.
:param input:
An iterable of byte strings.
The input is first consumed just enough to determine the encoding
based on the presence of a BOM,
then consumed on demand when the return value is iterated.
:param fallback_encoding:
An :class:`Encoding` object or a label string.
The encoding to use if :obj:`input` does not have a BOM.
:param errors: Type of error handling. See :func:`codecs.register`.
:raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
:returns:
An ``(output, encoding)`` tuple.
:obj:`output` is an iterable of Unicode strings,
:obj:`encoding` is the :obj:`Encoding` that is being used.
"""
=
=
=
return ,
"""Return a generator that first yields the :obj:`Encoding`,
then yields output chunks as Unicode strings.
"""
=
=
=
assert is not None
yield
yield
break
# Input exhausted without determining the encoding
=
assert is not None
yield
yield
return
=
yield
=
yield
"""
“Pull”-based encoder.
:param input: An iterable of Unicode strings.
:param encoding: An :class:`Encoding` object or a label string.
:param errors: Type of error handling. See :func:`codecs.register`.
:raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
:returns: An iterable of byte strings.
"""
# Fail early if `encoding` is an invalid label.
= .
return
=
yield
=
yield
"""
“Push”-based decoder.
:param fallback_encoding:
An :class:`Encoding` object or a label string.
The encoding to use if :obj:`input` does not have a BOM.
:param errors: Type of error handling. See :func:`codecs.register`.
:raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
"""
# Fail early if `encoding` is an invalid label.
=
=
= b
= None
#: The actual :class:`Encoding` that is being used,
#: or :obj:`None` if that is not determined yet.
#: (I.e. if there is not enough input yet to determine
#: if there is a BOM.)
= None # Not known yet.
"""Decode one chunk of the input.
:param input: A byte string.
:param final:
Indicate that no more input is available.
Must be :obj:`True` if this is the last call.
:returns: A Unicode string.
"""
=
return
= +
, =
# Not enough data yet.
=
return
# No BOM
=
= .
=
=
return
"""
“Push”-based encoder.
:param encoding: An :class:`Encoding` object or a label string.
:param errors: Type of error handling. See :func:`codecs.register`.
:raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
.. method:: encode(input, final=False)
:param input: A Unicode string.
:param final:
Indicate that no more input is available.
Must be :obj:`True` if this is the last call.
:returns: A byte string.
"""
=
= .