######################## BEGIN LICENSE BLOCK ########################
#
# Contributor(s):
# Jason Zavaglia
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
"""
This class simply looks for occurrences of zero bytes, and infers
whether the file is UTF16 or UTF32 (little-endian or big-endian).
For instance, files looking like ( \0 \0 \0 [nonzero] )+
have a good probability to be UTF32BE. Files looking like ( \0 [nonzero] )+
may be guessed to be UTF16BE, and inversely for little-endian varieties.
"""
# how many logical characters to scan before feeling confident of prediction
= 20
# a fixed constant ratio of expected zeros or non-zeros in modulo-position.
= 0.94
= 0
= * 4
= * 4
=
=
= False
= False
= False
= False
= False
= False
= 0
= * 4
= * 4
=
= False
= False
= False
= False
= False
= False
=
return
return
return
return
# default to something valid
return
return
return
return
=
return >= and
=
return >= and
=
return >= and
=
return >= and
"""
Validate if the quad of bytes is valid UTF-32.
UTF-32 is valid in the range 0x00000000 - 0x0010FFFF
excluding 0x0000D800 - 0x0000DFFF
https://en.wikipedia.org/wiki/UTF-32
"""
= True
= True
"""
Validate if the pair of bytes is valid UTF-16.
UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xDFFF
with an exception for surrogate pairs, which must be in the range
0xD800-0xDBFF followed by 0xDC00-0xDFFF
https://en.wikipedia.org/wiki/UTF-16
"""
= True
0xDC <= <= 0xDF:
= True
= False
= True
= True
0xDC <= <= 0xDF:
= True
= False
= True
= % 4
=
+= 1
+= 1
+= 1
return
# terminal, decided states
return
=
> 4 * 1024:
# if we get 4 KiB into the file and still can't conclude it's UTF,
# let's give up
=
return
return