from __future__ import unicode_literals
import codecs
from .labels import LABELS
VERSION = '0.5.1'
PYTHON_NAMES = {
'iso-8859-8-i': 'iso-8859-8',
'x-mac-cyrillic': 'mac-cyrillic',
'macintosh': 'mac-roman',
'windows-874': 'cp874'}
CACHE = {}
def ascii_lower(string):
return string.encode('utf8').lower().decode('utf8')
def lookup(label):
label = ascii_lower(label.strip('\t\n\f\r '))
name = LABELS.get(label)
if name is None:
return None
encoding = CACHE.get(name)
if encoding is None:
if name == 'x-user-defined':
from .x_user_defined import codec_info
else:
python_name = PYTHON_NAMES.get(name, name)
codec_info = codecs.lookup(python_name)
encoding = Encoding(name, codec_info)
CACHE[name] = encoding
return encoding
def _get_encoding(encoding_or_label):
if hasattr(encoding_or_label, 'codec_info'):
return encoding_or_label
encoding = lookup(encoding_or_label)
if encoding is None:
raise LookupError('Unknown encoding label: %r' % encoding_or_label)
return encoding
class Encoding(object):
def __init__(self, name, codec_info):
self.name = name
self.codec_info = codec_info
def __repr__(self):
return '<Encoding %s>' % self.name
UTF8 = lookup('utf-8')
_UTF16LE = lookup('utf-16le')
_UTF16BE = lookup('utf-16be')
def decode(input, fallback_encoding, errors='replace'):
fallback_encoding = _get_encoding(fallback_encoding)
bom_encoding, input = _detect_bom(input)
encoding = bom_encoding or fallback_encoding
return encoding.codec_info.decode(input, errors)[0], encoding
def _detect_bom(input):
if input.startswith(b'\xFF\xFE'):
return _UTF16LE, input[2:]
if input.startswith(b'\xFE\xFF'):
return _UTF16BE, input[2:]
if input.startswith(b'\xEF\xBB\xBF'):
return UTF8, input[3:]
return None, input
def encode(input, encoding=UTF8, errors='strict'):
return _get_encoding(encoding).codec_info.encode(input, errors)[0]
def iter_decode(input, fallback_encoding, errors='replace'):
decoder = IncrementalDecoder(fallback_encoding, errors)
generator = _iter_decode_generator(input, decoder)
encoding = next(generator)
return generator, encoding
def _iter_decode_generator(input, decoder):
decode = decoder.decode
input = iter(input)
for chunck in input:
output = decode(chunck)
if output:
assert decoder.encoding is not None
yield decoder.encoding
yield output
break
else:
output = decode(b'', final=True)
assert decoder.encoding is not None
yield decoder.encoding
if output:
yield output
return
for chunck in input:
output = decode(chunck)
if output:
yield output
output = decode(b'', final=True)
if output:
yield output
def iter_encode(input, encoding=UTF8, errors='strict'):
encode = IncrementalEncoder(encoding, errors).encode
return _iter_encode_generator(input, encode)
def _iter_encode_generator(input, encode):
for chunck in input:
output = encode(chunck)
if output:
yield output
output = encode('', final=True)
if output:
yield output
class IncrementalDecoder(object):
def __init__(self, fallback_encoding, errors='replace'):
self._fallback_encoding = _get_encoding(fallback_encoding)
self._errors = errors
self._buffer = b''
self._decoder = None
self.encoding = None
def decode(self, input, final=False):
decoder = self._decoder
if decoder is not None:
return decoder(input, final)
input = self._buffer + input
encoding, input = _detect_bom(input)
if encoding is None:
if len(input) < 3 and not final: self._buffer = input
return ''
else: encoding = self._fallback_encoding
decoder = encoding.codec_info.incrementaldecoder(self._errors).decode
self._decoder = decoder
self.encoding = encoding
return decoder(input, final)
class IncrementalEncoder(object):
def __init__(self, encoding=UTF8, errors='strict'):
encoding = _get_encoding(encoding)
self.encode = encoding.codec_info.incrementalencoder(errors).encode