#!/usr/bin/env python
"""
Unicode case folding database conversion utility
Parses the database and generates a C++ function which implements the case
folding algorithm. The database entries are of the form:
<code>; <status>; <mapping>; # <name>
<status> can be one of four characters:
C - Common mappings
S - mappings for Simple case folding
F - mappings for Full case folding
T - special case for Turkish I characters
Right now this generates a function which implements simple case folding (C+S
entries).
"""
# This variable will hold the body of the mappings function.
=
# Reads file line-by-line, extracts Common and Simple case fold mappings and
# yields a (from_char, to_char, from_name) tuple for each one.
= -1
=
=
continue
=
=
=
yield , ,
=
# Computes the shift (to_char - from_char) in a mapping.
return -
# Computes the stride (from_char2 - from_char1) of two mappings.
return -
# Computes the stride of a list of mappings. The list should have at least two
# mappings. All mappings in the list are assumed to have the same stride.
return
# b is a list of mappings. All the mappings are assumed to have the same
# shift and the stride between adjacent mappings (if any) is constant.
global
# Special case for handling blocks of length 1. We don't even need to
# emit the "if (C < X) return C" check below as all characters in this
# range will be caught by the "C < X" check emitted by the first
# non-trivial block.
+=
return
=
= + *
= %
# All characters before this block map to themselves.
+=
+=
# Generic pattern: check upper bound (lower bound is checked by the "if"
# above) and modulo of C, return C+shift.
=
# Special case:
# We can elide the modulo-check because the expression "C|1" will map
# the intervening characters to themselves.
=
== 1:
# Another special case: X % 1 is always zero, so don't emit the
# modulo-check.
=
+=
=
=
continue
# Incompatible shift, start a new block.
=
continue
continue
# Incompatible stride, start a new block.
=