[cffi-devel] A question about C and Unicode

23 Mar 2011

      Hi folks!

I am playing around with a C library and came across
some issues related to Unicode in C. In the Unicode-enabled
version of this C library, which implements UCS-2 (i.e. just the BMP),
the "Unicode character type" is defined in the header file
as follows:
======================================
/*
  * ZAB_CHAR
  * ZAB_UC
  */

/*
 * AIX only: record the wchar_t width class in exactly one of
 *   ZABonAIX_wchar_is_4B  - when (_AIX51 and ZABwith64_BIT) or ZABccQ is set
 *   ZABonAIX_wchar_is_2B  - in every other AIX configuration
 *
 * This is a single decision on purpose: running a second, less specific
 * copy of the test first would define _2B before the ZABccQ case gets a
 * chance to define _4B, leaving both macros set and making any later
 * defined(ZABonAIX_wchar_is_2B) check pick the 2-byte path regardless.
 */
#ifdef ZABonAIX
   #if (defined(_AIX51) && defined(ZABwith64_BIT)) || defined(ZABccQ)
     #define ZABonAIX_wchar_is_4B
   #else
     #define ZABonAIX_wchar_is_2B
   #endif
#endif

/*
 * Define exactly one of WCHAR_is_2B / WCHAR_is_4B.
 * WCHAR_is_2B is chosen for ZABonNT, ZABonOS400, ZABonOS390 without _LP64,
 * and ZABonAIX when ZABonAIX_wchar_is_2B is set; everything else gets
 * WCHAR_is_4B.
 */
#if    defined(ZABonNT) \
    || defined(ZABonOS400) \
    || (defined(ZABonOS390) && !defined(_LP64)) \
    || (defined(ZABonAIX) && defined(ZABonAIX_wchar_is_2B))
   #define WCHAR_is_2B
#else
   #define WCHAR_is_4B
#endif

/*
 * When both ZABonLIN and GCC_UTF16_PATCH are defined and the compiler
 * satisfies __GNUC_PREREQ(4,3), include <uchar.h> and define
 * ZAB_UC_is_char16.
 *
 * The version test is deliberately nested: it is only evaluated once the
 * outer condition holds, so builds without ZABonLIN/GCC_UTF16_PATCH never
 * touch __GNUC_PREREQ.
 * NOTE(review): __GNUC_PREREQ is presumably provided by glibc's
 * <features.h>; confirm it is already in scope at this point, otherwise
 * the inner #if does not parse.
 */
#if defined(ZABonLIN) && defined(GCC_UTF16_PATCH)
   #if __GNUC_PREREQ (4,3)
     #include <uchar.h>
     #define ZAB_UC_is_char16
   #endif
#endif

/*
 * Character types ZAB_CHAR and ZAB_UC (always the same underlying type).
 *
 *   ZABwithUNICODE defined:
 *     WCHAR_is_2B        -> wchar_t        (also defines ZAB_UC_is_wchar)
 *     ZAB_UC_is_char16   -> char16_t
 *     otherwise          -> unsigned short (also defines
 *                                           ZAB_UC_is_UTF16_without_wchar)
 *   ZABwithUNICODE not defined:
 *     char (also defines ZAB_UC_is_1B)
 */
#ifdef ZABwithUNICODE
   #ifdef WCHAR_is_2B
     #define ZAB_UC_is_wchar
     typedef wchar_t ZAB_CHAR;
   #elif defined(ZAB_UC_is_char16)
     typedef char16_t ZAB_CHAR;
   #else
     #define ZAB_UC_is_UTF16_without_wchar
     typedef unsigned short ZAB_CHAR;
   #endif
   /* ZAB_UC is the same type as ZAB_CHAR in every Unicode variant. */
   typedef ZAB_CHAR ZAB_UC;
#else  /* no ZABwithUNICODE: plain single-byte characters */
   #define ZAB_UC_is_1B
   typedef char ZAB_CHAR;
   typedef char ZAB_UC;
#endif /* ZABwithUNICODE or not */

/*
  * CFRSDKwith(out)UTF16_LITERALS
  * for CFR SDK applications: controls use of UTF-16
  * literal enabled compilers.
  */
#if defined(CFRSDKwithUTF16_LITERALS)
#elif defined(CFRSDKwithoutUTF16_LITERALS)
   #define ZABwithoutUTF16_LITERALS
#elif defined(WCHAR_is_2B) || \
     defined(ZABonHP_UX) || \
     (defined(ZABonLIN) && defined(__i386__) && defined(__GNUC__) && 
(__GNUC__<3)) || \
     (defined(ZABonLIN) && defined(GCC_UTF16_PATCH)) || \
     defined(ZABonSUN) || defined(ZABonAIX)
   /* we have literals for UTF-16 */
#else
   #define ZABwithoutUTF16_LITERALS
#endif
======================================

All this boils down to

                         +---------------------------+
                +------>-| typedef wchar_t ZAB_CHAR; |
                |        | typedef wchar_t ZAB_UC;   |
                |        +---------------------------+
                |
+---------+     |        +----------------------------+
| Unicode |-----+------>-| typedef char16_t ZAB_CHAR; |
+---------+     |        | typedef char16_t ZAB_UC;   |
                |        +----------------------------+
                |
                |        +----------------------------------+
                +------>-| typedef unsigned short ZAB_CHAR; |
                         | typedef unsigned short ZAB_UC;   |
                         +----------------------------------+

The question now is: is it correct (i.e. safe)
to defctype ZAB_UC simply as :uint16 and interpret
it as a UTF-16 code unit (with the appropriate endianness)?

How can types like wchar_t and char16_t be defined (or used) in CFFI?

Regards
Nik

nitralime

Luís Oliveira

tags

participants (2)