/**
 * @file
 * $Id$
 * $Revision$
 * $Author$
 * $Date$
 *
 * This file is part of The iWear Framework.
 *
 * The iWear Framework is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by the
 * Free Software Foundation as in version 2 of the License.
 *
 * The iWear Framework is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * The iWear Framework; if not, write to the Free Software Foundation, Inc., 59
 * Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#ifndef __IWEAR_UNICODE_H
#define __IWEAR_UNICODE_H

#include <ctype.h>
#include <stdint.h>
#include <wchar.h>
#include <wctype.h>

/**
 * @file This file contains some functions for unicode/utf8 support.
 *
 * The functions are declared `static inline` so that the header can be
 * included from both C and C++ translation units without needing a separate
 * out-of-line definition.
 */

/**
 * Converts the UTF-8 sequence starting at @p p into its 32 bit UCS code point.
 *
 * Decoding follows the original ISO 10646 scheme, i.e. sequences of up to six
 * bytes are accepted, @see
 * http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-8.html
 *
 * Every byte after the lead byte must be a continuation byte (10xxxxxx). The
 * terminating '\0' is not a continuation byte, so a sequence that is cut off
 * by the end of the string is rejected and nothing behind the terminator is
 * ever read. Overlong encodings are not rejected.
 *
 * @param p Pointer to the first byte of the sequence; must not be NULL and
 *          must point into a '\0' terminated string.
 * @param n Receives the number of bytes the sequence occupies (1..6), or 0 if
 *          the sequence is invalid or truncated. May be NULL if the caller is
 *          not interested in the length.
 * @return The decoded code point, or 0 if the sequence is invalid. Note that
 *         0 is also the valid result for the '\0' byte itself; the two cases
 *         are told apart by the value stored in @p n (1 versus 0).
 */
static inline int32_t utf8_2_ucs( const char* p, int* n )
{
    int unused_len;
    if( ! n )
    {
        // Lets the code below store the length unconditionally.
        n = &unused_len;
    }

    const unsigned char lead = (unsigned char) *p;

    if( lead <= 0x7F )
    {
        // Plain ASCII, including the terminating '\0'.
        *n = 1;
        return lead;
    }

    int len;
    uint32_t ucs;

    // The lead byte announces the sequence length and carries the topmost
    // payload bits.
    if( lead >= 0xC0 && lead <= 0xDF )
    {
        len = 2;
        ucs = lead & 0x1Fu;
    }
    else if( lead >= 0xE0 && lead <= 0xEF )
    {
        len = 3;
        ucs = lead & 0x0Fu;
    }
    else if( lead >= 0xF0 && lead <= 0xF7 )
    {
        len = 4;
        ucs = lead & 0x07u;
    }
    else if( lead >= 0xF8 && lead <= 0xFB )
    {
        len = 5;
        ucs = lead & 0x03u;
    }
    else if( lead >= 0xFC && lead <= 0xFD )
    {
        len = 6;
        ucs = lead & 0x01u;
    }
    else
    {
        // 0x80..0xBF is a continuation byte without a lead byte,
        // 0xFE and 0xFF never occur in UTF-8.
        *n = 0;
        return 0;
    }

    for( int i = 1; i < len; ++i )
    {
        const unsigned char c = (unsigned char) p[i];
        if( ( c & 0xC0 ) != 0x80 )
        {
            // Not of the form 10xxxxxx: malformed, or the string ended early.
            *n = 0;
            return 0;
        }
        ucs = ( ucs << 6 ) | ( c & 0x3Fu );
    }

    // At most 31 payload bits were collected, so the value fits into int32_t.
    *n = len;
    return (int32_t) ucs;
}

/**
 * Tests whether the UTF-8 encoded character at @p p is alphabetic.
 *
 * ASCII characters are classified with isalpha(); all other characters are
 * decoded and their code point is classified with iswalpha(), which follows
 * the LC_CTYPE category of the current locale. In the default "C" locale an
 * implementation may therefore report every non-ASCII character as not
 * alphabetic. Code points that cannot be represented as a wchar_t are
 * reported as not alphabetic.
 *
 * @param p Pointer to the first byte of the character; must not be NULL and
 *          must point into a '\0' terminated string.
 * @return <0 if the character is not an alpha or >0 if it is. The absolute
 *         value of the return denotes the number of bytes in the multibyte
 *         character. It returns 0 if the character sequence is invalid
 *         multibyte and therefore could not be properly tested.
 */
static inline int isalpha_utf8( const char* p )
{
    int n;
    const int32_t ucs = utf8_2_ucs( p, &n );

    if( n == 0 )
    {
        return 0;
    }

    int alpha = 0;
    if( n == 1 )
    {
        // Single byte means 0..0x7F, which is inside the domain of isalpha().
        alpha = isalpha( (int) ucs );
    }
    else if( (uint32_t) ucs <= (uint32_t) WCHAR_MAX )
    {
        // iswalpha() is only defined for values representable as wchar_t.
        alpha = iswalpha( (wint_t) ucs );
    }

    return alpha ? n : -n;
}

#endif