diff options
Diffstat (limited to 'squirrel_3_0_1_stable/sqplus/SqPlusUtf8.cpp')
-rw-r--r-- | squirrel_3_0_1_stable/sqplus/SqPlusUtf8.cpp | 133 |
1 files changed, 133 insertions, 0 deletions
diff --git a/squirrel_3_0_1_stable/sqplus/SqPlusUtf8.cpp b/squirrel_3_0_1_stable/sqplus/SqPlusUtf8.cpp new file mode 100644 index 000000000..7505e9928 --- /dev/null +++ b/squirrel_3_0_1_stable/sqplus/SqPlusUtf8.cpp @@ -0,0 +1,133 @@ + +/////////////////////////////////////////////////////////////////////// +// Simple conversion routines, to, from UTF8 and full Unicode character +// (using int). +// + +// Only when needed +#if !defined(SQPLUS_USE_LATIN1) || SQPLUS_USE_LATIN1==0 + +static char g_utf8_length[256]; +static int g_did_init_length; + +void sqplus_init_utf8_lengths() { + // Fill in lengths in array above + for( int lb=0; lb<256; lb++ ){ + int l = -1; + if( !(lb&0x80) ) l=1; + else if( (lb&0xE0)==0xC0 ) l=2; + else if( (lb&0xF0)==0xE0 ) l=3; + else if( (lb&0xF8)==0xF0 ) l=4; + else if( (lb&0xFC)==0xF8 ) l=5; + else if( (lb&0xFE)==0xFC ) l=6; + g_utf8_length[lb] = l; + } + g_did_init_length = 1; +} + +// Length of a UTF8 encoded Unicode character +int sqplus_utf8_len(int lead_byte){ + if( !(lead_byte&0x80) ) return 1; // Special case, make faster + if( !g_did_init_length ) + sqplus_init_utf8_lengths(); + + return g_utf8_length[(unsigned char)lead_byte]; +} + +int sqplus_utf8_len_first(const char* pc){ + int lb = *(unsigned char*)pc; + if( !(lb&0x80) ) return 1; // Special case, make faster + if( !g_did_init_length ) + sqplus_init_utf8_lengths(); + + int l = g_utf8_length[(unsigned char)lb]; + if( l>0 ) return l; + + // Invalid UTF8 lead byte. Look for next valid character. + const char *pc1 = pc+1; + while( ((*pc1)&0xC0)==0x80 ) + pc1++; + return int(pc1 - pc); +} + + +// Length of a UTF8 encoded Unicode string (number of Unicode characters) +int sqplus_utf8_strlen(const char *str) { + if( !str ) return 0; + int l, tl=0; + while( *str ){ + l = sqplus_utf8_len_first(str); + str += l; + tl++; + } + return tl; +} + +// Convert one UTF8 encoded character to Unicode point +int sqplus_utf8_to_wchar(int *result, const char *string){ + int res=-1; + + // Assume argument pointers to be OK + unsigned char ch = *string; + int l = sqplus_utf8_len(ch); + + if( l<1 ) return -1; + int wc = l>1 ? (ch&(0x7F>>l)) : ch; + while( l>1 ){ + wc = (wc<<6) + (*++string & 0x3F); + l--; + } + *result = wc; + + return 0; +} + +// Convert one Unicode point to UTF8 encoded version. +// Checks if output fits in 1/4/6 bytes buffer. +int sqplus_wchar_to_utf8(char *s, int wc, int size){ + if( size<1 ) return -1; + if( (unsigned int)wc>=0x80000000 ) return -2; + + // Single byte case + if( wc<0x80 ){ + *s = (char)wc; + //*s = (char)wc&0x7F; + return 1; + } + if( size<4 ) return -3; + + // Two or more UTF8 bytes + int p = 1; // Index of last UTF8 byte + if( wc>0x7FF ){ // 11 bits + // Three or more UTF8 bytes + p++; // p>=2 + if( wc>0xFFFF ){ // 16 bits + // Four or more UTF8 bytes + p++; // p>=3 + if( wc>0x1FFFFF ){ // 21 bits + // Five or more UTF8 bytes + if( size<6 ) return -3; + p++; // p>=4 UTF8 bytes + if( wc>0x3FFFFFF ){ // 26 bits + // Six UTF8 bytes + p++; // p>=5 + if( (unsigned int)wc>(unsigned int)0x7FFFFFF ){ // 31 bits + // Would need 7 UTF8 bytes. Not supported. + return -10; + } + s[p-4] = 0x80 | ((wc>>24)&0x3F); // Bit 24..29 + } + s[p-3] = 0x80 | ((wc>>18)&0x3F); // Bit 18..23 + } + s[p-2] = 0x80 | ((wc>>12)&0x3F); // Bit 12..17 + } + s[p-1] = 0x80 | ((wc>>6)&0x3F); // Bit 6..11 + } + s[p] = 0x80 | (wc&0x3F); // Bit 0..5 + s[0] = (0xFC << (5-p)) | (wc>>(p*6)); + + return p+1; +} + +#endif // #if !defined(SQPLUS_USE_LATIN1) || SQPLUS_USE_LATIN1==0 + |