//
================================================================================================================= // xs_UTF8Decode //
================================================================================================================= int32 xs_UTF8Decode (uint32 &code, const xs_utf8* str, int32 len, bool strict) { #define xs_U8(p,sh) ((((xs_utf32)(str[p]))&0x3f)<<(sh)) //bits from each "pair" byte #define xs_UErr8(p) ((p)>=len
|| str[p]==0 || (strict&&(((str[p])&0xC0)!=0x80))) //UTF-8 valid
"pair" byte if (str==0||len==0) {code
= xs_UTF_Replace; return 0;} if ((*str&0x80)==0) {code = str[0]; return 1;} if (xs_UErr8(1)) {code
= xs_UTF_Replace; return strict ? 0 : 1;} //error if ((*str&0xe0)==0xc0) {code = ((str[0]&0x1f)<<6) + xs_U8(1, 0); return
2;} if (xs_UErr8(2)) {code
= xs_UTF_Replace; return strict ? 0 : 2;} //error if ((*str&0xf0)==0xe0) {code = ((str[0]&0x0f)<<12) + xs_U8(1, 6) + xs_U8(2, 0); return
3;} if (xs_UErr8(3)) {code
= xs_UTF_Replace; return strict ? 0 : 3;} //error if ((*str&0xf8)==0xf0) {code = ((str[0]&0x07)<<18) + xs_U8(1, 12) +
xs_U8(2, 6) + xs_U8(3, 0); return 4;} /* //illegal
in Unicode v3.2 if
(xs_UErr8(4)) {code =
xs_UTF_Replace; return 0;} //error if
((*str&0xfc)==0xf8) {code =
((str[0]&0x03)<<24) + xs_U8(1, 18) + xs_U8(2, 12) + xs_U8(3,
6) + xs_U8(4, 0); return 5;} if
(xs_UErr8(5)) {code =
xs_UTF_Replace; return 0;} //error if
((*str&0xfe)==0xfc) {code =
((str[0]&0x01)<<30) + xs_U8(1, 24) + xs_U8(2, 18) + xs_U8(3, 12) + xs_U8(4,
6) + xs_U8(5, 0); return 6;} */ // error code =
xs_UTF_Replace; return strict ? 0 : 1; } //
// =================================================================================================================
// UTF-8 Character encoding
// =================================================================================================================
//
// 0x00000000 - 0x0000007F: 0xxxxxxx
// 0x00000080 - 0x000007FF: 110xxxxx 10xxxxxx
// 0x00000800 - 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
// 0x00010000 - 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
// 0x00200000 - 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx            //illegal in v3.2
// 0x04000000 - 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx   //illegal in v3.2
//
// =================================================================================================================