(Unicode) UTF-8与UTF-16之间转换


比如:刚开始的时候就有ASCII字符集(American Standard Code for Information Interchange, 美国信息交换标准码)它使用7 bits来表示一个字符,
 2、英文字母再加一些其他标点字符之类的也不会超过256个,一个字节表示足够了。但其他一些文字不止这么多 ,像汉字就上万个,
为了适应全球化的发展,便于不同语言之间的兼容交互,而ASCII不再能胜任此任务了。所以就出现了Unicode和ISO这样的组织来统一制定一个标准,任何一个字符只对应一个确定的数字。ISO取的名字叫UCS(Universal Character Set)(ucs-2对应utf-16,ucs-4对应utf-32),Unicode取的名字就叫unicode了。
 1、Unicode第一个版本涉及到两个步骤:首先定义一个规范,给所有的字符指定一个唯一对应的数字,Unicode是用0至65535(共2的16次方,即65536个)之间的数字来表示所有字符,其中0至127这128个数字表示的字符仍然跟ASCII完全一样;其次是怎么把字符对应的数字(0至65535)转化成01串保存在计算机中。在保存时就涉及到了在计算机中占多少字节空间,于是就有了不同的保存方式,即UTF(Unicode Transformation Format):UTF-8和UTF-16。

 2、UTF-16表示"汉":比较简单,就是01101100   01001001(共16 bit,两个字节),程序解析的时候知道是UTF-16就把两个字节当成一个单元来解析。
   110xxxxx  10xxxxxx:如果是这样的格式,则把两个字节当一个单元;
   1110xxxx 10xxxxxx 10xxxxxx:如果是这样的格式,则把三个字节当一个单元。
 6、由于"汉"的编码27721大于2048,所以两个字节还不够,要用1110xxxx 10xxxxxx 10xxxxxx这种格式,把27721对应的二进制从左到右依次填入x所在的位置(实际上不一定从左到右,也可以从右到左)。




#include <stdio.h>
#include <string.h>
#include "utf.h"
static boolean isLegalUTF8(const UTF8 *source, int length)
    UTF8 a;
    const UTF8 *srcptr = NULL;
    if (NULL == source){
        printf("ERR, isLegalUTF8: source=%p\n", source);
        return FALSE;
    srcptr = source+length;

    switch (length) {
			printf("ERR, isLegalUTF8 1: length=%d\n", length);
			return FALSE;
		/* Everything else falls through when "TRUE"... */
		case 4:
			if ((a = (*--srcptr)) < 0x80 || a > 0xBF){
				printf("ERR, isLegalUTF8 2: length=%d, a=%x\n", length, a);
				return FALSE;
		case 3:
			if ((a = (*--srcptr)) < 0x80 || a > 0xBF){
				printf("ERR, isLegalUTF8 3: length=%d, a=%x\n", length, a);
				return FALSE;
		case 2: 
			if ((a = (*--srcptr)) > 0xBF){
				printf("ERR, isLegalUTF8 4: length=%d, a=%x\n", length, a);
				return FALSE;
			switch (*source)
				/* no fall-through in this inner switch */
				case 0xE0: 
					if (a < 0xA0){
						printf("ERR, isLegalUTF8 1: source=%x, a=%x\n", *source, a);
						return FALSE; 
				case 0xED:
					if (a > 0x9F){
						printf("ERR, isLegalUTF8 2: source=%x, a=%x\n", *source, a);
						return FALSE; 
				case 0xF0:
					if (a < 0x90){
						printf("ERR, isLegalUTF8 3: source=%x, a=%x\n", *source, a);
						return FALSE; 
				case 0xF4:
					if (a > 0x8F){
						printf("ERR, isLegalUTF8 4: source=%x, a=%x\n", *source, a);
						return FALSE; 
					if (a < 0x80){
						printf("ERR, isLegalUTF8 5: source=%x, a=%x\n", *source, a);
						return FALSE; 
		case 1: 
			if (*source >= 0x80 && *source < 0xC2){
				printf("ERR, isLegalUTF8: source=%x\n", *source);
				return FALSE;
    if (*source > 0xF4)
		return FALSE;
    return TRUE;
/*
 * Convert a NUL-terminated UTF-8 string to UTF-16 code units.
 *
 * sourceStart: NUL-terminated UTF-8 input.
 * targetStart: output buffer for the UTF-16 code units.
 * outLen:      size of the output buffer in BYTES (not code units).
 * flags:       with strictConversion, a surrogate or out-of-range code point
 *              aborts the conversion; with any other value it is replaced by
 *              UNI_REPLACEMENT_CHAR.
 *
 * Returns conversionOK on success, conversionFailed for a NULL argument,
 * sourceExhausted for a truncated trailing sequence, sourceIllegal for a
 * malformed one, and targetExhausted when the output buffer is too small.
 * Conversion stops at the first error; code units written before it stay in
 * the buffer.  Errors are also reported on stdout.
 *
 * No terminating 0 is written to the output.  A caller that needs a
 * terminated result must zero the buffer beforehand and keep at least one
 * unit in reserve.
 */
ConversionResult Utf8_To_Utf16 (const UTF8* sourceStart, UTF16* targetStart, size_t outLen , ConversionFlags flags)
{
    ConversionResult result = conversionOK;
    const UTF8* source = sourceStart;
    UTF16* target      = targetStart;
    UTF16* targetEnd   = NULL;
    const UTF8* sourceEnd = NULL;

    if ((NULL == source) || (NULL == targetStart)) {
        printf("ERR, Utf8_To_Utf16: source=%p, targetStart=%p\n", (const void *)source, (void *)targetStart);
        return conversionFailed;
    }
    /* Pointer arithmetic only after the NULL check; outLen is a byte count. */
    targetEnd = targetStart + outLen / sizeof(UTF16);
    sourceEnd = sourceStart + strlen((const char*)sourceStart);

    while (*source) {
        UTF32 ch = 0;
        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];

        /* The whole sequence must lie before the terminating NUL. */
        if (source + extraBytesToRead >= sourceEnd) {
            printf("ERR, Utf8_To_Utf16----sourceExhausted: source=%p, extraBytesToRead=%d, sourceEnd=%p\n", (const void *)source, extraBytesToRead, (const void *)sourceEnd);
            result = sourceExhausted;
            break;
        }
        /* Do this check whether lenient or strict */
        if (! isLegalUTF8(source, extraBytesToRead+1)) {
            printf("ERR, Utf8_To_Utf16----isLegalUTF8 return FALSE: source=%p, extraBytesToRead=%d\n", (const void *)source, extraBytesToRead);
            result = sourceIllegal;
            break;
        }

        /*
         * Accumulate the raw bytes, six bits apart.  The cases all fall
         * through.  The marker bits of the lead and continuation bytes are
         * removed afterwards by subtracting offsetsFromUTF8[].
         */
        switch (extraBytesToRead) {
        case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
            /* fall through */
        case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
            /* fall through */
        case 3: ch += *source++; ch <<= 6;
            /* fall through */
        case 2: ch += *source++; ch <<= 6;
            /* fall through */
        case 1: ch += *source++; ch <<= 6;
            /* fall through */
        case 0: ch += *source++;
            break;
        default:
            break;
        }
        ch -= offsetsFromUTF8[extraBytesToRead];

        if (target >= targetEnd) {
            source -= (extraBytesToRead+1); /* Back up source pointer! */
            printf("ERR, Utf8_To_Utf16----target >= targetEnd: source=%p, extraBytesToRead=%d\n", (const void *)source, extraBytesToRead);
            result = targetExhausted;
            break;
        }

        if (ch <= UNI_MAX_BMP) {
            /* Target is a character <= 0xFFFF */
            /* UTF-16 surrogate values are illegal in UTF-32 */
            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
                if (flags == strictConversion) {
                    source -= (extraBytesToRead+1); /* return to the illegal value itself */
                    printf("ERR, Utf8_To_Utf16----ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END: source=%p, extraBytesToRead=%d\n", (const void *)source, extraBytesToRead);
                    result = sourceIllegal;
                    break;
                } else {
                    *target++ = UNI_REPLACEMENT_CHAR;
                }
            } else {
                *target++ = (UTF16)ch; /* normal case */
            }
        } else if (ch > UNI_MAX_UTF16) {
            if (flags == strictConversion) {
                result = sourceIllegal;
                source -= (extraBytesToRead+1); /* return to the start */
                printf("ERR, Utf8_To_Utf16----ch > UNI_MAX_UTF16: source=%p, extraBytesToRead=%d\n", (const void *)source, extraBytesToRead);
                break; /* Bail out; shouldn't continue */
            } else {
                *target++ = UNI_REPLACEMENT_CHAR;
            }
        } else {
            /* target is a character in range 0xFFFF - 0x10FFFF: needs a surrogate pair. */
            if (target + 1 >= targetEnd) {
                source -= (extraBytesToRead+1); /* Back up source pointer! */
                printf("ERR, Utf8_To_Utf16----target + 1 >= targetEnd: source=%p, extraBytesToRead=%d\n", (const void *)source, extraBytesToRead);
                result = targetExhausted;
                break;
            }
            ch -= halfBase;
            *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
            *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
        }
    }
    return result;
}

/*
 * Convert a 0-terminated sequence of UTF-16 code units to UTF-8.
 *
 * sourceStart: input code units, terminated by a 0 unit.
 * targetStart: output buffer for the UTF-8 bytes.
 * outLen:      size of the output buffer in bytes.
 * flags:       with strictConversion an unpaired surrogate aborts the
 *              conversion with sourceIllegal; with any other value the
 *              surrogate is encoded as-is as a three-byte sequence.
 *
 * Returns a ConversionResult value as int: 0 (conversionOK) on success,
 * conversionFailed for a NULL argument, sourceExhausted when the input ends
 * right after a high surrogate, sourceIllegal (strict mode only) for an
 * unpaired surrogate, and targetExhausted when the next character does not
 * fit.  Conversion stops at the first error; bytes written before it stay
 * in the buffer.
 *
 * No terminating NUL is written.  A caller that needs a C string must zero
 * the buffer beforehand and keep one byte in reserve.
 */
int Utf16_To_Utf8 (const UTF16* sourceStart, UTF8* targetStart, size_t outLen ,  ConversionFlags flags)
{
    int result = 0;
    const UTF16* source = sourceStart;
    UTF8* target        = targetStart;
    UTF8* targetEnd     = NULL;

    if ((NULL == source) || (NULL == targetStart)) {
        printf("ERR, Utf16_To_Utf8: source=%p, targetStart=%p\n", (const void *)source, (void *)targetStart);
        return conversionFailed;
    }
    targetEnd = targetStart + outLen;

    /*
     * The read position is never handed back to the caller, so the error
     * paths below simply stop instead of rewinding `source`.
     */
    while (*source) {
        UTF32 ch;
        unsigned short bytesToWrite = 0;
        const UTF32 byteMask = 0xBF;
        const UTF32 byteMark = 0x80;

        ch = *source++;

        /* If we have a surrogate pair, convert to UTF32 first. */
        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
            /* If the 16 bits following the high surrogate are in the source buffer... */
            if (*source) {
                UTF32 ch2 = *source;
                /* If it's a low surrogate, convert to UTF32. */
                if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
                    ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + (ch2 - UNI_SUR_LOW_START) + halfBase;
                    ++source; /* the low surrogate has been consumed as well */
                } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
                    result = sourceIllegal;
                    break;
                }
            } else { /* We don't have the 16 bits following the high surrogate. */
                result = sourceExhausted;
                break;
            }
        } else if (flags == strictConversion) {
            /* A low surrogate that was not preceded by a high one is illegal. */
            if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
                result = sourceIllegal;
                break;
            }
        }

        /* Figure out how many bytes the result will require */
        if (ch < (UTF32)0x80) {
            bytesToWrite = 1;
        } else if (ch < (UTF32)0x800) {
            bytesToWrite = 2;
        } else if (ch < (UTF32)0x10000) {
            bytesToWrite = 3;
        } else if (ch < (UTF32)0x110000) {
            bytesToWrite = 4;
        } else {
            bytesToWrite = 3;
            ch = UNI_REPLACEMENT_CHAR;
        }

        /* Compare against the remaining room instead of forming a pointer past the buffer. */
        if ((size_t)(targetEnd - target) < bytesToWrite) {
            result = targetExhausted;
            break;
        }

        /* Bytes are emitted last-to-first: continuation bytes, then the lead byte. */
        target += bytesToWrite;
        switch (bytesToWrite) { /* note: everything falls through. */
        case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
            /* fall through */
        case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
            /* fall through */
        case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
            /* fall through */
        case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
            break;
        default:
            break;
        }
        target += bytesToWrite;
    }
    return result;
}
int main(int argc, char *argv[])
	int i=0;
	UTF8 buf8[256]="";
	UTF16 buf16[256]={0};
	printf("\nUTF-8 => UTF-16 = ");
		printf("%#x  ",buf16[i]);

	Utf16_To_Utf8 (buf16, buf8, sizeof(buf8) , strictConversion);
	printf("\nUTF-16 => UTF-8 = %s\n\n",buf8);
	return 0;



#ifndef __UTF_H__
#define __UTF_H__

/*
 * utf.h - types, constants and lookup tables shared by the
 * UTF-8 <-> UTF-16 conversion routines.
 */

#define FALSE  0
#define TRUE   1

/* Number of bits carried by each half of a surrogate pair. */
#define halfShift	10

/* UTF-16 surrogate code-unit ranges. */
#define UNI_SUR_HIGH_START  ((UTF32)0xD800)
#define UNI_SUR_HIGH_END    ((UTF32)0xDBFF)
#define UNI_SUR_LOW_START   ((UTF32)0xDC00)
#define UNI_SUR_LOW_END     ((UTF32)0xDFFF)

/* Some fundamental constants */
#define UNI_REPLACEMENT_CHAR ((UTF32)0x0000FFFD) /* U+FFFD, substituted for unconvertible input */
#define UNI_MAX_BMP ((UTF32)0x0000FFFF)
#define UNI_MAX_UTF16 ((UTF32)0x0010FFFF)
#define UNI_MAX_UTF32 ((UTF32)0x7FFFFFFF)
#define UNI_MAX_LEGAL_UTF32 ((UTF32)0x0010FFFF)

typedef unsigned char   boolean;
typedef unsigned int	CharType ;
typedef unsigned char	UTF8;   /* one UTF-8 byte */
typedef unsigned short	UTF16;  /* one UTF-16 code unit */
typedef unsigned int	UTF32;  /* one code point */

/* Mask for the low 10 bits of a supplementary code point (the low-surrogate half). */
static const UTF32 halfMask = 0x3FFUL;
/* First code point above the BMP; subtracted before a value is split into a surrogate pair. */
static const UTF32 halfBase = 0x0010000UL;

/*
 * Marker bits OR-ed into the first byte of a UTF-8 sequence, indexed by the
 * total length of the sequence in bytes.
 */
static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };

/*
 * Value to subtract after the raw bytes of a sequence have been summed six
 * bits apart, indexed by the number of trailing bytes.  It removes the
 * marker bits of the lead byte and of every continuation byte in one step,
 * e.g. 0x3080 == (0xC0 << 6) + 0x80 for a two-byte sequence.
 */
static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL, 0xFA082080UL, 0x82082080UL };

/* Number of bytes that follow a given first byte, indexed by that byte's value. */
static const char trailingBytesForUTF8[256] =
{
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};

/* How the converters treat input that cannot be represented. */
typedef enum
{
	strictConversion = 0,	/* stop with an error on unconvertible input */
	lenientConversion	/* any non-strict value: substitute or pass through instead */
} ConversionFlags;

/* Outcome of a conversion call. */
typedef enum
{
	conversionOK, 		/* conversion successful */
	sourceExhausted,	/* partial character in source, but hit end */
	targetExhausted,	/* insuff. room in target for conversion */
	sourceIllegal,		/* source sequence is illegal/malformed */
	conversionFailed	/* invalid argument (e.g. a NULL buffer pointer) */
} ConversionResult;

#endif /* __UTF_H__ */








