ANSI UTF8 WCHAR String Convert
ANSI UTF-8 wchar_t (WCHAR) 之間的轉換
Table of Contents
Method
In Windows
- 使用 MultiByteToWideChar 與 WideCharToMultiByte 在 ANSI 與 WCHAR 或 UTF-8 與 WCHAR之間的轉換
- Unicode Functions
In linux
In Stand C
- 使用 setlocal 配合 mbstowcs 和 wcstombs
- 在Linux上與Windows上 CodePage的語法不一樣
- Windows上 setlocale
- locale :: "lang[_country_region[.code_page]]" | ".code_page" | "" | NULL
- Language Strings Country/Region Strings
- ".950" //OK
- "chinese_taiwan" //OK
- "chinese_taiwan.950" //OK
- "UTF-8" //Failed
- ".65001" //Failed
- "chinese_taiwan.UTF-8" //Failed
- "chinese_taiwan.65001" //Failed
- "zh_TW" //Failed
- "zh_TW.UTF-8" //Failed
- "UTF16-LE" //Failed
- locale :: "lang[_country_region[.code_page]]" | ".code_page" | "" | NULL
- Linux setlocal
- language[_territory][.codeset][@modifier]
- [http://www.ualberta.ca/dept/chemeng/AIX-43/share/man/info/C/a_doc_lib/aixbman/baseadmn/locale.htm]
- ".950" //Failed
- "chinese_taiwan" //Failed
- "chinese_taiwan.950" //Failed
- "UTF-8" //Failed
- ".65001" //Failed
- "chinese_taiwan.UTF-8" //Failed
- "chinese_taiwan.65001" //Failed
- "zh_TW" //OK
- "zh_TW.UTF-8" //OK
- "UTF16-LE" //Failed
- language[_territory][.codeset][@modifier]
- Windows上 setlocale
- 在Linux上與Windows上 CodePage的語法不一樣
其他選擇
- 使用 UTF-8 CPP
- 跨平台
- only UTF-8 UTF-16 UTF-32
使用 UTF-8 CPP + setlocal 配合 mbstowcs 和 wcstombs
- 以下Code有幾個問題
- 不確定是否在GCC上wchar_t皆為4個byte的UCS-4 而在VCC上皆為2個Byte的UCS-2
#include <stdio.h>
#include <wchar.h>
#include <string.h>
#include <locale.h>
#include <fstream>
#include <iostream>
#include <string>
using namespace std;
#include "utf8.h"
static const char g_hex[] = {'0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F'};
static const char BOM_UTF8[] = {0xEF,0xBB,0xBF};
static const char BOM_UTF16BF[] = {0xFE,0xFF};
static const char BOM_UTF16LE[] = {0xFF,0xFE};
static const char BOM_UTF32BE[] = {0x00,0x00,0xFE,0xFF};
static const char BOM_UTF32LE[] = {0xFF,0xFE,0x00,0x00};
#ifdef _MSC_VER
static const char CHINESE_TAIWAN[] = "chinese_taiwan"; //OK
static const char CHINESE_TAIWAN_UTF8[] = "chinese_taiwan.UTF-8"; //Failed
static const char CHINESE_TAIWAN_UTF16[] = "chinese_taiwan.UTF-16"; //Failed
#define UTF8TOWCHAR utf8to16
#define WCHARTOUTF8 utf16to8
#else
static const char CHINESE_TAIWAN[] = "zh_TW";
static const char CHINESE_TAIWAN_UTF8[] = "zh_TW.UTF-8"; //OK
static const char CHINESE_TAIWAN_UTF16[] = "zh_TW.UTF-16"; //Failed
#define UTF8TOWCHAR utf8to32
#define WCHARTOUTF8 utf32to8
#endif
int AnsiToUTF8(char* out, int outsize, const char* in, int insize, const char* codepage)
{
if(outsize > 0 && insize > 0 && out && in && setlocale(LC_ALL,codepage))
{
wchar_t* wcstmp = new wchar_t[insize/2 + 1];
mbstowcs(wcstmp, in, insize/2+1);
char* end_it = utf8::unchecked::WCHARTOUTF8(wcstmp, wcstmp + wcslen(wcstmp) + 1 , out);
delete [] wcstmp;
int size = end_it - out - 1;
if(size < outsize-1){out[size]=0;out[size+1]=0;}
return size;
}
return 0;
}
int UTF8ToAnsi(char* out, int outsize, const char* in, int insize, const char* codepage)
{
if(outsize > 0 && insize > 0 && out && in && setlocale(LC_ALL,codepage))
{
wchar_t* wcstmp = new wchar_t[insize/2 + 1];
wchar_t* end_it = utf8::unchecked::UTF8TOWCHAR(in, in + strlen(in) + 1 , wcstmp);
int size = wcstombs(out, wcstmp, outsize);
delete [] wcstmp;
if(size < outsize-1){out[size]=0;out[size+1]=0;}
return size;
}
return 0;
}
int AnsiToWCHAR(wchar_t* out, int outsize, const char* in, int insize, const char* codepage)
{
if(outsize > 0 && insize > 0 && out && in && setlocale(LC_ALL,codepage))
{
int size = mbstowcs(out,in,outsize);
if(size < outsize - 1){out[size]=0;out[size+1]=0;}
return size;
}
return 0;
}
int WCHARToAnsi(char* out, int outsize, const wchar_t* in, int insize, const char* codepage)
{
if(outsize > 0 && insize > 0 && out && in && setlocale(LC_ALL,codepage))
{
int size = wcstombs(out, in, outsize);
if(size < outsize - 1){out[size]=0;out[size+1]=0;}
return size;
}
return 0;
}
void buftostr(unsigned char* buf, unsigned short size, char* str, unsigned short len)
{
unsigned char c,l,r;
unsigned short i,j;
if(size==0||len<3){*str=0;return;}
for(i=0,j=0; i<size&&j<len-3; ++i)
{
c = *(buf+i);
l = c>>4;
r = c - (l<<4);
*(str+j++) = g_hex[l];
*(str+j++) = g_hex[r];
//*(str+j++) = ' ';
}
*(str+j) = 0;
}
int main(int argc, char* argv[])
{
cout<<"CHAR SIZE:"<<sizeof(char)<<" WCHAR SIZE:"<<sizeof(wchar_t)<<endl;
wchar_t wt1[256],wt2[256],wt3[256],wt4[256];
char t1[512],t2[512],t3[512],t4[512],t5[512],t6[512],t7[512],t8[512],t9[512],ta[512],tb[512],tc[512],td[512],te[512],tf[512];
fstream fsIB5,fsIU8,fsIU2;
fsIB5.rdbuf()->open("b5.txt",ios::in|ios::binary);
fsIU8.rdbuf()->open("u8.txt",ios::in|ios::binary);
fsIU2.rdbuf()->open("u2.txt",ios::in|ios::binary);
if (fsIB5.fail() || !fsIB5.good() || !fsIB5.rdbuf()->is_open()) {
cout << "Could not open b.txt" << endl;
return 0;
}
if (fsIU8.fail() || !fsIU8.good() || !fsIU8.rdbuf()->is_open()) {
cout << "Could not open u.txt" << endl;
return 0;
}
if (fsIU2.fail() || !fsIU2.good() || !fsIU2.rdbuf()->is_open()) {
cout << "Could not open u2.txt" << endl;
return 0;
}
int size1 = fsIB5.rdbuf()->sgetn(t1,508);
int size2 = fsIU8.rdbuf()->sgetn(t2,508);
int size3 = fsIU2.rdbuf()->sgetn(t3,508);
t1[size1] = 0;
t1[size1+1] = 0;
t2[size2] = 0;
t3[size3] = 0;
if( memcmp(t2,BOM_UTF8,sizeof(BOM_UTF8)) == 0)
{
fsIU8.rdbuf()->pubseekoff(sizeof(BOM_UTF8),ios_base::beg);
size2 = fsIU8.rdbuf()->sgetn(t2,508);
t2[size2] = 0;
t2[size2+1] = 0;
t2[size2+2] = 0;
t2[size2+3] = 0;
}
if( memcmp(t3,BOM_UTF16LE,sizeof(BOM_UTF16LE)) == 0)
{
fsIU2.rdbuf()->pubseekoff(sizeof(BOM_UTF16LE),ios_base::beg);
size3 = fsIU2.rdbuf()->sgetn(t3,508);
t3[size3] = 0;
t3[size3+1] = 0;
t3[size3+2] = 0;
t3[size3+3] = 0;
}
buftostr((unsigned char*)t1,size1,t4,512);
buftostr((unsigned char*)t2,size2,t5,512);
buftostr((unsigned char*)t3,size3,t6,512);
cout<<"Big5("<<size1<<"):"<<t4<<endl;
cout<<"UTF8("<<size2<<"):"<<t5<<endl;
cout<<"UTF16("<<size3<<"):"<<t6<<endl;
char* tloc;
tloc = setlocale(LC_ALL,CHINESE_TAIWAN);
int size4 = mbstowcs(wt1,t1,256);
if(tloc != 0)
cout<<tloc<<endl;
else
cout<<"set locale 950(Chinese Taiwan Big5) Error!"<<endl;
tloc = setlocale(LC_ALL,CHINESE_TAIWAN_UTF8);
int size5 = mbstowcs(wt2,t2,256);
if(tloc != 0)
cout<<tloc<<endl;
else
cout<<"set locale 65001(UTF-8) Error!"<<endl;
tloc = setlocale(LC_ALL,CHINESE_TAIWAN_UTF16);
int size6 = mbstowcs(wt3,t3,256);
if(tloc != 0)
cout<<tloc<<endl;
else
cout<<"set locale 1200(UTF-16) Error!"<<endl;
buftostr((unsigned char*)wt1,wcslen(wt1)*sizeof(wchar_t),t7,512);
buftostr((unsigned char*)wt2,wcslen(wt2)*sizeof(wchar_t),t8,512);
buftostr((unsigned char*)wt3,wcslen(wt3)*sizeof(wchar_t),t9,512);
cout<<"Big5 to WCHAR("<<size4<<"):"<<t7<<endl;
cout<<"UTF8 to WCHAR("<<size5<<"):"<<t8<<endl;
cout<<"UTF16 to WCHAR("<<size6<<"):"<<t9<<endl;
tloc = setlocale(LC_ALL,CHINESE_TAIWAN);
int size7 = wcstombs(ta,wt1,512);
if(tloc != 0)
cout<<tloc<<endl;
else
cout<<"set locale 950(Chinese Taiwan Big5) Error!"<<endl;
tloc = setlocale(LC_ALL,CHINESE_TAIWAN_UTF8);
int size8 = wcstombs(tb,wt2,512);
if(tloc != 0)
cout<<tloc<<endl;
else
cout<<"set locale 65001(UTF-8) Error!"<<endl;
tloc = setlocale(LC_ALL,CHINESE_TAIWAN_UTF16);
int size9 = wcstombs(tc,wt3,512);
if(tloc != 0)
cout<<tloc<<endl;
else
cout<<"set locale 1200(UTF-16) Error!"<<endl;
buftostr((unsigned char*)ta,strlen(ta)*sizeof(char),td,512);
buftostr((unsigned char*)tb,strlen(tb)*sizeof(char),te,512);
buftostr((unsigned char*)tc,strlen(tc)*sizeof(char),tf,512);
cout<<"Big5 to WCHAR to Big5("<<size7<<"):"<<td<<endl;
cout<<"UTF8 to WCHAR to UTF8("<<size8<<"):"<<te<<endl;
cout<<"UTF16 to WCHAR to UTF16("<<size9<<"):"<<tf<<endl;
cout<<"=========================================================="<<endl;
char* cend;
cend = (char*)utf8::unchecked::utf8to16(t2, t2 + size2 + 1, (unsigned short*)t5);
size5 = cend - t5 - 2;
cend = utf8::unchecked::utf16to8((unsigned short*)t3, (unsigned short*)(t3 + size3 + 2), t6);
size6 = cend - t6 - 1;
buftostr((unsigned char*)t5,size5*sizeof(char),t9,512);
buftostr((unsigned char*)t6,size6*sizeof(char),ta,512);
cout<<"UTF8 to UTF16("<<size5<<"):"<<t9<<endl;
cout<<"UTF16 to UTF8("<<size6<<"):"<<ta<<endl;
cend = (char*)utf8::unchecked::utf8to32(t2, t2 + size2 + 1, (unsigned int*)t7);
size7 = cend - t7 - 4;
cend = (char*)utf8::unchecked::utf8to32(t6, t6 + size6 + 1, (unsigned int*)t8);
size8 = cend - t8 - 4;
buftostr((unsigned char*)t7,size7*sizeof(char),t9,512);
buftostr((unsigned char*)t8,size8*sizeof(char),ta,512);
cout<<"UTF8 to UTF32("<<size7<<"):"<<t9<<endl;
cout<<"UTF16 to UTF8 to UTF32("<<size8<<"):"<<ta<<endl;
size4 = AnsiToWCHAR(wt4, 256, t1 ,size1+1,CHINESE_TAIWAN);
wchar_t* wtp = utf8::unchecked::UTF8TOWCHAR(t2, t2 + size2 + 1, wt2);
size9 = wtp - wt2 - 1;
buftostr((unsigned char*)wt4,size4*sizeof(wchar_t),t9,512);
buftostr((unsigned char*)wt2,size9*sizeof(wchar_t),ta,512);
cout<<"ANSI to WCHAR("<<size4<<"):"<<t9<<endl;
cout<<"UTF8 to WCHAR("<<size9<<"):"<<ta<<endl;
size2 = WCHARToAnsi(t2,512,wt2,size9,CHINESE_TAIWAN);
cend = utf8::unchecked::WCHARTOUTF8(wt4,wt4+size4 + 1,t3);
size3 = cend - t3 - 1;
buftostr((unsigned char*)t2,size2*sizeof(char),t9,512);
buftostr((unsigned char*)t3,size3*sizeof(char),ta,512);
cout<<"ANSI to WCHAR to UTF8("<<size3<<"):"<<ta<<endl;
cout<<"UTF8 to WCHAR to ANSI("<<size2<<"):"<<t9<<endl;
size4 = UTF8ToAnsi(t4,512,t3,size3,CHINESE_TAIWAN);
size5 = AnsiToUTF8(t5,512,t2,size2,CHINESE_TAIWAN);
buftostr((unsigned char*)t4,size4*sizeof(char),t9,512);
buftostr((unsigned char*)t5,size5*sizeof(char),ta,512);
cout<<"ANSI to WCHAR to UTF8 to ANSI("<<size4<<"):"<<t9<<endl;
cout<<"UTF8 to WCHAR to ANSI to UTF8("<<size5<<"):"<<ta<<endl;
return 0;
}
* 使用 "中文測試許功蓋中文測試許功蓋" 作為測試檔內容
* Windows XP + VS2008
CHAR SIZE:1 WCHAR SIZE:2
Big5(28):A4A4A4E5B4FAB8D5B35CA55CBB5CA4A4A4E5B4FAB8D5B35CA55CBB5C
UTF8(42):E4B8ADE69687E6B8ACE8A9A6E8A8B1E58A9FE8938BE4B8ADE69687E6B8ACE8A9A6E8A8B1E58A9FE8938B
UTF16(28):2D4E87652C6E668A318A9F52CB842D4E87652C6E668A318A9F52CB84
Chinese_Taiwan.950
set locale 65001(UTF-8) Error!
set locale 1200(UTF-16) Error!
Big5 to WCHAR(14):2D4E87652C6E668A318A9F52CB842D4E87652C6E668A318A9F52CB84
UTF8 to WCHAR(-1):
UTF16 to WCHAR(-1):
Chinese_Taiwan.950
set locale 65001(UTF-8) Error!
set locale 1200(UTF-16) Error!
Big5 to WCHAR to Big5(28):A4A4A4E5B4FAB8D5B35CA55CBB5CA4A4A4E5B4FAB8D5B35CA55CBB5C
UTF8 to WCHAR to UTF8(0):
UTF16 to WCHAR to UTF16(0):
==========================================================
UTF8 to UTF16(28):2D4E87652C6E668A318A9F52CB842D4E87652C6E668A318A9F52CB84
UTF16 to UTF8(42):E4B8ADE69687E6B8ACE8A9A6E8A8B1E58A9FE8938BE4B8ADE69687E6B8ACE8A9A6E8A8B1E58A9FE8938B
UTF8 to UTF32(56):2D4E0000876500002C6E0000668A0000318A00009F520000CB8400002D4E0000876500002C6E0000668A0000318A00009F520000CB840000
UTF16 to UTF8 to UTF32(56):2D4E0000876500002C6E0000668A0000318A00009F520000CB8400002D4E0000876500002C6E0000668A0000318A00009F520000CB840000
ANSI to WCHAR(14):2D4E87652C6E668A318A9F52CB842D4E87652C6E668A318A9F52CB84
UTF8 to WCHAR(14):2D4E87652C6E668A318A9F52CB842D4E87652C6E668A318A9F52CB84
ANSI to WCHAR to UTF8(42):E4B8ADE69687E6B8ACE8A9A6E8A8B1E58A9FE8938BE4B8ADE69687E6B8ACE8A9A6E8A8B1E58A9FE8938B
UTF8 to WCHAR to ANSI(28):A4A4A4E5B4FAB8D5B35CA55CBB5CA4A4A4E5B4FAB8D5B35CA55CBB5C
ANSI to WCHAR to UTF8 to ANSI(28):A4A4A4E5B4FAB8D5B35CA55CBB5CA4A4A4E5B4FAB8D5B35CA55CBB5C
UTF8 to WCHAR to ANSI to UTF8(42):E4B8ADE69687E6B8ACE8A9A6E8A8B1E58A9FE8938BE4B8ADE69687E6B8ACE8A9A6E8A8B1E58A9FE8938B
* Linux + gcc 4.2.4
CHAR SIZE:1 WCHAR SIZE:4
Big5(28):A4A4A4E5B4FAB8D5B35CA55CBB5CA4A4A4E5B4FAB8D5B35CA55CBB5C
UTF8(42):E4B8ADE69687E6B8ACE8A9A6E8A8B1E58A9FE8938BE4B8ADE69687E6B8ACE8A9A6E8A8B1E58A9FE8938B
UTF16(28):2D4E87652C6E668A318A9F52CB842D4E87652C6E668A318A9F52CB84
zh_TW
zh_TW.UTF-8
set locale 1200(UTF-16) Error!
Big5 to WCHAR(14):2D4E0000876500002C6E0000668A0000318A00009F520000CB8400002D4E0000876500002C6E0000668A0000318A00009F520000CB840000
UTF8 to WCHAR(14):2D4E0000876500002C6E0000668A0000318A00009F520000CB8400002D4E0000876500002C6E0000668A0000318A00009F520000CB840000
UTF16 to WCHAR(-1):2D0000004E000000
zh_TW
zh_TW.UTF-8
set locale 1200(UTF-16) Error!
Big5 to WCHAR to Big5(28):A4A4A4E5B4FAB8D5B35CA55CBB5CA4A4A4E5B4FAB8D5B35CA55CBB5C
UTF8 to WCHAR to UTF8(42):E4B8ADE69687E6B8ACE8A9A6E8A8B1E58A9FE8938BE4B8ADE69687E6B8ACE8A9A6E8A8B1E58A9FE8938B
UTF16 to WCHAR to UTF16(2):2D4E
==========================================================
UTF8 to UTF16(28):2D4E87652C6E668A318A9F52CB842D4E87652C6E668A318A9F52CB84
UTF16 to UTF8(42):E4B8ADE69687E6B8ACE8A9A6E8A8B1E58A9FE8938BE4B8ADE69687E6B8ACE8A9A6E8A8B1E58A9FE8938B
UTF8 to UTF32(56):2D4E0000876500002C6E0000668A0000318A00009F520000CB8400002D4E0000876500002C6E0000668A0000318A00009F520000CB840000
UTF16 to UTF8 to UTF32(56):2D4E0000876500002C6E0000668A0000318A00009F520000CB8400002D4E0000876500002C6E0000668A0000318A00009F520000CB840000
ANSI to WCHAR(14):2D4E0000876500002C6E0000668A0000318A00009F520000CB8400002D4E0000876500002C6E0000668A0000318A00009F520000CB840000
UTF8 to WCHAR(14):2D4E0000876500002C6E0000668A0000318A00009F520000CB8400002D4E0000876500002C6E0000668A0000318A00009F520000CB840000
ANSI to WCHAR to UTF8(28):E4B8ADE69687E6B8ACE8A9A6E8A8B1E58A9FE8938BE4B8ADE69687E6B8ACE8A9A6E8A8B1E58A9FE8938B
UTF8 to WCHAR to ANSI(42):A4A4A4E5B4FAB8D5B35CA55CBB5CA4A4A4E5B4FAB8D5B35CA55CBB5C
ANSI to WCHAR to UTF8 to ANSI(28):A4A4A4E5B4FAB8D5B35CA55CBB5CA4A4A4E5B4FAB8D5B35CA55CBB5C
UTF8 to WCHAR to ANSI to UTF8(42):E4B8ADE69687E6B8ACE8A9A6E8A8B1E58A9FE8938BE4B8ADE69687E6B8ACE8A9A6E8A8B1E58A9FE8938B