VC 编程ANSI环境下读写Unicode文件

没有注意到文件编码的不同会产生这么多的问题,在动手以前查询了很多资料,在本博客中收藏了不少先辈的成果,在这里一并表示致敬!

关于ANSI和Unicode编码的原理在这里也不说了,主要讲下如何读写!


首先确定你的工程采用的是什么编码环境,默认是ANSI。不同的字符集读写文件的差别也比较大,我这里只在ANSI环境下做了试验,下一步再探索在Unicode环境下如何读写!(原先这个没搞懂,读了不少代码才发现自己的试验都是有误的)。

在ANSI字符集下,CString等都是单字节版本的,所以一定要注意。而所要读取的Unicode文件却是双字节的,这里就需要转换:在ANSI字符集下,用二进制的方式打开Unicode文件,自己判断是否换行,再转化成ANSI编码。而在写Unicode文件的时候,先将所有字符转化成Unicode编码再写入,而且在写文件之前一定要加上Unicode文件的标识(BOM)。







下面是读




CFile mFile(UnicodefilePath,CFile::modeRead);

byte head[2];

mFile.Read(head,2);

if((head[0]==0xff&&head[1]==0xfe)||(head[0]==0xfe&&head[1]==0xff)   )

{


//AfxMessageBox(_T("File is Unicode!"));


isUnicode = true;

}

if(isUnicode) mFile.Seek(2,CFile::begin);

//0xfffe


wchar_t wch;

wchar_t wstr[300];


CString strvalue ;

hile(mFile.Read((char *)&wch,2)>0)

{

if(wch==0x000D)

//by line


{


//chang to ansi


int nLen = i;

char *buf = new char[2*nLen];

WideCharToMultiByte(CP_ACP, 0, wstr, nLen, buf, 2*nLen, NULL, NULL);

buf[2*nLen-1] = 0;

//some assertion failed,这个比较重要,小问题可以折腾人啊


strvalue = buf;

mFile.Seek(2,CFile::current);

//跳过行开头符号


i=0;

}

else

{

wstr[i++] = wch;

}

}







//下面是写




CStdioFile transFile;

transFile.Open(strUnicodeSavepath,CFile::modeCreate|CFile::modeWrite|CFile::typeBinary);

WORD wSignature = 0xFEFF;

transFile.Write(&wSignature, 2);

//Unicode的文件符号


CHAR *pszAnsi = new TCHAR[strvalue.GetLength()+1];

_tcscpy(pszAnsi, strvalue);

WCHAR * szwBuffer = new WCHAR[strvalue.GetLength()+1];

MultiByteToWideChar(CP_ACP, 0, pszAnsi, -1, szwBuffer, strvalue.GetLength()+1);

//write to files

transFile.Write(szwBuffer, lstrlenW(szwBuffer) * sizeof(WCHAR));




当然你也可以把工程设置成Unicode字符集,这个时候在Unicode工程下读取ANSI文件又是一件烦人的事情:将文件读到CString中的时候,每个单字节的ANSI字符被转成了双字节,需要自己来处理,后面我再探索探索再来记录。


本文来自CSDN博客

,转载请标明出处:

http://blog.csdn.net/Augusdi/archive/2009/10/15/4677528.aspx

============================================================================



将CStdioFile类扩展,读取UNICODE文本文件

最近因为要读取SQL脚本文件,用CStdioFile来读取脚本文件,却在程序调试时读取不了文件。


后来看了一下文本文件格式,竟然是UNICODE格式的,原来在导出SQL脚本的时候,选项默认的是UNICODE格式。为了同时支持ANSI和UNICODE两种格式,在Codeproject站点上找到了CStdioFileEx类的代码,但在调试运行过程中发现,在生成UNICODE版本的执行文件时,运行没有错误,但在非UNICODE版本中却出现错误,原来在代码中此部分没有考虑文件读到末尾的情况,经修改,CStdioFileEx类就可以正常使用了,在读取文本文件时,自动识别ANSI和UNICODE两种格式。



实现头文件如下:



// Unicode "byte order mark" which goes at the start of a UTF-16 file.
#define nUNICODE_BOM      0xFEFF

// New line characters: carriage return + line feed.
#define sNEWLINE        _T("\r\n")

// Filler char used when no conversion from Unicode to the local code page is possible.
#define sDEFAULT_UNICODE_FILLER_CHAR "#"

class CStdioFileEx: public CStdioFile

{

public:

CStdioFileEx();

CStdioFileEx( LPCTSTR lpszFileName, UINT nOpenFlags );

virtual BOOL Open( LPCTSTR lpszFileName, UINT nOpenFlags, CFileException* pError = NULL );

virtual BOOL ReadString(CString& rString);

virtual void WriteString( LPCTSTR lpsz );

bool    IsFileUnicodeText() { return m_bIsUnicodeText; }

unsigned long GetCharCount();

// Additional flag to allow Unicode text writing

static const UINT modeWriteUnicode;

// static utility functions

// --------------------------------------------------------------------------------------------

//

// CStdioFileEx::GetUnicodeStringFromMultiByteString()

//

// --------------------------------------------------------------------------------------------

// Returns:    bool

// Parameters: char *  szMultiByteString  (IN) Multi-byte input string

//     wchar_t*  szUnicodeString  (OUT) Unicode output string

//     short   nUnicodeBufferSize (IN) Size of Unicode output buffer

//     UINT   nCodePage    (IN) Code page used to perform conversion

//                  Default = -1 (Get local code page).

//

// Purpose:  Gets a Unicode string from a MultiByte string.

// Notes:  None.

// Exceptions: None.

//

static bool  GetUnicodeStringFromMultiByteString(char * szMultiByteString,wchar_t* szUnicodeString,

short nUnicodeBufferSize,UINT nCodePage=-1);

// --------------------------------------------------------------------------------------------

//

// CStdioFileEx::GetMultiByteStringFromUnicodeString()

//

// --------------------------------------------------------------------------------------------

// Returns:    BOOL

// Parameters: wchar_t * szUnicodeString   (IN) Unicode input string

//     char*   szMultiByteString   (OUT) Multibyte output string

//     short   nMultiByteBufferSize  (IN) Multibyte buffer size

//     UINT   nCodePage     (IN) Code page used to perform conversion

//                   Default = -1 (Get local code page).

//

// Purpose:  Gets a MultiByte string from a Unicode string.

// Notes:  .

// Exceptions: None.

//

static BOOL   GetMultiByteStringFromUnicodeString(wchar_t * szUnicodeString,char* szMultiByteString,

short nMultiByteBufferSize,UINT nCodePage=-1);


// --------------------------------------------------------------------------------------------

//

// CStdioFileEx::IsFileUnicode()

//

// --------------------------------------------------------------------------------------------

// Returns:    bool

// Parameters: const CString& sFilePath

//

// Purpose:  Determines whether a file is Unicode by reading the first character and detecting

//     whether it's the Unicode byte marker.

// Notes:  None.

// Exceptions: None.

//

static bool IsFileUnicode(const CString& sFilePath);


protected:

UINT ProcessFlags(const CString& sFilePath, UINT& nOpenFlags);

bool  m_bIsUnicodeText;

UINT  m_nFlags;

};



实现文件如下:



// Extra open flag understood by CStdioFileEx: when set, WriteString() emits
// Unicode text (and a byte-order mark at the start of the file).
// NOTE(review): this value shares the UINT bit space with the CFile open flags
// and is passed on to CStdioFile::Open unchanged; it looks like it may coincide
// with CFile::osWriteThrough -- confirm against the MFC headers in use.
/*static*/ const UINT CStdioFileEx::modeWriteUnicode = 0x20000; // Add this flag to write in Unicode

// Default constructor: no file attached yet. Both state members get a defined
// value so that ProcessFlags() and WriteString() never read an uninitialised
// m_nFlags.
CStdioFileEx::CStdioFileEx()
    : CStdioFile(),
      m_bIsUnicodeText(false),
      m_nFlags(0)
{
}

// Opening constructor: the requested flags are passed through ProcessFlags()
// and the flags it returns are handed to the CStdioFile constructor together
// with the file name.
// NOTE(review): ProcessFlags() is a member function invoked inside the
// base-class initialiser, i.e. before the base sub-object and the members have
// been initialised; it assigns m_bIsUnicodeText and m_nFlags. Confirm this
// ordering is acceptable for the compilers in use.
CStdioFileEx::CStdioFileEx(LPCTSTR lpszFileName,UINT nOpenFlags)

:CStdioFile(lpszFileName, ProcessFlags(lpszFileName, nOpenFlags))

{

}

// Opens lpszFileName after letting ProcessFlags() inspect (and possibly adjust)
// the open flags. Returns whatever CStdioFile::Open() returns.
BOOL CStdioFileEx::Open(LPCTSTR lpszFileName, UINT nOpenFlags, CFileException* pError /*=NULL*/)
{
    // ProcessFlags() updates nOpenFlags in place and returns that same value,
    // so its result can be forwarded directly to the base class.
    const UINT nEffectiveFlags = ProcessFlags(lpszFileName, nOpenFlags);

    return CStdioFile::Open(lpszFileName, nEffectiveFlags, pError);
}

// Reads the next line of the file into rString.
//
// - Unicode build: delegates to CStdioFile::ReadString().
// - ANSI build, ANSI file: delegates to CStdioFile::ReadString().
// - ANSI build, Unicode file: reads a wide line with fgetws() and converts it
//   to the local code page.
//
// A trailing "\r\n", '\r' or '\n' is removed from the result.
// Returns TRUE when a line was read, FALSE at end of file / on read failure.
BOOL CStdioFileEx::ReadString(CString& rString)
{
    const int nMAX_LINE_CHARS = 4096;   // capacity of the wide line buffer, in wchar_t
    BOOL      bReadData = FALSE;

    // If at position 0, discard the byte-order mark before reading.
    if (!m_pStream || (GetPosition() == 0 && m_bIsUnicodeText))
    {
        wchar_t cDummy;
        Read(&cDummy, sizeof(wchar_t));
    }

#ifdef _UNICODE
    // Compiled for Unicode: the standard implementation is used for both
    // ANSI and Unicode files.
    bReadData = CStdioFile::ReadString(rString);
#else
    if (!m_bIsUnicodeText)
    {
        // ANSI file in an ANSI build: nothing to convert.
        bReadData = CStdioFile::ReadString(rString);
    }
    else
    {
        // One UTF-16 unit can expand to several bytes in a multibyte code
        // page; three bytes per unit covers DBCS code pages and a UTF-8 ANSI
        // code page. (The total still fits the helper's short parameter.)
        const int nMultiByteCapacity = nMAX_LINE_CHARS * 3;

        wchar_t* pszUnicodeString   = new wchar_t[nMAX_LINE_CHARS];
        char*    pszMultiByteString = new char[nMultiByteCapacity];

        // Read as Unicode, convert to ANSI. fgetws() returns NULL at end of file.
        if (fgetws(pszUnicodeString, nMAX_LINE_CHARS, m_pStream) != NULL)
        {
            bReadData = TRUE;

            if (GetMultiByteStringFromUnicodeString(pszUnicodeString, pszMultiByteString,
                                                    static_cast<short>(nMultiByteCapacity)))
            {
                rString = pszMultiByteString;
            }
        }

        // Release the buffers on every path, including end of file; they were
        // allocated with new[] and must be freed with delete[].
        delete [] pszUnicodeString;
        delete [] pszMultiByteString;
    }
#endif

    // Remove the end-of-line characters, if any.
    if (bReadData)
    {
        const int nLen = rString.GetLength();

        if (nLen > 1 && rString[nLen - 2] == _T('\r') && rString[nLen - 1] == _T('\n'))
        {
            rString = rString.Left(nLen - 2);
        }
        else if (nLen != 0 && (rString[nLen - 1] == _T('\r') || rString[nLen - 1] == _T('\n')))
        {
            rString = rString.Left(nLen - 1);
        }
    }

    return bReadData;
}

// --------------------------------------------------------------------------------------------
// CStdioFileEx::WriteString()
//
// Parameters: LPCTSTR lpsz   Null-terminated string to write.
//
// Purpose:  Writes the string either as Unicode or as multibyte text, depending on
//           whether the CStdioFileEx::modeWriteUnicode flag was given when the file
//           was opened. Override of the base class function.
// Notes:    When writing Unicode:
//             a) the byte-order mark is written first if the file position is 0;
//             b) the characters are written as raw bytes through CFile::Write().
//           A Unicode build converts to multibyte when multibyte output is wanted;
//           a multibyte build converts to Unicode when Unicode output is wanted.
//           The converted length (not the source length) decides how many bytes are
//           written, because the two differ for multibyte (e.g. DBCS) text.
// Exceptions: Whatever CFile::Write() / CStdioFile::WriteString() throw is passed on
//           after the temporary conversion buffer has been released.
// --------------------------------------------------------------------------------------------
void CStdioFileEx::WriteString(LPCTSTR lpsz)
{
    // The conversion helpers take their buffer size as a short.
    const int  nMAX_HELPER_BUFFER = 32767;
    const bool bWriteUnicode = (m_nFlags & CStdioFileEx::modeWriteUnicode) != 0;

    // If writing Unicode and at position 0, write the byte-order mark first.
    if (bWriteUnicode && (!m_pStream || GetPosition() == 0))
    {
        wchar_t cBOM = (wchar_t)nUNICODE_BOM;
        CFile::Write(&cBOM, sizeof(wchar_t));
    }

#ifdef _UNICODE
    if (bWriteUnicode)
    {
        // Already Unicode: write the characters in byte mode, no conversion.
        CFile::Write(lpsz, lstrlen(lpsz) * sizeof(wchar_t));
    }
    else
    {
        // Unicode -> multibyte. One UTF-16 unit may need several bytes, so the
        // buffer is sized at three bytes per unit (terminator included).
        const int nChars      = lstrlen(lpsz) + 1;
        const int nBufferSize = nChars * 3;
        const int nHelperSize = (nBufferSize < nMAX_HELPER_BUFFER) ? nBufferSize : nMAX_HELPER_BUFFER;

        char* pszMultiByteString = new char[nBufferSize];

        try
        {
            // The helper only reads its input; the cast just satisfies its
            // non-const parameter type.
            if (GetMultiByteStringFromUnicodeString(const_cast<wchar_t*>(lpsz), pszMultiByteString,
                                                    static_cast<short>(nHelperSize), GetACP()))
            {
                // Write exactly the bytes the conversion produced.
                CFile::Write(pszMultiByteString, static_cast<UINT>(lstrlenA(pszMultiByteString)));
            }
        }
        catch (...)
        {
            delete [] pszMultiByteString;
            throw;
        }

        delete [] pszMultiByteString;
    }
#else
    if (bWriteUnicode)
    {
        // Multibyte -> Unicode. The result never has more UTF-16 units than the
        // source has bytes, so nChars units (terminator included) are enough.
        // The helper expects the buffer size in wchar_t units, not bytes.
        const int nChars      = lstrlen(lpsz) + 1;
        const int nHelperSize = (nChars < nMAX_HELPER_BUFFER) ? nChars : nMAX_HELPER_BUFFER;

        wchar_t* pszUnicodeString = new wchar_t[nChars];

        try
        {
            // The helper only reads its input; the cast just satisfies its
            // non-const parameter type.
            if (GetUnicodeStringFromMultiByteString(const_cast<char*>(lpsz), pszUnicodeString,
                                                    static_cast<short>(nHelperSize), GetACP()))
            {
                // Write exactly the characters the conversion produced.
                CFile::Write(pszUnicodeString,
                             static_cast<UINT>(lstrlenW(pszUnicodeString) * sizeof(wchar_t)));
            }
            else
            {
                ASSERT(false);
            }
        }
        catch (...)
        {
            delete [] pszUnicodeString;
            throw;
        }

        delete [] pszUnicodeString;
    }
    else
    {
        // Multibyte output from a multibyte build: standard behaviour.
        CStdioFile::WriteString(lpsz);
    }
#endif
}

// Examines the requested open flags before the file is opened.
//
// Parameters: sFilePath   Path of the file about to be opened.
//             nOpenFlags  (IN/OUT) Requested open flags; typeText is replaced by
//                         typeBinary when an existing Unicode file is to be read.
// Returns:    The (possibly adjusted) open flags, which are also stored in m_nFlags.
// Side effects: sets m_bIsUnicodeText and m_nFlags.
UINT CStdioFileEx::ProcessFlags(const CString& sFilePath, UINT& nOpenFlags)
{
    m_bIsUnicodeText = false;

    // If we have modeWriteUnicode we must have write or read/write as well.
#ifdef _DEBUG
    if (nOpenFlags & CStdioFileEx::modeWriteUnicode)
    {
        ASSERT(nOpenFlags & CFile::modeWrite || nOpenFlags & CFile::modeReadWrite);
    }
#endif

    // The decision must be based on the flags of THIS open request; m_nFlags
    // still holds the value from a previous call (or nothing at all) here.
    const bool bTextMode = (nOpenFlags & CFile::typeText) != 0;
    const bool bCreating = (nOpenFlags & CFile::modeCreate) != 0;
    const bool bWriting  = (nOpenFlags & CFile::modeWrite) != 0;

    // If reading in text mode and not creating...
    if (bTextMode && !bCreating && !bWriting)
    {
        m_bIsUnicodeText = IsFileUnicode(sFilePath);

        // If it's Unicode, switch to binary mode.
        if (m_bIsUnicodeText)
        {
            nOpenFlags &= ~static_cast<UINT>(CFile::typeText);
            nOpenFlags |= CFile::typeBinary;
        }
    }

    m_nFlags = nOpenFlags;

    return nOpenFlags;
}

// --------------------------------------------------------------------------------------------

//

// CStdioFileEx::IsFileUnicode()

//

// --------------------------------------------------------------------------------------------

// Returns:    bool

// Parameters: const CString& sFilePath

//

// Purpose:  Determines whether a file is Unicode by reading the first character and detecting

//     whether it's the Unicode byte marker.

// Notes:  None.

// Exceptions: None.

//

/*static*/ bool CStdioFileEx::IsFileUnicode(const CString& sFilePath)

{

CFile    file;

bool    bIsUnicode = false;

wchar_t   cFirstChar;

CFileException exFile;

// Open file in binary mode and read first character

if (file.Open(sFilePath, CFile::typeBinary | CFile::modeRead, &exFile))

{

// If byte is Unicode byte-order marker, let's say it's Unicode

if (file.Read(&cFirstChar, sizeof(wchar_t)) > 0 && cFirstChar == (wchar_t)nUNICODE_BOM)

{

bIsUnicode = true;

}

file.Close();

}

else

{

// Handle error here if you like

}

return bIsUnicode;

}

// Returns the number of characters in the file: the file length divided by the
// character size, after subtracting the byte-order mark for Unicode files.
// Returns 0 when no stream is attached.
unsigned long CStdioFileEx::GetCharCount()
{
    if (!m_pStream)
    {
        return 0;
    }

    unsigned long nBytes    = (unsigned long)GetLength();
    int           nUnitSize = sizeof(char);

    if (m_bIsUnicodeText)
    {
        // Wide characters, and the leading byte-order mark is not counted.
        nUnitSize = sizeof(wchar_t);
        nBytes    = nBytes - sizeof(wchar_t);
    }

    return nBytes / nUnitSize;
}

// --------------------------------------------------------------------------------------------

//

// CStdioFileEx::GetUnicodeStringFromMultiByteString()

//

// --------------------------------------------------------------------------------------------

// Returns:    bool

// Parameters: char *  szMultiByteString  (IN) Multi-byte input string

//     wchar_t*  szUnicodeString  (OUT) Unicode outputstring

//     short   nUnicodeBufferSize (IN) Size of Unicode output buffer

//     UINT   nCodePage    (IN) Code page used to perform conversion

//                  Default = -1 (Get local code page).

//

// Purpose:  Gets a Unicode string from a MultiByte string.

// Notes:  None.

// Exceptions: None.

//

bool CStdioFileEx::GetUnicodeStringFromMultiByteString(char * szMultiByteString, wchar_t* szUnicodeString, short nUnicodeBufferSize, UINT nCodePage)

{

bool  bOK = true;

int  nReturn = 0;

CString sErrorMsg;

if (szUnicodeString && szMultiByteString)

{

// If no code page specified, take default for system

if (nCodePage == -1)

{

nCodePage = GetACP();

}

try

{

nReturn = MultiByteToWideChar(nCodePage,MB_PRECOMPOSED,szMultiByteString,-1,szUnicodeString,nUnicodeBufferSize);

if (nReturn == 0)

{

bOK = false;

}

}

catch(...)

{

bOK = false;

}

}

else

{

bOK = false;

}

ASSERT(bOK);

return bOK;

}

// --------------------------------------------------------------------------------------------
// CStdioFileEx::GetMultiByteStringFromUnicodeString()
//
// Returns:    BOOL   The result of WideCharToMultiByte(): non-zero on success,
//                    FALSE on failure or when either pointer is NULL.
// Parameters: wchar_t* szUnicodeString       (IN)  Null-terminated Unicode input string
//             char*    szMultiByteString     (OUT) Multibyte output string
//             short    nMultiByteBufferSize  (IN)  Multibyte buffer size
//             UINT     nCodePage             (IN)  Code page used for the conversion;
//                                                  -1 selects the system code page (GetACP()).
//
// Purpose:  Gets a multibyte string from a Unicode string. Characters that cannot be
//           represented are replaced by sDEFAULT_UNICODE_FILLER_CHAR.
// --------------------------------------------------------------------------------------------
BOOL CStdioFileEx::GetMultiByteStringFromUnicodeString(wchar_t * szUnicodeString, char* szMultiByteString,
                                                       short nMultiByteBufferSize, UINT nCodePage)
{
    BOOL bUsedDefChar = FALSE;
    BOOL bGotIt = FALSE;

    if (szUnicodeString && szMultiByteString)
    {
        // If no code page was specified, take the default for the system.
        if (nCodePage == static_cast<UINT>(-1))
        {
            nCodePage = GetACP();
        }

        try
        {
            bGotIt = WideCharToMultiByte(nCodePage, WC_COMPOSITECHECK | WC_SEPCHARS,
                                         szUnicodeString, -1, szMultiByteString, nMultiByteBufferSize,
                                         sDEFAULT_UNICODE_FILLER_CHAR, &bUsedDefChar);
        }
        catch(...)
        {
            // The message must end in a real newline escape, not the two
            // characters '/' and 'n'.
            TRACE(_T("Controlled exception in WideCharToMultiByte!\n"));
        }
    }

    return bGotIt;
}


本文来自CSDN博客

,转载请标明出处:

http://blog.csdn.net/Augusdi/archive/2009/10/15/4677520.aspx