当前位置：首页 > news > 正文

获取 UTF-8 编码文本的字符个数，并检测文本是否为合法的 UTF-8 编码

news 2026/1/13 1:00:02

//
// @brief: Count the characters (Unicode code points) in a UTF-8 encoded string
//         while validating the encoding.
// @param: strContent   Bytes to inspect, interpreted as UTF-8 text.
// @ret: int  >= 0: number of code points in strContent. U+FEFF (the byte
//            sequence EF BB BF, i.e. the UTF-8 byte order mark) is not counted.
//            < 0 (always -1): strContent is not well-formed UTF-8.
// @note: Well-formedness follows RFC 3629: sequences are 1-4 bytes long, and
//        overlong encodings, UTF-16 surrogates (U+D800..U+DFFF), values above
//        U+10FFFF and sequences truncated by the end of the input are rejected.
int GetUtf8CharacterCount(const std::string& strContent)
{
    int nChCount = 0;               // code points counted so far
    int nPending = 0;               // continuation bytes still expected
    char32_t codePoint = 0;         // value of the sequence being decoded
    char32_t minCodePoint = 0;      // smallest value allowed for its length

    for (const unsigned char ch : strContent)
    {
        if (0 == nPending)
        {
            // Start of a new character: classify the lead byte.
            if (ch <= 0x7F)
            {
                // 0xxxxxxx: plain ASCII, including 0x7F (DEL).
                ++nChCount;
            }
            else if (0xC0 == (ch & 0xE0))
            {
                // 110xxxxx: two-byte sequence, U+0080..U+07FF.
                nPending = 1;
                codePoint = ch & 0x1F;
                minCodePoint = 0x80;
            }
            else if (0xE0 == (ch & 0xF0))
            {
                // 1110xxxx: three-byte sequence, U+0800..U+FFFF.
                nPending = 2;
                codePoint = ch & 0x0F;
                minCodePoint = 0x800;
            }
            else if (0xF0 == (ch & 0xF8))
            {
                // 11110xxx: four-byte sequence, U+10000..U+10FFFF.
                nPending = 3;
                codePoint = ch & 0x07;
                minCodePoint = 0x10000;
            }
            else
            {
                // Stray continuation byte (10xxxxxx) or a lead byte of the
                // obsolete 5/6-byte forms (F8..FF).
                return -1;
            }
            continue;
        }

        // Inside a multi-byte sequence every byte must look like 10xxxxxx.
        // This check comes before any ASCII shortcut so that an ASCII byte
        // cannot hide inside an unfinished sequence.
        if (0x80 != (ch & 0xC0))
        {
            return -1;
        }

        codePoint = (codePoint << 6) | (ch & 0x3F);
        --nPending;
        if (0 != nPending)
        {
            continue;
        }

        // The sequence is complete: validate the decoded value.
        if (codePoint < minCodePoint)
        {
            return -1;      // overlong encoding, e.g. C0 80
        }
        if (codePoint >= 0xD800 && codePoint <= 0xDFFF)
        {
            return -1;      // UTF-16 surrogate, not a valid scalar value
        }
        if (codePoint > 0x10FFFF)
        {
            return -1;      // beyond the Unicode code space
        }

        // With overlong forms rejected, U+FEFF can only be spelled EF BB BF,
        // so this excludes exactly the byte order mark sequence.
        if (0xFEFF != codePoint)
        {
            ++nChCount;
        }
    }

    // Input that ends in the middle of a sequence is malformed.
    if (0 != nPending)
    {
        return -1;
    }
    return nChCount;
}

#include <iostream>
#include <fstream>int main(int argc, char* argv[])
{std::string strContent = u8"1234567890汉字编码";int uCount = GetUtf8CharacterCount(strContent);return 0;
}