字符编码认知、互相转换和C++代码判断是UTF8还是GBK

一、unicode编码基础认知

二、C++如何判断是否是UTF8编码

形式一:详见mozilla xpcom\string\nsReadableUtils.cpp:

只要有一个字符不满足UTF8判断条件,就返回false

/**
 * Checks whether aString consists solely of well-formed UTF-8 sequences.
 *
 * The scan stops and returns false at the first byte that violates the rules
 * below; an empty string or a pure-ASCII string yields true.
 *
 * Rejected input:
 *   - continuation bytes [80-BF] where a lead byte is expected, and the lead
 *     bytes C0/C1 (overlong two-byte forms);
 *   - overlong three-/four-byte forms (E0 [80-9F] .., F0 [80-8F] ..);
 *   - surrogate code points (ED [A0-BF] ..);
 *   - code points beyond 0x10FFFF (F4 [90-BF] ..) and lead bytes above F4;
 *   - a string that ends in the middle of a multi-byte sequence;
 *   - when aRejectNonChar is true, the non-character patterns
 *     EF BF [BE-BF] and F[0-4] [89AB]F BF [BE-BF].
 *
 * NOTE(review): UTF8traits::isASCII/is2byte/is3byte/isInSeq are defined
 * outside this snippet; presumably they classify a byte by its high bits --
 * confirm against the UTF8traits definition.
 *
 * @param aString        the byte string to inspect.
 * @param aRejectNonChar if true, non-character code points are treated as
 *                       invalid as well.
 * @return true if no violation was found, false otherwise.
 */
bool
IsUTF8(const nsACString& aString, bool aRejectNonChar)
{
  nsReadingIterator<char> done_reading;
  aString.EndReading(done_reading);

  // Number of continuation bytes still expected for the current sequence;
  // 0 means the next byte must start a new character.
  int32_t state = 0;
  // overlong / surrogate constrain only the first continuation byte: both are
  // cleared again after each continuation byte has been checked.
  bool overlong = false;
  bool surrogate = false;
  // Set while the bytes seen so far may still form a non-character.
  bool nonchar = false;
  uint16_t olupper = 0; // overlong byte upper bound.
  uint16_t slower = 0;  // surrogate byte lower bound.

  nsReadingIterator<char> iter;
  aString.BeginReading(iter);

  const char* ptr = iter.get();
  const char* end = done_reading.get();
  while (ptr < end) {
    uint8_t c;

    // Lead-byte handling: decide how many continuation bytes must follow and
    // which extra restrictions apply to them.
    if (0 == state) {
      c = *ptr++;

      if (UTF8traits::isASCII(c)) {
        continue;
      }

      if (c <= 0xC1) { // [80-BF] where not expected, [C0-C1] for overlong.
        return false;
      } else if (UTF8traits::is2byte(c)) {
        state = 1;
      } else if (UTF8traits::is3byte(c)) {
        state = 2;
        if (c == 0xE0) { // to exclude E0[80-9F][80-BF]
          overlong = true;
          olupper = 0x9F;
        } else if (c == 0xED) { // ED[A0-BF][80-BF] : surrogate codepoint
          surrogate = true;
          slower = 0xA0;
        } else if (c == 0xEF) { // EF BF [BE-BF] : non-character
          nonchar = true;
        }
      } else if (c <= 0xF4) { // XXX replace /w UTF8traits::is4byte when it's updated to exclude [F5-F7].(bug 199090)
        state = 3;
        nonchar = true;
        if (c == 0xF0) { // to exclude F0[80-8F][80-BF]{2}
          overlong = true;
          olupper = 0x8F;
        } else if (c == 0xF4) { // to exclude F4[90-BF][80-BF]
          // actually not surrogates but codepoints beyond 0x10FFFF
          surrogate = true;
          slower = 0x90;
        }
      } else {
        return false;  // Not UTF-8 string
      }
    }

    // The caller did not ask for non-character rejection, so drop the flag
    // before any continuation byte is examined.
    if (nonchar && !aRejectNonChar) {
      nonchar = false;
    }

    // Consume the continuation bytes of the current sequence.
    while (ptr < end && state) {
      c = *ptr++;
      --state;

      // non-character : EF BF [BE-BF] or F[0-7] [89AB]F BF [BE-BF]
      // Clear the flag as soon as one byte does not fit that pattern.
      if (nonchar &&
          ((!state && c < 0xBE) ||
           (state == 1 && c != 0xBF)  ||
           (state == 2 && 0x0F != (0x0F & c)))) {
        nonchar = false;
      }

      // Reject a byte that is not a continuation byte, that falls in the
      // forbidden overlong/surrogate range, or that completes a non-character.
      if (!UTF8traits::isInSeq(c) || (overlong && c <= olupper) ||
          (surrogate && slower <= c) || (nonchar && !state)) {
        return false;  // Not UTF-8 string
      }

      // The range restrictions apply to the first continuation byte only.
      overlong = surrogate = false;
    }
  }
  return !state; // state != 0 at the end indicates an invalid UTF-8 seq.
}

形式二:C++ 标准版

/**
 * Heuristically decides whether a raw byte buffer is UTF-8 encoded text.
 *
 * The buffer is accepted only if it is well-formed UTF-8 AND contains at
 * least one multi-byte sequence. A pure-ASCII or empty buffer therefore
 * yields false (this matches the historical behaviour of this function:
 * ASCII-only data carries no evidence for UTF-8 over any other
 * ASCII-compatible encoding).
 *
 * Well-formedness follows the usual UTF-8 rules:
 *   - lead bytes C2..DF, E0..EF, F0..F4 followed by 1, 2 or 3 continuation
 *     bytes (10xxxxxx);
 *   - overlong forms (C0/C1, E0 80..9F, F0 80..8F), surrogates (ED A0..BF)
 *     and values above U+10FFFF (F4 90.., F5..FF) are rejected;
 *   - a sequence cut off by the end of the buffer is rejected.
 *
 * Note that some GBK/GB2312 byte pairs are also valid two-byte UTF-8
 * sequences, so a true result is a heuristic, not a proof.
 *
 * @param pBuffer pointer to the bytes to inspect (may be null).
 * @param size    number of bytes in pBuffer; values <= 0 yield false.
 * @return true if the buffer is well-formed UTF-8 with at least one
 *         multi-byte character, false otherwise.
 */
bool IsUTF8(const void *pBuffer, int size)
{
	if (pBuffer == nullptr || size <= 0)
		return false;

	const unsigned char *cur = static_cast<const unsigned char *>(pBuffer);
	const unsigned char *const end = cur + size;
	bool sawMultiByte = false;

	while (cur < end)
	{
		const unsigned char lead = *cur;

		if (lead < 0x80) // single-byte ASCII
		{
			++cur;
			continue;
		}

		int trailCount = 0;        // continuation bytes required after the lead
		unsigned char lo = 0x80;   // allowed range of the FIRST continuation byte
		unsigned char hi = 0xBF;

		if (lead < 0xC2) // 80..BF: stray continuation byte; C0/C1: overlong
		{
			return false;
		}
		else if (lead < 0xE0) // two-byte sequence
		{
			trailCount = 1;
		}
		else if (lead < 0xF0) // three-byte sequence
		{
			trailCount = 2;
			if (lead == 0xE0)
				lo = 0xA0; // exclude overlong E0 80..9F
			else if (lead == 0xED)
				hi = 0x9F; // exclude surrogates ED A0..BF
		}
		else if (lead <= 0xF4) // four-byte sequence
		{
			trailCount = 3;
			if (lead == 0xF0)
				lo = 0x90; // exclude overlong F0 80..8F
			else if (lead == 0xF4)
				hi = 0x8F; // exclude code points above U+10FFFF
		}
		else // F5..FF can never appear in UTF-8
		{
			return false;
		}

		// Compare distances instead of computing "end - k", which would form
		// an out-of-range pointer for very small buffers. A truncated
		// sequence is malformed, not valid.
		if (end - cur <= trailCount)
			return false;

		if (cur[1] < lo || cur[1] > hi)
			return false;
		for (int i = 2; i <= trailCount; ++i)
		{
			if ((cur[i] & 0xC0) != 0x80)
				return false;
		}

		sawMultiByte = true;
		cur += trailCount + 1;
	}
	return sawMultiByte;
}

形式一、形式二都存在一定的缺陷:

例如 "通知"两个字的GB2312编码为:0xCD 0xA8(通)、0xD6 0xAA(知),这四个字节恰好也构成两个合法的UTF8双字节序列,因此使用这两种形式的代码都会被误判为UTF8编码。具体哪些字符会被误判,可对照编码表《CP936.TXT》查看。

三、GBK编码基础认知

判断是否是GBK编码:

形式一、C++通用版本

/**
 * Checks whether strIn is structurally valid GBK text.
 *
 * GBK encodes ASCII as a single byte (00..7F) and every other character as
 * two bytes: a lead byte in 81..FE followed by a trail byte in 40..7E or
 * 80..FE. The whole string (strIn.size() bytes) is scanned, including any
 * bytes after an embedded NUL.
 *
 * A string made only of ASCII bytes, and the empty string, are considered
 * valid GBK and yield true.
 *
 * @param strIn the byte string to inspect.
 * @return true if every byte fits the GBK single-/double-byte layout and the
 *         string does not end in the middle of a double-byte character;
 *         false otherwise.
 */
bool isGBKCode(const std::string& strIn)
{
    bool expectTrail = false; // true right after a lead byte has been read

    for (const char raw : strIn) {
        const unsigned char chr = static_cast<unsigned char>(raw);

        if (expectTrail) {
            // Trail byte: 40..7E or 80..FE (7F and FF are not assigned).
            if (chr < 0x40 || chr == 0x7F || chr > 0xFE) {
                return false;
            }
            expectTrail = false;
        }
        else if (chr >= 0x80) {
            // Non-ASCII byte at a character boundary must be a lead byte.
            if (chr < 0x81 || chr > 0xFE) {
                return false;
            }
            expectTrail = true;
        }
        // else: plain ASCII byte, nothing to check.
    }

    // A dangling lead byte at the end of the string is malformed.
    return !expectTrail;
}

形式二、适合mozilla的代码如下:

/**
 * Checks whether aString looks like GBK-encoded text that contains at least
 * one double-byte character.
 *
 * GBK layout: ASCII letters and digits take one byte (00..7F); Chinese
 * characters and special symbols take two bytes, a lead byte in 81..FE
 * followed by a trail byte in 40..7E or 80..FE.
 *
 * A string consisting only of ASCII bytes (including the empty string) is
 * deliberately reported as NOT GBK, so that such input keeps being handled
 * as UTF-8 and the pre-existing logic of the caller is unchanged.
 *
 * @param aString the byte string to inspect.
 * @return true if every byte fits the GBK layout, the string does not end in
 *         the middle of a double-byte character, and at least one
 *         double-byte character is present; false otherwise.
 */
bool
IsGBK(const nsACString& aString)
{
  nsReadingIterator<char> done_reading;
  aString.EndReading(done_reading);

  nsReadingIterator<char> iter;
  aString.BeginReading(iter);

  const char* ptr = iter.get();
  const char* end = done_reading.get();

  bool expectTrail = false;   // true right after a lead byte has been read
  bool sawDoubleByte = false; // true once any non-ASCII lead byte was seen

  while (ptr < end) {
    const uint8_t c = static_cast<uint8_t>(*ptr++);

    if (expectTrail) {
      // Trail byte: 40..7E or 80..FE (7F and FF are not assigned).
      if (c < 0x40 || c == 0x7F || c > 0xFE) {
        return false;
      }
      expectTrail = false;
      continue;
    }

    if (c < 0x80) {
      continue; // plain ASCII byte
    }

    // Non-ASCII byte at a character boundary must be a lead byte.
    if (c < 0x81 || c > 0xFE) {
      return false;
    }
    sawDoubleByte = true;
    expectTrail = true;
  }

  // A dangling lead byte at the end of the string is malformed.
  if (expectTrail) {
    return false;
  }

  // All-ASCII input is not treated as GBK; UTF-8 stays the default.
  return sawDoubleByte;
}

四、GB2312和UTF8互转

在该网站可看unicode编码库:

Index of /Public/MAPPINGS/VENDORS/MICSFT/WINDOWS:https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/

其中,CP936.TXT 是 Windows 代码页 936(即 GBK,GB2312 的超集)到 Unicode 的映射表,可当作 GB2312 编码库使用; 该文档部分截图如下:

以"知"字为例:GB2312编码为:0xD6AA   对应的Unicode码点为 0x77E5(即 U+77E5),其UTF8编码为 0xE7 0x9F 0xA5;

在在线转换网站"iP138 在线UTF-8编码汉字互转工具"(https://tool.ip138.com/utf8/ ,可以把中文转换成UTF-8编码,也支持把UTF-8编码还原成中文)上可以互相转换:

最近更新

  1. TCP协议是安全的吗?

    2024-02-07 05:42:05       18 阅读
  2. 阿里云服务器执行yum,一直下载docker-ce-stable失败

    2024-02-07 05:42:05       19 阅读
  3. 【Python教程】压缩PDF文件大小

    2024-02-07 05:42:05       18 阅读
  4. 通过文章id递归查询所有评论(xml)

    2024-02-07 05:42:05       20 阅读

热门阅读

  1. C#面:.NET中的垃圾回收机制(GC)

    2024-02-07 05:42:05       34 阅读
  2. JVM探险-JIT技术

    2024-02-07 05:42:05       28 阅读
  3. python实现rdbms和neo4j的转换

    2024-02-07 05:42:05       31 阅读
  4. ubuntu22.04编译安装mjpg-streamer

    2024-02-07 05:42:05       34 阅读
  5. tqdm+enumerate+zip组合使用

    2024-02-07 05:42:05       27 阅读
  6. BC107 矩阵转置

    2024-02-07 05:42:05       30 阅读
  7. linux系统lvs命令的使用

    2024-02-07 05:42:05       32 阅读
  8. k8s filebeat 应用日志搜集

    2024-02-07 05:42:05       35 阅读
  9. 学习总结13

    2024-02-07 05:42:05       22 阅读
  10. 9、nfs-subdir-external-provisioner

    2024-02-07 05:42:05       32 阅读
  11. ChatGPT高效提问—基础知识(LM、PLM以及LLM)

    2024-02-07 05:42:05       70 阅读
  12. sql——如果查到的值为空,则显示给出的默认值

    2024-02-07 05:42:05       33 阅读
  13. centos ssh 默认端口 修改

    2024-02-07 05:42:05       39 阅读