文本编码详解：HTML实体、ASCII、Unicode与字符编码原理

2024-01-17 - QubitTool 技术团队

字符编码是计算机处理文本的基础。理解编码原理不仅能帮助解决乱码问题，还能在 Web 安全、国际化开发中发挥重要作用。本文将深入讲解各种编码方式的原理和实现。

字符编码基础

为什么需要字符编码？

计算机只能处理数字（二进制），而人类使用文字。字符编码就是建立字符与数字之间的映射关系：

code

字符 'A' → 数字 65 → 二进制 01000001
字符 '中' → 数字 20013 → 二进制 ...

编码发展历程

code

ASCII (1963) → 扩展ASCII → ISO-8859 → Unicode (1991) → UTF-8/UTF-16
     ↓              ↓           ↓            ↓
   7位/128字符   8位/256字符   区域编码    统一编码

ASCII 编码

ASCII 基础

ASCII（American Standard Code for Information Interchange）是最基础的字符编码：

范围：0-127（7位）
字符数：128个
包含：英文字母、数字、标点、控制字符

ASCII 码表

范围	类型	示例
0-31	控制字符	NUL, TAB, LF, CR
32-47	标点符号	空格, !, ", #
48-57	数字	0-9
65-90	大写字母	A-Z
97-122	小写字母	a-z
123-126	其他符号	{, |, }, ~
127	控制字符	DEL

ASCII 转换实现

javascript

/**
 * Helpers for converting between characters and their numeric codes,
 * aimed at the ASCII range (0-127).
 */
class ASCIIConverter {
  /**
   * Returns the UTF-16 code unit at index 0 of `char`.
   * @param {string} char - Source string; only the first unit is read.
   * @returns {number} The code unit, or NaN for an empty string.
   */
  static charToCode(char) {
    return char.charCodeAt(0);
  }

  /**
   * Builds a one-unit string from a numeric code.
   * @param {number} code - UTF-16 code unit value.
   * @returns {string}
   */
  static codeToChar(code) {
    return String.fromCharCode(code);
  }

  /**
   * Describes every character of `str` as decimal, hex and binary.
   * @param {string} str - Text to inspect.
   * @returns {Array<{char: string, decimal: number, hex: string, binary: string}>}
   *   One entry per code point; `binary` is left-padded to at least 8 digits.
   */
  static stringToASCII(str) {
    // Array.from walks the string by code point, so the numeric value is
    // read with codePointAt to stay in agreement with `char` for
    // characters outside the BMP (charCodeAt would return only the high
    // surrogate).
    return Array.from(str, (char) => {
      const code = char.codePointAt(0);
      return {
        char,
        decimal: code,
        hex: code.toString(16).toUpperCase(),
        binary: code.toString(2).padStart(8, '0'),
      };
    });
  }

  /**
   * Joins a list of numeric codes into a string.
   * @param {number[]} codes - UTF-16 code unit values.
   * @returns {string}
   */
  static asciiToString(codes) {
    return codes.map(code => String.fromCharCode(code)).join('');
  }

  /**
   * Tests whether every UTF-16 unit of `str` is in 0x00-0x7F.
   * @param {string} str
   * @returns {boolean} True for the empty string as well.
   */
  static isASCII(str) {
    return /^[\x00-\x7F]*$/.test(str);
  }

  /**
   * Upper-cases the first character of `char` if it is an ASCII
   * lowercase letter (a-z); otherwise returns `char` unchanged.
   * @param {string} char
   * @returns {string}
   */
  static toUpperCase(char) {
    const code = char.charCodeAt(0);
    if (code >= 97 && code <= 122) {
      // Upper- and lower-case ASCII letters are 32 apart
      return String.fromCharCode(code - 32);
    }
    return char;
  }

  /**
   * Lower-cases the first character of `char` if it is an ASCII
   * uppercase letter (A-Z); otherwise returns `char` unchanged.
   * @param {string} char
   * @returns {string}
   */
  static toLowerCase(char) {
    const code = char.charCodeAt(0);
    if (code >= 65 && code <= 90) {
      return String.fromCharCode(code + 32);
    }
    return char;
  }
}

// Usage example
console.log(ASCIIConverter.stringToASCII('Hello'));
// [
//   { char: 'H', decimal: 72, hex: '48', binary: '01001000' },
//   { char: 'e', decimal: 101, hex: '65', binary: '01100101' },
//   ...
// ]

console.log(ASCIIConverter.asciiToString([72, 101, 108, 108, 111]));
// "Hello"

Python ASCII 实现

python

class ASCIIConverter:
    """Static helpers that map characters to numeric codes and back."""

    @staticmethod
    def char_to_code(char: str) -> int:
        """Return the code point of the single character ``char``."""
        code = ord(char)
        return code

    @staticmethod
    def code_to_char(code: int) -> str:
        """Return the character whose code point is ``code``."""
        symbol = chr(code)
        return symbol

    @staticmethod
    def string_to_ascii(s: str) -> list:
        """Describe each character of ``s`` as decimal, hex and binary.

        Returns a list of dicts with the keys ``char``, ``decimal``,
        ``hex`` (upper-case, no prefix) and ``binary`` (zero-padded to at
        least 8 digits).
        """
        rows = []
        for symbol in s:
            value = ord(symbol)
            rows.append({
                'char': symbol,
                'decimal': value,
                'hex': format(value, 'X'),
                'binary': format(value, '08b'),
            })
        return rows

    @staticmethod
    def ascii_to_string(codes: list) -> str:
        """Join the characters for each code in ``codes`` into one string."""
        return ''.join(map(chr, codes))

    @staticmethod
    def is_ascii(s: str) -> bool:
        """Return True when every character of ``s`` is below 128."""
        for symbol in s:
            if ord(symbol) >= 128:
                return False
        return True

# Usage example
print(ASCIIConverter.string_to_ascii('Hello'))
print(ASCIIConverter.ascii_to_string([72, 101, 108, 108, 111]))

Unicode 编码

Unicode 基础

Unicode 是一个字符集标准，为世界上所有字符分配唯一的码点（Code Point）：

范围：U+0000 到 U+10FFFF
字符数：超过 140,000 个
表示：U+XXXX（十六进制）

Unicode 平面

平面	范围	名称	内容
0	U+0000-U+FFFF	BMP	常用字符
1	U+10000-U+1FFFF	SMP	Emoji、古文字
2	U+20000-U+2FFFF	SIP	扩展汉字
14	U+E0000-U+EFFFF	SSP	特殊用途

Unicode 转换实现

javascript

/**
 * Utilities for inspecting Unicode code points, their UTF-8 / UTF-16
 * representations, and `\u` escape sequences.
 */
class UnicodeConverter {
  /**
   * @param {string} char - Source string; the code point at index 0 is read.
   * @returns {number|undefined} The code point, or undefined for ''.
   */
  static charToCodePoint(char) {
    const codePoint = char.codePointAt(0);
    return codePoint;
  }

  /**
   * @param {number} codePoint - Unicode code point.
   * @returns {string} The string for that code point.
   */
  static codePointToChar(codePoint) {
    const char = String.fromCodePoint(codePoint);
    return char;
  }

  /**
   * Describes every code point of `str`.
   * @param {string} str - Text to inspect.
   * @returns {Array<{char: string, codePoint: number, unicode: string,
   *   utf8: string[], utf16: string[]}>}
   */
  static stringToUnicode(str) {
    // Array.from visits the string one code point at a time
    return Array.from(str, (char) => {
      const codePoint = char.codePointAt(0);
      const hexLabel = codePoint.toString(16).toUpperCase().padStart(4, '0');
      return {
        char,
        codePoint,
        unicode: `U+${hexLabel}`,
        utf8: this.toUTF8Bytes(codePoint),
        utf16: this.toUTF16(codePoint),
      };
    });
  }

  /**
   * Computes the UTF-8 byte sequence of a code point.
   * @param {number} codePoint
   * @returns {string[]} Two-digit upper-case hex strings, one per byte.
   */
  static toUTF8Bytes(codePoint) {
    // Continuation byte: 10xxxxxx carrying six payload bits
    const continuation = (shift) => 0x80 | ((codePoint >> shift) & 0x3F);

    let bytes;
    if (codePoint <= 0x7F) {
      bytes = [codePoint];
    } else if (codePoint <= 0x7FF) {
      bytes = [0xC0 | (codePoint >> 6), continuation(0)];
    } else if (codePoint <= 0xFFFF) {
      bytes = [0xE0 | (codePoint >> 12), continuation(6), continuation(0)];
    } else {
      bytes = [
        0xF0 | (codePoint >> 18),
        continuation(12),
        continuation(6),
        continuation(0),
      ];
    }

    return bytes.map((byte) => byte.toString(16).toUpperCase().padStart(2, '0'));
  }

  /**
   * Computes the UTF-16 code units of a code point.
   * @param {number} codePoint
   * @returns {string[]} One hex string (BMP, padded to 4 digits) or two
   *   (surrogate pair), upper-case.
   */
  static toUTF16(codePoint) {
    const toHex = (unit) => unit.toString(16).toUpperCase();

    if (codePoint <= 0xFFFF) {
      return [toHex(codePoint).padStart(4, '0')];
    }

    // Surrogate pair: split the 20-bit offset into two 10-bit halves
    const offset = codePoint - 0x10000;
    const highSurrogate = 0xD800 + (offset >> 10);
    const lowSurrogate = 0xDC00 + (offset & 0x3FF);
    return [toHex(highSurrogate), toHex(lowSurrogate)];
  }

  /**
   * Replaces every code point of `str` with a `\uXXXX` escape, or
   * `\u{X...}` for code points above U+FFFF.
   * @param {string} str
   * @returns {string}
   */
  static escapeUnicode(str) {
    let escaped = '';
    for (const char of str) {
      const hex = char.codePointAt(0).toString(16).toUpperCase();
      // More than four hex digits means the code point is above U+FFFF
      escaped += hex.length > 4
        ? `\\u{${hex}}`
        : `\\u${hex.padStart(4, '0')}`;
    }
    return escaped;
  }

  /**
   * Expands `\uXXXX` and `\u{X...}` escapes in `str`.
   * @param {string} str
   * @returns {string}
   */
  static unescapeUnicode(str) {
    const expand = (match, braced, fixed) =>
      String.fromCodePoint(parseInt(braced || fixed, 16));
    return str.replace(/\\u\{([0-9A-Fa-f]+)\}|\\u([0-9A-Fa-f]{4})/g, expand);
  }
}

// Usage example
console.log(UnicodeConverter.stringToUnicode('你好👋'));
// [
//   { char: '你', codePoint: 20320, unicode: 'U+4F60', utf8: ['E4', 'BD', 'A0'], ... },
//   { char: '好', codePoint: 22909, unicode: 'U+597D', utf8: ['E5', 'A5', 'BD'], ... },
//   { char: '👋', codePoint: 128075, unicode: 'U+1F44B', utf8: ['F0', '9F', '91', '8B'], ... }
// ]

console.log(UnicodeConverter.escapeUnicode('Hello 世界'));
// "\u0048\u0065\u006C\u006C\u006F\u0020\u4E16\u754C"

Python Unicode 实现

python

class UnicodeConverter:
    """Static helpers for inspecting code points and backslash-u escapes."""

    @staticmethod
    def char_to_codepoint(char: str) -> int:
        """Return the code point of the single character ``char``."""
        return ord(char)

    @staticmethod
    def codepoint_to_char(codepoint: int) -> str:
        """Return the character for ``codepoint``."""
        return chr(codepoint)

    @staticmethod
    def string_to_unicode(s: str) -> list:
        """Describe every character of ``s``.

        Returns a list of dicts with the keys ``char``, ``codepoint``,
        ``unicode`` (``U+XXXX``), ``utf8`` and ``utf16`` (upper-case hex of
        the character's UTF-8 and UTF-16-BE bytes).
        """
        result = []
        for char in s:
            codepoint = ord(char)
            result.append({
                'char': char,
                'codepoint': codepoint,
                'unicode': f'U+{codepoint:04X}',
                # Encode the current character, not the whole input string
                'utf8': char.encode('utf-8').hex().upper(),
                'utf16': char.encode('utf-16-be').hex().upper()
            })
        return result

    @staticmethod
    def escape_unicode(s: str) -> str:
        """Replace every character with a backslash escape.

        Code points up to U+FFFF use the 4-digit lowercase-u form; higher
        code points use the 8-digit uppercase-U form.
        """
        return ''.join(f'\\u{ord(c):04X}' if ord(c) <= 0xFFFF
                       else f'\\U{ord(c):08X}' for c in s)

    @staticmethod
    def unescape_unicode(s: str) -> str:
        """Expand backslash escape sequences in ``s``.

        Only the escape sequences themselves are passed through the
        ``unicode_escape`` codec; all other text is left untouched.
        Decoding the whole string through that codec would read its UTF-8
        bytes as Latin-1 and corrupt any literal non-ASCII characters.
        """
        import re

        escape = re.compile(
            r'''\\(?:U[0-9A-Fa-f]{8}|u[0-9A-Fa-f]{4}|x[0-9A-Fa-f]{2}'''
            r'''|N\{[A-Za-z0-9 \-]+\}|[0-7]{1,3}|[\\'"abfnrtv\n])'''
        )
        return escape.sub(
            lambda m: m.group(0).encode('ascii').decode('unicode_escape'),
            s,
        )

# Usage example
print(UnicodeConverter.string_to_unicode('你好'))
print(UnicodeConverter.escape_unicode('Hello 世界'))

UTF-8 编码

UTF-8 原理

UTF-8 是 Unicode 的一种变长编码方式：

Unicode 范围	UTF-8 字节数	编码格式
U+0000-U+007F	1	0xxxxxxx
U+0080-U+07FF	2	110xxxxx 10xxxxxx
U+0800-U+FFFF	3	1110xxxx 10xxxxxx 10xxxxxx
U+10000-U+10FFFF	4	11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

UTF-8 编码示例

以汉字"中"（U+4E2D）为例：

code

1. 码点：0x4E2D = 0100 1110 0010 1101
2. 范围：U+0800-U+FFFF，需要3字节
3. 模板：1110xxxx 10xxxxxx 10xxxxxx
4. 填充：
   - 1110 0100 (E4)
   - 10 111000 (B8)
   - 10 101101 (AD)
5. 结果：E4 B8 AD

UTF-8 编解码实现

javascript

/**
 * Hand-written UTF-8 encoder/decoder plus hex-string helpers.
 */
class UTF8Codec {
  /**
   * Encodes a string as UTF-8.
   * @param {string} str - Text to encode.
   * @returns {Uint8Array} The encoded bytes.
   */
  static encode(str) {
    const bytes = [];
    for (const char of str) {
      let codePoint = char.codePointAt(0);

      // A lone surrogate has no valid UTF-8 form; substitute U+FFFD
      // instead of emitting a 3-byte surrogate encoding.
      if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
        codePoint = 0xFFFD;
      }

      if (codePoint <= 0x7F) {
        bytes.push(codePoint);
      } else if (codePoint <= 0x7FF) {
        bytes.push(0xC0 | (codePoint >> 6));
        bytes.push(0x80 | (codePoint & 0x3F));
      } else if (codePoint <= 0xFFFF) {
        bytes.push(0xE0 | (codePoint >> 12));
        bytes.push(0x80 | ((codePoint >> 6) & 0x3F));
        bytes.push(0x80 | (codePoint & 0x3F));
      } else {
        bytes.push(0xF0 | (codePoint >> 18));
        bytes.push(0x80 | ((codePoint >> 12) & 0x3F));
        bytes.push(0x80 | ((codePoint >> 6) & 0x3F));
        bytes.push(0x80 | (codePoint & 0x3F));
      }
    }
    return new Uint8Array(bytes);
  }

  /**
   * Decodes UTF-8 bytes into a string.
   *
   * Malformed input never throws: an invalid lead byte, a missing or
   * invalid continuation byte, an overlong form, an encoded surrogate or
   * a value above U+10FFFF produces U+FFFD, and decoding resumes at the
   * next byte.
   *
   * @param {Uint8Array|number[]} bytes - Bytes to decode.
   * @returns {string} The decoded text.
   */
  static decode(bytes) {
    const REPLACEMENT = '\uFFFD';
    let result = '';
    let i = 0;

    while (i < bytes.length) {
      const lead = bytes[i];

      if ((lead & 0x80) === 0) {
        result += String.fromCodePoint(lead);
        i += 1;
        continue;
      }

      // Number of continuation bytes, payload bits of the lead byte and
      // the smallest code point this length may legally encode.
      let needed;
      let codePoint;
      let minimum;
      if ((lead & 0xE0) === 0xC0) {
        needed = 1;
        codePoint = lead & 0x1F;
        minimum = 0x80;
      } else if ((lead & 0xF0) === 0xE0) {
        needed = 2;
        codePoint = lead & 0x0F;
        minimum = 0x800;
      } else if ((lead & 0xF8) === 0xF0) {
        needed = 3;
        codePoint = lead & 0x07;
        minimum = 0x10000;
      } else {
        // Stray continuation byte (10xxxxxx) or 0xF8-0xFF
        result += REPLACEMENT;
        i += 1;
        continue;
      }

      let wellFormed = true;
      for (let k = 1; k <= needed; k += 1) {
        const next = bytes[i + k];
        if (next === undefined || (next & 0xC0) !== 0x80) {
          wellFormed = false;
          break;
        }
        codePoint = (codePoint << 6) | (next & 0x3F);
      }

      const isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
      if (!wellFormed || codePoint < minimum || codePoint > 0x10FFFF || isSurrogate) {
        result += REPLACEMENT;
        i += 1;
        continue;
      }

      result += String.fromCodePoint(codePoint);
      i += needed + 1;
    }

    return result;
  }

  /**
   * Formats bytes as space-separated two-digit upper-case hex.
   * @param {Iterable<number>} bytes
   * @returns {string}
   */
  static toHexString(bytes) {
    return Array.from(bytes)
      .map(b => b.toString(16).toUpperCase().padStart(2, '0'))
      .join(' ');
  }

  /**
   * Parses whitespace-separated hex byte values.
   * @param {string} hexStr - e.g. "E4 BD A0".
   * @returns {Uint8Array}
   */
  static fromHexString(hexStr) {
    const bytes = hexStr.split(/\s+/)
      .filter(s => s.length > 0)
      .map(s => parseInt(s, 16));
    return new Uint8Array(bytes);
  }
}

// Usage example
const encoded = UTF8Codec.encode('你好世界');
console.log(UTF8Codec.toHexString(encoded));
// "E4 BD A0 E5 A5 BD E4 B8 96 E7 95 8C"

const decoded = UTF8Codec.decode(encoded);
console.log(decoded);
// "你好世界"

HTML 实体编码

什么是 HTML 实体？

HTML 实体是用于在 HTML 中表示特殊字符的编码方式：

html

<!-- 命名实体 -->
&lt;    → <
&gt;    → >
&amp;   → &
&quot;  → "
&nbsp;  → 不换行空格

<!-- 数字实体 -->
&#60;   → < (十进制)
&#x3C;  → < (十六进制)

为什么需要 HTML 实体？

避免解析错误：< 和 > 会被解析为标签
防止 XSS 攻击：转义用户输入
显示特殊字符：版权符号 ©、商标 ™ 等

HTML 实体编码实现

javascript

/**
 * Encodes and decodes HTML character references.
 */
class HTMLEntityEncoder {
  // Characters that always get a fixed replacement when encoding
  static namedEntities = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;',
    '/': '&#x2F;',
    '`': '&#x60;',
    '=': '&#x3D;'
  };

  // Entity names (lower-case, without & and ;) understood by decode()
  static reverseEntities = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
    'apos': "'",
    'nbsp': '\u00A0',
    'copy': '©',
    'reg': '®',
    'trade': '™',
    'euro': '€',
    'pound': '£',
    'yen': '¥',
    'cent': '¢'
  };

  /**
   * Escapes HTML-significant characters and every non-ASCII character.
   *
   * @param {string} str - Text to encode.
   * @param {{mode?: string, encodeAll?: boolean}} [options]
   *   `mode: 'hex'` writes non-ASCII characters as `&#xHH;`; any other
   *   value writes decimal `&#NN;`. `encodeAll` is accepted for
   *   compatibility; every character the pattern matches is either in
   *   `namedEntities` or above 127, so the flag currently changes nothing.
   * @returns {string} The encoded text.
   */
  static encode(str, options = {}) {
    const { mode = 'named', encodeAll = false } = options;

    // The `u` flag makes the pattern match whole code points, so a
    // character outside the BMP becomes one entity rather than two
    // surrogate entities.
    return str.replace(/[&<>"'`=\/]|[^\x00-\x7F]/gu, char => {
      const named = this.namedEntities[char];
      if (named !== undefined) {
        return named;
      }

      const code = char.codePointAt(0);
      if (encodeAll || code > 127) {
        return mode === 'hex'
          ? `&#x${code.toString(16).toUpperCase()};`
          : `&#${code};`;
      }

      return char;
    });
  }

  /**
   * Decodes named, decimal and hexadecimal character references.
   *
   * All three forms are handled in one pass, so text produced by decoding
   * one reference is never decoded a second time (`&amp;#60;` yields
   * `&#60;`, not `<`). Unknown names and numeric values above U+10FFFF
   * are left unchanged.
   *
   * @param {string} str - Text to decode.
   * @returns {string} The decoded text.
   */
  static decode(str) {
    return str.replace(
      /&(?:#[xX]([0-9A-Fa-f]+)|#(\d+)|([a-zA-Z]+));/g,
      (match, hex, dec, name) => {
        if (name !== undefined) {
          const key = name.toLowerCase();
          // Own-property check keeps inherited keys such as
          // "constructor" from being treated as entities
          return Object.hasOwn(this.reverseEntities, key)
            ? this.reverseEntities[key]
            : match;
        }

        const code = hex !== undefined ? parseInt(hex, 16) : parseInt(dec, 10);
        // String.fromCodePoint throws RangeError above U+10FFFF
        return code > 0x10FFFF ? match : String.fromCodePoint(code);
      }
    );
  }

  /**
   * Escapes & < > " ' for use inside an attribute value.
   * @param {string} str
   * @returns {string}
   */
  static encodeForAttribute(str) {
    return str.replace(/[&<>"']/g, char => this.namedEntities[char]);
  }

  /**
   * Escapes & < > for use in element content.
   * @param {string} str
   * @returns {string}
   */
  static encodeForHTML(str) {
    return str.replace(/[&<>]/g, char => this.namedEntities[char]);
  }

  /**
   * Replaces every non-ASCII code point with a decimal reference and
   * leaves ASCII untouched.
   * @param {string} str
   * @returns {string}
   */
  static encodeNonASCII(str) {
    // `u` flag: match by code point so surrogate pairs stay together
    return str.replace(/[^\x00-\x7F]/gu, char => {
      return `&#${char.codePointAt(0)};`;
    });
  }
}

// Usage example
console.log(HTMLEntityEncoder.encode('<script>alert("XSS")</script>'));
// "&lt;script&gt;alert(&quot;XSS&quot;)&lt;&#x2F;script&gt;"
// ('/' is in namedEntities, so it is written as &#x2F;)

console.log(HTMLEntityEncoder.encode('版权 © 2024', { encodeAll: true }));
// "&#29256;&#26435; &#169; 2024"

console.log(HTMLEntityEncoder.decode('&lt;div&gt;Hello&lt;/div&gt;'));
// "<div>Hello</div>"

Python HTML 实体实现

python

import html
import re

class HTMLEntityEncoder:
    """Wrappers around the ``html`` module plus numeric-reference helpers."""

    @staticmethod
    def encode(s: str, quote: bool = True) -> str:
        """Escape ``s`` with ``html.escape``, forwarding ``quote``."""
        escaped = html.escape(s, quote=quote)
        return escaped

    @staticmethod
    def decode(s: str) -> str:
        """Expand character references in ``s`` with ``html.unescape``."""
        plain = html.unescape(s)
        return plain

    @staticmethod
    def encode_non_ascii(s: str) -> str:
        """Replace characters above 127 with decimal references."""
        pieces = []
        for c in s:
            codepoint = ord(c)
            pieces.append(c if codepoint <= 127 else f'&#{codepoint};')
        return ''.join(pieces)

    @staticmethod
    def encode_all(s: str, use_hex: bool = False) -> str:
        """Replace every character with a numeric reference.

        Uses upper-case hexadecimal references when ``use_hex`` is true and
        decimal references otherwise.
        """
        template = '&#x{:X};' if use_hex else '&#{};'
        return ''.join(template.format(ord(c)) for c in s)

# Usage example
print(HTMLEntityEncoder.encode('<script>alert("XSS")</script>'))
# &lt;script&gt;alert(&quot;XSS&quot;)&lt;/script&gt;

print(HTMLEntityEncoder.decode('&lt;div&gt;'))
# <div>

常用 HTML 实体对照表

字符	命名实体	十进制	十六进制	描述
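<	`&lt;`	`&#60;`	`&#x3C;`	小于号
>	`&gt;`	`&#62;`	`&#x3E;`	大于号
&	`&amp;`	`&#38;`	`&#x26;`	和号
"	`&quot;`	`&#34;`	`&#x22;`	双引号
'	`&apos;`	`&#39;`	`&#x27;`	单引号
©	`&copy;`	`&#169;`	`&#xA9;`	版权
®	`&reg;`	`&#174;`	`&#xAE;`	注册商标
™	`&trade;`	`&#8482;`	`&#x2122;`	商标
€	`&euro;`	`&#8364;`	`&#x20AC;`	欧元
£	`&pound;`	`&#163;`	`&#xA3;`	英镑
¥	`&yen;`	`&#165;`	`&#xA5;`	日元/人民币
	`&nbsp;`	`&#160;`	`&#xA0;`	不换行空格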

URL 编码

URL 编码原理

URL 编码（Percent-encoding）用于在 URL 中安全传输特殊字符：

code

空格 → %20（在 application/x-www-form-urlencoded 表单数据中也可编码为 +）
中文 → UTF-8 字节的十六进制

URL 编码实现

javascript

/**
 * Convenience wrappers around the built-in percent-encoding functions.
 */
class URLEncoder {
  /**
   * Percent-encodes a URL component via encodeURIComponent.
   * @param {string} str
   * @returns {string}
   */
  static encode(str) {
    return encodeURIComponent(str);
  }

  /**
   * Decodes a percent-encoded component via decodeURIComponent.
   * @param {string} str
   * @returns {string}
   */
  static decode(str) {
    return decodeURIComponent(str);
  }

  /**
   * Encodes a URI via the global encodeURI.
   * @param {string} str
   * @returns {string}
   */
  static encodeURI(str) {
    return encodeURI(str);
  }

  /**
   * Decodes a URI via the global decodeURI.
   * @param {string} str
   * @returns {string}
   */
  static decodeURI(str) {
    return decodeURI(str);
  }

  /**
   * Serializes an object as `key=value` pairs joined with `&`.
   * @param {Object} params - Own enumerable properties are serialized.
   * @returns {string}
   */
  static encodeQueryParam(params) {
    return Object.entries(params)
      .map(([key, value]) =>
        `${encodeURIComponent(key)}=${encodeURIComponent(value)}`
      )
      .join('&');
  }

  /**
   * Parses a query string into an object of decoded keys and values.
   *
   * A leading `?` is ignored, empty segments are skipped, and each pair
   * is split at its first `=` only, so values that themselves contain
   * `=` are preserved. A key without `=` maps to ''. `+` is not
   * translated to a space.
   *
   * @param {string} queryString - e.g. "?a=1&b=2".
   * @returns {Object<string, string>}
   */
  static decodeQueryParam(queryString) {
    const params = {};
    const pairs = queryString.replace(/^\?/, '').split('&');

    for (const pair of pairs) {
      if (pair === '') {
        continue;
      }

      const separator = pair.indexOf('=');
      const rawKey = separator === -1 ? pair : pair.slice(0, separator);
      const rawValue = separator === -1 ? '' : pair.slice(separator + 1);
      params[decodeURIComponent(rawKey)] = decodeURIComponent(rawValue);
    }

    return params;
  }

  /**
   * Like encode(), but additionally percent-encodes ! ' ( ) *.
   * @param {string} str
   * @returns {string}
   */
  static encodeRFC3986(str) {
    return encodeURIComponent(str)
      .replace(/[!'()*]/g, c => `%${c.charCodeAt(0).toString(16).toUpperCase()}`);
  }
}

// Usage example
console.log(URLEncoder.encode('你好 世界'));
// "%E4%BD%A0%E5%A5%BD%20%E4%B8%96%E7%95%8C"

console.log(URLEncoder.encodeQueryParam({
  name: '张三',
  message: 'Hello World!'
}));
// "name=%E5%BC%A0%E4%B8%89&message=Hello%20World!"

实际应用场景

1. XSS 防护

javascript

/**
 * Escapes `input` by delegating to HTMLEntityEncoder.encode.
 * @param {string} input - Text to escape.
 * @returns {string} Whatever HTMLEntityEncoder.encode returns for `input`.
 */
function sanitizeHTML(input) {
  return HTMLEntityEncoder.encode(input);
}

/**
 * Creates an element and assigns `text` to its textContent.
 * @param {string} tag - Tag name passed to document.createElement.
 * @param {string} text - Value assigned to textContent.
 * @returns {Element} The newly created element.
 */
function createSafeElement(tag, text) {
  const element = document.createElement(tag);
  element.textContent = text;  // escaped automatically
  return element;
}

// Unsafe
element.innerHTML = userInput;  // ❌ XSS risk

// Safe
element.textContent = userInput;  // ✅ escaped automatically
element.innerHTML = sanitizeHTML(userInput);  // ✅ escaped manually

2. 国际化文本处理

javascript

/**
 * Returns `str` normalized with String.prototype.normalize('NFC').
 * @param {string} str
 * @returns {string}
 */
function normalizeText(str) {
  // NFD: canonical decomposition
  // NFC: canonical composition
  // NFKD: compatibility decomposition
  // NFKC: compatibility composition
  return str.normalize('NFC');
}

/**
 * Compares two strings with String.prototype.localeCompare.
 * @param {string} a
 * @param {string} b
 * @param {string} [locale='zh-CN'] - Locale passed to localeCompare.
 * @returns {number} The result of a.localeCompare(b, locale).
 */
function compareStrings(a, b, locale = 'zh-CN') {
  return a.localeCompare(b, locale);
}

// Full-width / half-width conversion

/**
 * Converts full-width forms (U+FF01-U+FF5E) to their ASCII counterparts
 * and the ideographic space (U+3000) to a regular space.
 * @param {string} str
 * @returns {string}
 */
function toHalfWidth(str) {
  // Full-width forms sit exactly 0xFEE0 above their ASCII equivalents
  const shiftDown = (char) => String.fromCharCode(char.charCodeAt(0) - 0xFEE0);
  const narrowed = str.replace(/[\uFF01-\uFF5E]/g, shiftDown);
  return narrowed.replace(/\u3000/g, ' ');
}

/**
 * Converts printable ASCII (0x21-0x7E) to full-width forms and regular
 * spaces to the ideographic space (U+3000).
 * @param {string} str
 * @returns {string}
 */
function toFullWidth(str) {
  // Full-width forms sit exactly 0xFEE0 above their ASCII equivalents
  const shiftUp = (char) => String.fromCharCode(char.charCodeAt(0) + 0xFEE0);
  const widened = str.replace(/[\x21-\x7E]/g, shiftUp);
  return widened.replace(/ /g, '\u3000');
}

3. 文件编码检测与转换

javascript

/**
 * Guesses the text encoding of a file-like object.
 *
 * Byte-order marks are checked first (UTF-8, then UTF-16LE, then
 * UTF-16BE). Without a BOM the bytes are strictly decoded as UTF-8;
 * success reports 'UTF-8', failure reports 'unknown'.
 *
 * @param {{arrayBuffer: function(): Promise<ArrayBuffer>}} file
 * @returns {Promise<string>} 'UTF-8', 'UTF-16LE', 'UTF-16BE' or 'unknown'.
 */
async function detectEncoding(file) {
  const bytes = new Uint8Array(await file.arrayBuffer());

  // BOM signatures, tested in this order
  const signatures = [
    { label: 'UTF-8', prefix: [0xEF, 0xBB, 0xBF] },
    { label: 'UTF-16LE', prefix: [0xFF, 0xFE] },
    { label: 'UTF-16BE', prefix: [0xFE, 0xFF] },
  ];
  for (const { label, prefix } of signatures) {
    if (prefix.every((value, index) => bytes[index] === value)) {
      return label;
    }
  }

  // No BOM: a fatal decoder throws on the first invalid UTF-8 sequence
  const strictDecoder = new TextDecoder('utf-8', { fatal: true });
  try {
    strictDecoder.decode(bytes);
  } catch {
    return 'unknown';
  }
  return 'UTF-8';
}

/**
 * Decodes a file-like object from `fromEncoding` and re-encodes the text.
 *
 * The output is always UTF-8 because TextEncoder supports nothing else;
 * `toEncoding` is accepted but not consulted.
 *
 * @param {{arrayBuffer: function(): Promise<ArrayBuffer>}} file
 * @param {string} fromEncoding - Label passed to TextDecoder.
 * @param {string} toEncoding - Currently unused.
 * @returns {Promise<Uint8Array>} The text encoded as UTF-8.
 */
async function convertEncoding(file, fromEncoding, toEncoding) {
  const source = await file.arrayBuffer();
  const text = new TextDecoder(fromEncoding).decode(source);

  // TextEncoder only produces UTF-8
  return new TextEncoder().encode(text);
}

4. 邮件编码

javascript

/**
 * Quoted-printable style encoding of `str`.
 *
 * Printable ASCII (0x20-0x7E) other than `=` is kept as is. Every other
 * character is written as one `=HH` group per byte of its UTF-8 encoding,
 * so a multi-byte character becomes several groups (for example U+4F60
 * becomes `=E4=BD=A0`) instead of a single group holding a UTF-16 code
 * unit.
 *
 * @param {string} str - Text to encode.
 * @returns {string} The encoded text.
 */
function encodeQuotedPrintable(str) {
  const utf8 = new TextEncoder();
  // `u` flag: match whole code points so surrogate pairs are encoded together
  return str.replace(/[^\x20-\x7E]|=/gu, char =>
    Array.from(
      utf8.encode(char),
      byte => `=${byte.toString(16).toUpperCase().padStart(2, '0')}`
    ).join('')
  );
}

/**
 * Wraps `str` as a MIME encoded-word using UTF-8 and Base64:
 * `=?UTF-8?B?<base64>?=`.
 *
 * The UTF-8 bytes come from TextEncoder rather than the deprecated
 * unescape(encodeURIComponent(...)) idiom, which also means a lone
 * surrogate no longer raises URIError.
 *
 * @param {string} str - Text to encode.
 * @returns {string} The encoded-word.
 */
function encodeBase64MIME(str) {
  // btoa expects a "binary string": one character per byte
  let binary = '';
  for (const byte of new TextEncoder().encode(str)) {
    binary += String.fromCharCode(byte);
  }
  const base64 = btoa(binary);
  return `=?UTF-8?B?${base64}?=`;
}

// Usage example
console.log(encodeBase64MIME('你好'));
// "=?UTF-8?B?5L2g5aW9?="

常见问题与解决方案

1. 乱码问题

javascript

// Problem: a UTF-8 file opened as GBK
// Fix: specify the correct encoding
const decoder = new TextDecoder('utf-8');
const text = decoder.decode(buffer);

// Problem: garbled text coming from the database
// Fix: make sure the connection encoding matches
// SET NAMES utf8mb4;

2. Emoji 处理

javascript

// Problem: emoji length is miscounted
'👨‍👩‍👧‍👦'.length;  // 11 (wrong)

// Fix: use spread or Array.from
[...'👨‍👩‍👧‍👦'].length;  // 7 (ZWJ sequence)

/**
 * Counts the grapheme segments that Intl.Segmenter reports for `str`.
 * @param {string} str
 * @returns {number}
 */
function getCharacterCount(str) {
  const graphemes = new Intl.Segmenter('en', { granularity: 'grapheme' }).segment(str);
  let count = 0;
  for (const _grapheme of graphemes) {
    count += 1;
  }
  return count;
}

3. 代理对问题

javascript

// Problem: handling characters outside the BMP
const emoji = '😀';
emoji.length;  // 2 (surrogate pair)
emoji.charCodeAt(0);  // 55357 (high surrogate)
emoji.charCodeAt(1);  // 56832 (low surrogate)

// Fix: use codePointAt
emoji.codePointAt(0);  // 128512 (actual code point)
String.fromCodePoint(128512);  // '😀'

总结

字符编码是计算机处理文本的基础，核心要点：

ASCII：基础编码，仅支持英文
Unicode：统一字符集，为所有字符分配码点
UTF-8：变长编码，兼容 ASCII，最广泛使用
HTML 实体：在 HTML 中安全显示特殊字符
URL 编码：在 URL 中安全传输特殊字符

如需快速进行编码转换，可以使用我们的在线工具：

HTML 实体编码工具 - HTML 实体编解码
ASCII Unicode 转换器 - 字符编码转换
URL 编码工具 - URL 编解码

文本编码详解：HTML实体、ASCII、Unicode与字符编码原理

字符编码基础

为什么需要字符编码？

编码发展历程

ASCII 编码

ASCII 基础

ASCII 码表

ASCII 转换实现

Python ASCII 实现

Unicode 编码

Unicode 基础

Unicode 平面

Unicode 转换实现

Python Unicode 实现

UTF-8 编码

UTF-8 原理

UTF-8 编码示例

UTF-8 编解码实现

HTML 实体编码

什么是 HTML 实体？

为什么需要 HTML 实体？

HTML 实体编码实现

Python HTML 实体实现

常用 HTML 实体对照表

URL 编码

URL 编码原理

URL 编码实现

实际应用场景

1. XSS 防护

2. 国际化文本处理

3. 文件编码检测与转换

4. 邮件编码

常见问题与解决方案

1. 乱码问题

2. Emoji 处理

3. 代理对问题

总结

相关资源