字符编码是计算机处理文本的基础。理解编码原理不仅能帮助解决乱码问题,还能在 Web 安全、国际化开发中发挥重要作用。本文将深入讲解各种编码方式的原理和实现。

字符编码基础

为什么需要字符编码?

计算机只能处理数字(二进制),而人类使用文字。字符编码就是建立字符与数字之间的映射关系:

字符 'A' → 数字 65 → 二进制 01000001
字符 '中' → 数字 20013 → 二进制 01001110 00101101

编码发展历程

ASCII (1963) → 扩展ASCII → ISO-8859 → Unicode (1991) → UTF-8/UTF-16
     ↓              ↓           ↓            ↓
   7位/128字符   8位/256字符   区域编码    统一编码

ASCII 编码

ASCII 基础

ASCII(American Standard Code for Information Interchange)是最基础的字符编码:

  • 范围:0-127(7位)
  • 字符数:128个
  • 包含:英文字母、数字、标点、控制字符

ASCII 码表

范围 类型 示例
0-31 控制字符 NUL, TAB, LF, CR
32-47 标点符号 空格, !, ", #
48-57 数字 0-9
65-90 大写字母 A-Z
97-122 小写字母 a-z
123-127 其他符号 {, |, }, ~, DEL

ASCII 转换实现

/**
 * Static helpers for converting between characters and their ASCII codes.
 * All lookups use `charCodeAt(0)`, i.e. the first UTF-16 code unit.
 */
class ASCIIConverter {
  /** Returns the numeric code of the first character of `char`. */
  static charToCode(char) {
    const firstUnit = char.charCodeAt(0);
    return firstUnit;
  }

  /** Returns the one-character string for the numeric `code`. */
  static codeToChar(code) {
    const character = String.fromCharCode(code);
    return character;
  }

  /**
   * Describes every character of `str`.
   * @returns an array of `{ char, decimal, hex, binary }` records, where
   *   `hex` is upper-case and `binary` is left-padded to 8 digits.
   */
  static stringToASCII(str) {
    const rows = [];
    for (const char of str) {
      const decimal = char.charCodeAt(0);
      rows.push({
        char,
        decimal,
        hex: decimal.toString(16).toUpperCase(),
        binary: decimal.toString(2).padStart(8, '0')
      });
    }
    return rows;
  }

  /** Joins an array of numeric codes back into a string. */
  static asciiToString(codes) {
    const characters = codes.map(code => String.fromCharCode(code));
    return characters.join('');
  }

  /** True when every code unit of `str` is in 0x00-0x7F (also true for ''). */
  static isASCII(str) {
    const asciiOnly = /^[\x00-\x7F]*$/;
    return asciiOnly.test(str);
  }

  /** Upper-cases `char` if it is a-z (97-122); otherwise returns it unchanged. */
  static toUpperCase(char) {
    const code = char.charCodeAt(0);
    const isLowerLetter = code >= 97 && code <= 122;
    // Upper- and lower-case letters are exactly 32 apart in ASCII.
    return isLowerLetter ? String.fromCharCode(code - 32) : char;
  }

  /** Lower-cases `char` if it is A-Z (65-90); otherwise returns it unchanged. */
  static toLowerCase(char) {
    const code = char.charCodeAt(0);
    const isUpperLetter = code >= 65 && code <= 90;
    return isUpperLetter ? String.fromCharCode(code + 32) : char;
  }
}

// Usage example
const helloTable = ASCIIConverter.stringToASCII('Hello');
console.log(helloTable);
// [
//   { char: 'H', decimal: 72, hex: '48', binary: '01001000' },
//   { char: 'e', decimal: 101, hex: '65', binary: '01100101' },
//   ...
// ]

const helloText = ASCIIConverter.asciiToString([72, 101, 108, 108, 111]);
console.log(helloText);
// "Hello"

Python ASCII 实现

class ASCIIConverter:
    """Static helpers for converting between characters and numeric codes."""

    @staticmethod
    def char_to_code(char: str) -> int:
        """Return the code of a single character (``ord``)."""
        code = ord(char)
        return code

    @staticmethod
    def code_to_char(code: int) -> str:
        """Return the character for a numeric code (``chr``)."""
        char = chr(code)
        return char

    @staticmethod
    def string_to_ascii(s: str) -> list:
        """Describe each character of ``s``.

        Returns a list of dicts with keys ``char``, ``decimal``, ``hex``
        (upper-case, no prefix) and ``binary`` (zero-padded to 8 digits).
        """
        rows = []
        for char in s:
            value = ord(char)
            rows.append({
                'char': char,
                'decimal': value,
                'hex': format(value, 'X'),
                'binary': format(value, '08b'),
            })
        return rows

    @staticmethod
    def ascii_to_string(codes: list) -> str:
        """Join a sequence of numeric codes back into a string."""
        return ''.join(map(chr, codes))

    @staticmethod
    def is_ascii(s: str) -> bool:
        """Return True when every character is below 128 (True for '')."""
        for char in s:
            if ord(char) >= 128:
                return False
        return True

# Usage example
hello_table = ASCIIConverter.string_to_ascii('Hello')
print(hello_table)
hello_text = ASCIIConverter.ascii_to_string([72, 101, 108, 108, 111])
print(hello_text)

Unicode 编码

Unicode 基础

Unicode 是一个字符集标准,为世界上所有字符分配唯一的码点(Code Point):

  • 范围:U+0000 到 U+10FFFF
  • 字符数:超过 140,000 个
  • 表示:U+XXXX(十六进制)

Unicode 平面

平面 范围 名称 内容
0 U+0000-U+FFFF BMP 常用字符
1 U+10000-U+1FFFF SMP Emoji、古文字
2 U+20000-U+2FFFF SIP 扩展汉字
14 U+E0000-U+EFFFF SSP 特殊用途

Unicode 转换实现

/**
 * Static helpers for inspecting Unicode code points and for converting
 * strings to and from `\uXXXX` / `\u{X...}` escape notation.
 */
class UnicodeConverter {
  /** Returns the code point at the start of `char` (surrogate pairs are combined). */
  static charToCodePoint(char) {
    return char.codePointAt(0);
  }

  /**
   * Returns the string for `codePoint`.
   * @throws {RangeError} from `String.fromCodePoint` when `codePoint` is not a valid code point.
   */
  static codePointToChar(codePoint) {
    return String.fromCodePoint(codePoint);
  }

  /**
   * Describes every code point of `str`.
   * @returns an array of `{ char, codePoint, unicode, utf8, utf16 }` records;
   *   `utf8` and `utf16` are arrays of upper-case hex strings.
   */
  static stringToUnicode(str) {
    const result = [];
    // for...of iterates by code point, so astral characters stay in one piece.
    for (const char of str) {
      const codePoint = char.codePointAt(0);
      result.push({
        char,
        codePoint,
        unicode: `U+${codePoint.toString(16).toUpperCase().padStart(4, '0')}`,
        utf8: this.toUTF8Bytes(codePoint),
        utf16: this.toUTF16(codePoint)
      });
    }
    return result;
  }

  /**
   * Encodes one code point as UTF-8.
   * Surrogate code points (U+D800-U+DFFF) have no UTF-8 form, so they are
   * encoded as U+FFFD (EF BF BD) instead of an invalid 3-byte sequence.
   * @returns an array of two-digit upper-case hex strings, one per byte.
   */
  static toUTF8Bytes(codePoint) {
    const isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
    const scalar = isSurrogate ? 0xFFFD : codePoint;

    const bytes = [];
    if (scalar <= 0x7F) {
      bytes.push(scalar);
    } else if (scalar <= 0x7FF) {
      bytes.push(0xC0 | (scalar >> 6));
      bytes.push(0x80 | (scalar & 0x3F));
    } else if (scalar <= 0xFFFF) {
      bytes.push(0xE0 | (scalar >> 12));
      bytes.push(0x80 | ((scalar >> 6) & 0x3F));
      bytes.push(0x80 | (scalar & 0x3F));
    } else {
      bytes.push(0xF0 | (scalar >> 18));
      bytes.push(0x80 | ((scalar >> 12) & 0x3F));
      bytes.push(0x80 | ((scalar >> 6) & 0x3F));
      bytes.push(0x80 | (scalar & 0x3F));
    }
    return bytes.map(b => b.toString(16).toUpperCase().padStart(2, '0'));
  }

  /**
   * Encodes one code point as UTF-16.
   * @returns one 4-digit hex string for BMP code points, or a
   *   [high surrogate, low surrogate] pair for code points above U+FFFF.
   */
  static toUTF16(codePoint) {
    if (codePoint <= 0xFFFF) {
      return [codePoint.toString(16).toUpperCase().padStart(4, '0')];
    }
    // Surrogate pair: split the 20-bit offset into two 10-bit halves.
    const offset = codePoint - 0x10000;
    const high = 0xD800 + (offset >> 10);
    const low = 0xDC00 + (offset & 0x3FF);
    return [
      high.toString(16).toUpperCase(),
      low.toString(16).toUpperCase()
    ];
  }

  /**
   * Escapes every code point of `str` (ASCII included) as `\uXXXX`, or as
   * `\u{X...}` for code points above U+FFFF.
   */
  static escapeUnicode(str) {
    return Array.from(str)
      .map(char => {
        const code = char.codePointAt(0);
        if (code > 0xFFFF) {
          return `\\u{${code.toString(16).toUpperCase()}}`;
        }
        return `\\u${code.toString(16).toUpperCase().padStart(4, '0')}`;
      })
      .join('');
  }

  /**
   * Replaces `\uXXXX` and `\u{X...}` escapes in `str` with their characters.
   * Escapes whose value exceeds U+10FFFF are left untouched rather than
   * letting `String.fromCodePoint` throw a RangeError.
   */
  static unescapeUnicode(str) {
    return str.replace(/\\u\{([0-9A-Fa-f]+)\}|\\u([0-9A-Fa-f]{4})/g,
      (match, braced, fixed) => {
        const codePoint = parseInt(braced || fixed, 16);
        return codePoint <= 0x10FFFF ? String.fromCodePoint(codePoint) : match;
      }
    );
  }
}

// Usage example
const greetingInfo = UnicodeConverter.stringToUnicode('你好👋');
console.log(greetingInfo);
// [
//   { char: '你', codePoint: 20320, unicode: 'U+4F60', utf8: ['E4', 'BD', 'A0'], ... },
//   { char: '好', codePoint: 22909, unicode: 'U+597D', utf8: ['E5', 'A5', 'BD'], ... },
//   { char: '👋', codePoint: 128075, unicode: 'U+1F44B', utf8: ['F0', '9F', '91', '8B'], ... }
// ]

const escapedGreeting = UnicodeConverter.escapeUnicode('Hello 世界');
console.log(escapedGreeting);
// "\u0048\u0065\u006C\u006C\u006F\u0020\u4E16\u754C"

Python Unicode 实现

class UnicodeConverter:
    """Static helpers for inspecting code points and Unicode escape notation."""

    @staticmethod
    def char_to_codepoint(char: str) -> int:
        """Return the code point of a single character."""
        return ord(char)

    @staticmethod
    def codepoint_to_char(codepoint: int) -> str:
        """Return the character for a code point."""
        return chr(codepoint)

    @staticmethod
    def string_to_unicode(s: str) -> list:
        """Describe each character of ``s``.

        Returns a list of dicts with keys ``char``, ``codepoint``,
        ``unicode`` (``U+XXXX``), ``utf8`` and ``utf16`` (upper-case hex of
        that character's UTF-8 / UTF-16-BE bytes).
        """
        result = []
        for char in s:
            codepoint = ord(char)
            result.append({
                'char': char,
                'codepoint': codepoint,
                'unicode': f'U+{codepoint:04X}',
                # Encode the current character only; encoding the whole
                # input here would repeat the full string's bytes per row.
                'utf8': char.encode('utf-8').hex().upper(),
                'utf16': char.encode('utf-16-be').hex().upper()
            })
        return result

    @staticmethod
    def escape_unicode(s: str) -> str:
        """Escape every character as ``\\uXXXX`` (or ``\\UXXXXXXXX`` above U+FFFF)."""
        return ''.join(f'\\u{ord(c):04X}' if ord(c) <= 0xFFFF
                       else f'\\U{ord(c):08X}' for c in s)

    @staticmethod
    def unescape_unicode(s: str) -> str:
        """Decode backslash escapes in ``s`` via the ``unicode_escape`` codec.

        ``unicode_escape`` reads its input bytes as Latin-1, so literal
        non-ASCII characters must not be passed through as UTF-8 bytes.
        Encoding with ``latin-1`` + ``backslashreplace`` keeps U+0000-U+00FF
        as single bytes and turns everything else into an escape that the
        codec then decodes back to the same character.
        """
        return s.encode('latin-1', 'backslashreplace').decode('unicode_escape')

# Usage example
greeting_info = UnicodeConverter.string_to_unicode('你好')
print(greeting_info)
escaped_greeting = UnicodeConverter.escape_unicode('Hello 世界')
print(escaped_greeting)

UTF-8 编码

UTF-8 原理

UTF-8 是 Unicode 的一种变长编码方式:

Unicode 范围 UTF-8 字节数 编码格式
U+0000-U+007F 1 0xxxxxxx
U+0080-U+07FF 2 110xxxxx 10xxxxxx
U+0800-U+FFFF 3 1110xxxx 10xxxxxx 10xxxxxx
U+10000-U+10FFFF 4 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

UTF-8 编码示例

以汉字"中"(U+4E2D)为例:

1. 码点:0x4E2D = 0100 1110 0010 1101
2. 范围:U+0800-U+FFFF,需要3字节
3. 模板:1110xxxx 10xxxxxx 10xxxxxx
4. 填充:
   - 1110 0100 (E4)
   - 10 111000 (B8)
   - 10 101101 (AD)
5. 结果:E4 B8 AD

UTF-8 编解码实现

/**
 * A hand-written UTF-8 encoder/decoder plus hex-string helpers.
 */
class UTF8Codec {
  /**
   * Encodes `str` as UTF-8.
   * Lone surrogates cannot be represented in UTF-8 and are encoded as
   * U+FFFD (EF BF BD).
   * @returns {Uint8Array} the encoded bytes.
   */
  static encode(str) {
    const bytes = [];
    // for...of walks the string by code point, combining surrogate pairs.
    for (const char of str) {
      let codePoint = char.codePointAt(0);
      if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
        codePoint = 0xFFFD;
      }

      if (codePoint <= 0x7F) {
        bytes.push(codePoint);
      } else if (codePoint <= 0x7FF) {
        bytes.push(0xC0 | (codePoint >> 6));
        bytes.push(0x80 | (codePoint & 0x3F));
      } else if (codePoint <= 0xFFFF) {
        bytes.push(0xE0 | (codePoint >> 12));
        bytes.push(0x80 | ((codePoint >> 6) & 0x3F));
        bytes.push(0x80 | (codePoint & 0x3F));
      } else {
        bytes.push(0xF0 | (codePoint >> 18));
        bytes.push(0x80 | ((codePoint >> 12) & 0x3F));
        bytes.push(0x80 | ((codePoint >> 6) & 0x3F));
        bytes.push(0x80 | (codePoint & 0x3F));
      }
    }
    return new Uint8Array(bytes);
  }

  /**
   * Decodes UTF-8 `bytes` into a string.
   *
   * Malformed input never throws: each invalid lead byte, truncated or
   * interrupted sequence, overlong form, encoded surrogate, or value above
   * U+10FFFF is replaced by one U+FFFD. After an interrupted sequence,
   * decoding resumes at the byte that broke it.
   */
  static decode(bytes) {
    const REPLACEMENT = '\uFFFD';
    let result = '';
    let i = 0;

    while (i < bytes.length) {
      const lead = bytes[i];

      if ((lead & 0x80) === 0) {
        result += String.fromCodePoint(lead);
        i += 1;
        continue;
      }

      // Work out the sequence length, the payload bits of the lead byte,
      // and the smallest code point that legitimately needs that length.
      let continuationCount;
      let codePoint;
      let minimum;
      if ((lead & 0xE0) === 0xC0) {
        continuationCount = 1;
        codePoint = lead & 0x1F;
        minimum = 0x80;
      } else if ((lead & 0xF0) === 0xE0) {
        continuationCount = 2;
        codePoint = lead & 0x0F;
        minimum = 0x800;
      } else if ((lead & 0xF8) === 0xF0) {
        continuationCount = 3;
        codePoint = lead & 0x07;
        minimum = 0x10000;
      } else {
        // Stray continuation byte (10xxxxxx) or invalid lead (11111xxx).
        result += REPLACEMENT;
        i += 1;
        continue;
      }

      let consumed = 1;
      let wellFormed = true;
      while (consumed <= continuationCount) {
        const next = bytes[i + consumed];
        if (next === undefined || (next & 0xC0) !== 0x80) {
          wellFormed = false;
          break;
        }
        codePoint = (codePoint << 6) | (next & 0x3F);
        consumed += 1;
      }

      const isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
      if (!wellFormed || codePoint < minimum || codePoint > 0x10FFFF || isSurrogate) {
        result += REPLACEMENT;
      } else {
        result += String.fromCodePoint(codePoint);
      }
      // `consumed` counts the lead plus every continuation byte accepted,
      // so an offending byte is re-examined as a potential new lead.
      i += consumed;
    }

    return result;
  }

  /** Formats bytes as space-separated, two-digit, upper-case hex. */
  static toHexString(bytes) {
    return Array.from(bytes)
      .map(b => b.toString(16).toUpperCase().padStart(2, '0'))
      .join(' ');
  }

  /**
   * Parses whitespace-separated hex tokens into bytes.
   * Tokens are read with `parseInt(token, 16)`; no validation is performed.
   * @returns {Uint8Array}
   */
  static fromHexString(hexStr) {
    const bytes = hexStr.split(/\s+/)
      .filter(s => s.length > 0)
      .map(s => parseInt(s, 16));
    return new Uint8Array(bytes);
  }
}

// Usage example
const utf8Bytes = UTF8Codec.encode('你好世界');
const utf8Hex = UTF8Codec.toHexString(utf8Bytes);
console.log(utf8Hex);
// "E4 BD A0 E5 A5 BD E4 B8 96 E7 95 8C"

const roundTripped = UTF8Codec.decode(utf8Bytes);
console.log(roundTripped);
// "你好世界"

HTML 实体编码

什么是 HTML 实体?

HTML 实体是用于在 HTML 中表示特殊字符的编码方式:

<!-- 命名实体 -->
&lt;    → <
&gt;    → >
&amp;   → &
&quot;  → "
&nbsp;  → 不换行空格

<!-- 数字实体 -->
&#60;   → < (十进制)
&#x3C;  → < (十六进制)

为什么需要 HTML 实体?

  1. 避免解析错误:< 和 > 会被解析为标签
  2. 防止 XSS 攻击:转义用户输入
  3. 显示特殊字符:版权符号 ©、商标 ™ 等

HTML 实体编码实现

/**
 * Encodes and decodes HTML character references.
 */
class HTMLEntityEncoder {
  /** Characters that `encode` always replaces, mapped to their references. */
  static namedEntities = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;',
    '/': '&#x2F;',
    '`': '&#x60;',
    '=': '&#x3D;'
  };

  /** Entity names (lower-case, without `&` and `;`) that `decode` understands. */
  static reverseEntities = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
    'apos': "'",
    'nbsp': '\u00A0',
    'copy': '©',
    'reg': '®',
    'trade': '™',
    'euro': '€',
    'pound': '£',
    'yen': '¥',
    'cent': '¢'
  };

  /**
   * Escapes the characters in `namedEntities` and replaces every non-ASCII
   * character with a numeric reference.
   *
   * @param str text to encode
   * @param options.mode `'hex'` emits `&#xHH;`; any other value emits
   *   decimal `&#DD;` (default `'named'`)
   * @param options.encodeAll accepted for compatibility; the pattern below
   *   only matches characters that are encoded anyway, so it currently has
   *   no additional effect
   */
  static encode(str, options = {}) {
    const { mode = 'named', encodeAll = false } = options;

    // The `u` flag makes the pattern match whole code points, so a character
    // above U+FFFF yields one reference instead of two surrogate references.
    return str.replace(/[&<>"'`=\/]|[^\x00-\x7F]/gu, char => {
      if (this.namedEntities[char]) {
        return this.namedEntities[char];
      }

      const code = char.codePointAt(0);
      if (encodeAll || code > 127) {
        return mode === 'hex'
          ? `&#x${code.toString(16).toUpperCase()};`
          : `&#${code};`;
      }

      return char;
    });
  }

  /**
   * Decodes named (`&lt;`), decimal (`&#60;`) and hex (`&#x3C;`) references.
   *
   * All three forms are handled in a single pass, so text produced by one
   * replacement is never decoded a second time (`&amp;#60;` becomes `&#60;`,
   * not `<`). Unknown names are left as-is. Numeric values above U+10FFFF
   * become U+FFFD instead of throwing.
   */
  static decode(str) {
    const reference = /&(?:([a-zA-Z]+)|#(\d+)|#[xX]([0-9A-Fa-f]+));/g;
    return str.replace(reference, (match, name, decimal, hex) => {
      if (name !== undefined) {
        const key = name.toLowerCase();
        // Own-property check: names such as "constructor" must not resolve
        // to members inherited from Object.prototype.
        return Object.prototype.hasOwnProperty.call(this.reverseEntities, key)
          ? this.reverseEntities[key]
          : match;
      }

      const codePoint = decimal !== undefined
        ? parseInt(decimal, 10)
        : parseInt(hex, 16);
      return codePoint <= 0x10FFFF ? String.fromCodePoint(codePoint) : '\uFFFD';
    });
  }

  /** Escapes `& < > " '` for use inside an attribute value. */
  static encodeForAttribute(str) {
    return str.replace(/[&<>"']/g, char => this.namedEntities[char]);
  }

  /** Escapes `& < >` for use in element content. */
  static encodeForHTML(str) {
    return str.replace(/[&<>]/g, char => this.namedEntities[char]);
  }

  /** Replaces every non-ASCII code point with a decimal reference. */
  static encodeNonASCII(str) {
    return str.replace(/[^\x00-\x7F]/gu, char => {
      return `&#${char.codePointAt(0)};`;
    });
  }
}

// Usage example
const escapedScript = HTMLEntityEncoder.encode('<script>alert("XSS")</script>');
console.log(escapedScript);
// "&lt;script&gt;alert(&quot;XSS&quot;)&lt;&#x2F;script&gt;"
// ('/' is listed in namedEntities, so it is emitted as &#x2F;)

const escapedCopyright = HTMLEntityEncoder.encode('版权 © 2024', { encodeAll: true });
console.log(escapedCopyright);
// "&#29256;&#26435; &#169; 2024"

const decodedMarkup = HTMLEntityEncoder.decode('&lt;div&gt;Hello&lt;/div&gt;');
console.log(decodedMarkup);
// "<div>Hello</div>"

Python HTML 实体实现

import html
import re

class HTMLEntityEncoder:
    """Thin wrappers around the ``html`` module plus numeric-reference helpers."""

    @staticmethod
    def encode(s: str, quote: bool = True) -> str:
        """Escape ``s`` with ``html.escape``; ``quote`` is forwarded unchanged."""
        escaped = html.escape(s, quote=quote)
        return escaped

    @staticmethod
    def decode(s: str) -> str:
        """Resolve character references in ``s`` with ``html.unescape``."""
        plain = html.unescape(s)
        return plain

    @staticmethod
    def encode_non_ascii(s: str) -> str:
        """Replace every character above 127 with a decimal reference."""
        pieces = []
        for c in s:
            code = ord(c)
            pieces.append(c if code <= 127 else f'&#{code};')
        return ''.join(pieces)

    @staticmethod
    def encode_all(s: str, use_hex: bool = False) -> str:
        """Replace every character with a numeric reference.

        Hexadecimal (``&#xHH;``, upper-case digits) when ``use_hex`` is
        truthy, decimal (``&#DD;``) otherwise.
        """
        template = '&#x{:X};' if use_hex else '&#{};'
        return ''.join(template.format(ord(c)) for c in s)

# Usage example
escaped_script = HTMLEntityEncoder.encode('<script>alert("XSS")</script>')
print(escaped_script)
# &lt;script&gt;alert(&quot;XSS&quot;)&lt;/script&gt;

decoded_tag = HTMLEntityEncoder.decode('&lt;div&gt;')
print(decoded_tag)
# <div>

常用 HTML 实体对照表

字符 命名实体 十进制 十六进制 描述
< &lt; &#60; &#x3C; 小于号
> &gt; &#62; &#x3E; 大于号
& &amp; &#38; &#x26; 和号
" &quot; &#34; &#x22; 双引号
' &apos; &#39; &#x27; 单引号
© &copy; &#169; &#xA9; 版权
® &reg; &#174; &#xAE; 注册商标
™ &trade; &#8482; &#x2122; 商标
€ &euro; &#8364; &#x20AC; 欧元
£ &pound; &#163; &#xA3; 英镑
¥ &yen; &#165; &#xA5; 日元/人民币
&nbsp; &#160; &#xA0; 不换行空格

URL 编码

URL 编码原理

URL 编码(Percent-encoding)用于在 URL 中安全传输特殊字符:

空格 → %20(在 application/x-www-form-urlencoded 表单编码中也可写作 +)
中文 → UTF-8 字节的十六进制

URL 编码实现

/**
 * Static helpers for percent-encoding URL components and query strings.
 */
class URLEncoder {
  /** Percent-encodes a single URL component (`encodeURIComponent`). */
  static encode(str) {
    return encodeURIComponent(str);
  }

  /**
   * Decodes a percent-encoded component (`decodeURIComponent`).
   * @throws {URIError} on malformed percent sequences.
   */
  static decode(str) {
    return decodeURIComponent(str);
  }

  /** Encodes a full URI with the global `encodeURI`, keeping reserved characters. */
  static encodeURI(str) {
    return encodeURI(str);
  }

  /** Decodes a full URI with the global `decodeURI`. */
  static decodeURI(str) {
    return decodeURI(str);
  }

  /** Serializes an object's own enumerable entries as `k1=v1&k2=v2`. */
  static encodeQueryParam(params) {
    return Object.entries(params)
      .map(([key, value]) =>
        `${encodeURIComponent(key)}=${encodeURIComponent(value)}`
      )
      .join('&');
  }

  /**
   * Parses a query string (with or without a leading `?`) into an object.
   *
   * Each pair is split at its FIRST `=` only, so values that themselves
   * contain `=` (e.g. Base64 padding) are preserved. Empty pairs, including
   * an entirely empty query string, are skipped. A key without `=` maps to
   * an empty string. Later duplicates overwrite earlier ones.
   * @throws {URIError} on malformed percent sequences.
   */
  static decodeQueryParam(queryString) {
    const params = {};
    const query = queryString.replace(/^\?/, '');

    for (const pair of query.split('&')) {
      if (pair === '') {
        continue;
      }
      const separator = pair.indexOf('=');
      const rawKey = separator === -1 ? pair : pair.slice(0, separator);
      const rawValue = separator === -1 ? '' : pair.slice(separator + 1);
      params[decodeURIComponent(rawKey)] = decodeURIComponent(rawValue);
    }

    return params;
  }

  /**
   * Like `encode`, but also escapes `! ' ( ) *`, which `encodeURIComponent`
   * leaves alone.
   */
  static encodeRFC3986(str) {
    return encodeURIComponent(str)
      .replace(/[!'()*]/g, c => `%${c.charCodeAt(0).toString(16).toUpperCase()}`);
  }
}

// Usage example
const encodedGreeting = URLEncoder.encode('你好 世界');
console.log(encodedGreeting);
// "%E4%BD%A0%E5%A5%BD%20%E4%B8%96%E7%95%8C"

const exampleParams = {
  name: '张三',
  message: 'Hello World!'
};
console.log(URLEncoder.encodeQueryParam(exampleParams));
// "name=%E5%BC%A0%E4%B8%89&message=Hello%20World!"

实际应用场景

1. XSS 防护

/**
 * Escapes `input` for insertion into HTML by delegating to
 * `HTMLEntityEncoder.encode` with its default options.
 */
function sanitizeHTML(input) {
  const escaped = HTMLEntityEncoder.encode(input);
  return escaped;
}

/**
 * Creates a `tag` element whose content is `text`.
 * The text is assigned through `textContent`, so it is never parsed as markup.
 */
function createSafeElement(tag, text) {
  return Object.assign(document.createElement(tag), { textContent: text });
}

// Unsafe: raw user input is parsed as HTML.
// NOTE(review): `element` and `userInput` are not defined in this snippet;
// presumably a DOM element and an untrusted string.
element.innerHTML = userInput;  // ❌ XSS risk

// Safe alternatives
element.textContent = userInput;  // ✅ treated as plain text, not parsed as HTML
element.innerHTML = sanitizeHTML(userInput);  // ✅ escaped manually first

2. 国际化文本处理

/**
 * Returns `str` in Unicode Normalization Form C.
 *
 * The four forms accepted by `String.prototype.normalize`:
 *   NFD  - canonical decomposition
 *   NFC  - canonical composition
 *   NFKD - compatibility decomposition
 *   NFKC - compatibility composition
 */
function normalizeText(str) {
  const form = 'NFC';
  return str.normalize(form);
}

/**
 * Compares two strings with `localeCompare` under `locale`
 * (default `'zh-CN'`).
 * @returns a negative number, zero, or a positive number, as reported by
 *   `a.localeCompare(b, locale)`.
 */
function compareStrings(a, b, locale = 'zh-CN') {
  const ordering = a.localeCompare(b, locale);
  return ordering;
}

// Full-width / half-width conversion

/**
 * Converts full-width forms U+FF01-U+FF5E to their ASCII counterparts and
 * the ideographic space U+3000 to a regular space.
 */
function toHalfWidth(str) {
  // Full-width forms sit exactly 0xFEE0 above the matching ASCII characters.
  const shiftDown = char => String.fromCharCode(char.charCodeAt(0) - 0xFEE0);
  const narrowed = str.replace(/[\uFF01-\uFF5E]/g, shiftDown);
  return narrowed.replace(/\u3000/g, ' ');
}

/**
 * Converts printable ASCII 0x21-0x7E to the full-width forms and regular
 * spaces to the ideographic space U+3000.
 */
function toFullWidth(str) {
  // Full-width forms sit exactly 0xFEE0 above the matching ASCII characters.
  const shiftUp = char => String.fromCharCode(char.charCodeAt(0) + 0xFEE0);
  const widened = str.replace(/[\x21-\x7E]/g, shiftUp);
  return widened.replace(/ /g, '\u3000');
}

3. 文件编码检测与转换

/**
 * Guesses the text encoding of `file` (anything exposing `arrayBuffer()`).
 *
 * A byte-order mark is checked first; without one the bytes are
 * strictly decoded as UTF-8.
 * @returns {Promise<string>} 'UTF-8', 'UTF-16LE', 'UTF-16BE' or 'unknown'.
 */
async function detectEncoding(file) {
  const bytes = new Uint8Array(await file.arrayBuffer());

  // Byte-order marks, tested in this order.
  const signatures = [
    ['UTF-8', [0xEF, 0xBB, 0xBF]],
    ['UTF-16LE', [0xFF, 0xFE]],
    ['UTF-16BE', [0xFE, 0xFF]]
  ];
  for (const [label, bom] of signatures) {
    const matches = bom.every((value, index) => bytes[index] === value);
    if (matches) {
      return label;
    }
  }

  // No BOM: `fatal: true` makes decode() throw on any invalid UTF-8 sequence.
  try {
    const strictDecoder = new TextDecoder('utf-8', { fatal: true });
    strictDecoder.decode(bytes);
  } catch {
    return 'unknown';
  }
  return 'UTF-8';
}

/**
 * Decodes `file` from `fromEncoding` and re-encodes the text as UTF-8.
 *
 * `toEncoding` is accepted but not consulted: the output is produced by
 * `TextEncoder`, which always emits UTF-8.
 * @returns {Promise<Uint8Array>} the UTF-8 bytes.
 */
async function convertEncoding(file, fromEncoding, toEncoding) {
  const sourceBytes = await file.arrayBuffer();
  const text = new TextDecoder(fromEncoding).decode(sourceBytes);
  return new TextEncoder().encode(text);
}

4. 邮件编码

/**
 * Encodes `str` in a quoted-printable style: every character outside
 * printable ASCII (0x20-0x7E), and `=` itself, is written as one `=XX`
 * group per UTF-8 byte.
 *
 * Quoted-printable escapes octets, not UTF-16 code units, so non-ASCII
 * characters are first converted to UTF-8 (`你` becomes `=E4=BD=A0`). The
 * `u` flag keeps characters above U+FFFF in one piece.
 *
 * No soft line breaks are inserted, and CR/LF are escaped like any other
 * control character.
 */
function encodeQuotedPrintable(str) {
  const utf8 = new TextEncoder();
  return str.replace(/[^\x20-\x7E]|=/gu, char =>
    Array.from(
      utf8.encode(char),
      byte => `=${byte.toString(16).toUpperCase().padStart(2, '0')}`
    ).join('')
  );
}

/**
 * Wraps `str` as a MIME encoded-word: `=?UTF-8?B?<base64 of UTF-8 bytes>?=`.
 */
function encodeBase64MIME(str) {
  // encodeURIComponent + unescape yields a string with one character per
  // UTF-8 byte, which is the input shape btoa requires.
  const percentEncoded = encodeURIComponent(str);
  const byteString = unescape(percentEncoded);
  return ['=?UTF-8?B?', btoa(byteString), '?='].join('');
}

// Usage example
const encodedWord = encodeBase64MIME('你好');
console.log(encodedWord);
// "=?UTF-8?B?5L2g5aW9?="

常见问题与解决方案

1. 乱码问题

// Problem: a UTF-8 file was opened as GBK
// Fix: decode with the correct encoding
// NOTE(review): `buffer` is not defined in this snippet; presumably the raw
// bytes of the file.
const decoder = new TextDecoder('utf-8');
const text = decoder.decode(buffer);

// Problem: garbled text coming from the database
// Fix: make sure the connection uses the same encoding
// SET NAMES utf8mb4;

2. Emoji 处理

// Problem: emoji length is miscounted
// (.length counts UTF-16 code units: 4 emoji x 2 units + 3 zero-width joiners)
'👨‍👩‍👧‍👦'.length;  // 11 (wrong)

// Fix: use spread or Array.from
// (these iterate by code point: 4 emoji + 3 zero-width joiners)
[...'👨‍👩‍👧‍👦'].length;  // 7 (ZWJ sequence)

// Count user-perceived characters

/**
 * Returns the number of grapheme clusters in `str`, as segmented by
 * `Intl.Segmenter` with the 'en' locale.
 */
function getCharacterCount(str) {
  const graphemes = new Intl.Segmenter('en', { granularity: 'grapheme' }).segment(str);
  let count = 0;
  for (const _cluster of graphemes) {
    count += 1;
  }
  return count;
}

3. 代理对问题

// Problem: handling characters outside the BMP
const emoji = '😀';
emoji.length;  // 2 (surrogate pair)
emoji.charCodeAt(0);  // 55357 (high surrogate)
emoji.charCodeAt(1);  // 56832 (low surrogate)

// Fix: use codePointAt
emoji.codePointAt(0);  // 128512 (the actual code point)
String.fromCodePoint(128512);  // '😀'

总结

字符编码是计算机处理文本的基础,核心要点:

  1. ASCII:基础编码,仅支持英文
  2. Unicode:统一字符集,为所有字符分配码点
  3. UTF-8:变长编码,兼容 ASCII,最广泛使用
  4. HTML 实体:在 HTML 中安全显示特殊字符
  5. URL 编码:在 URL 中安全传输特殊字符

如需快速进行编码转换,可以使用我们的在线工具:

相关资源