字符编码是计算机处理文本的基础。理解编码原理不仅能帮助解决乱码问题,还能在 Web 安全、国际化开发中发挥重要作用。本文将深入讲解各种编码方式的原理和实现。
字符编码基础
为什么需要字符编码?
计算机只能处理数字(二进制),而人类使用文字。字符编码就是建立字符与数字之间的映射关系:
字符 'A' → 数字 65 → 二进制 01000001
字符 '中' → 数字 20013 → 二进制 ...
编码发展历程
ASCII (1963) → 扩展ASCII → ISO-8859 → Unicode (1991) → UTF-8/UTF-16
↓ ↓ ↓ ↓
7位/128字符 8位/256字符 区域编码 统一编码
ASCII 编码
ASCII 基础
ASCII(American Standard Code for Information Interchange)是最基础的字符编码:
- 范围:0-127(7位)
- 字符数:128个
- 包含:英文字母、数字、标点、控制字符
ASCII 码表
| 范围 | 类型 | 示例 |
|---|---|---|
| 0-31 | 控制字符 | NUL, TAB, LF, CR |
| 32-47 | 标点符号 | 空格, !, ", # |
| 48-57 | 数字 | 0-9 |
| 65-90 | 大写字母 | A-Z |
| 97-122 | 小写字母 | a-z |
| 123-127 | 其他符号 | {, \|, }, ~, DEL(127,控制字符) |
ASCII 转换实现
/**
 * Static helpers for converting between characters and their numeric codes,
 * intended for text in the ASCII range (0-127).
 */
class ASCIIConverter {
  /**
   * @param char A string; only its first UTF-16 code unit is read.
   * @returns The code unit at index 0 (NaN for an empty string).
   */
  static charToCode(char) {
    return char.charCodeAt(0);
  }

  /**
   * @param code A UTF-16 code unit value.
   * @returns The one-character string for that code unit.
   */
  static codeToChar(code) {
    return String.fromCharCode(code);
  }

  /**
   * Describes every character of `str` in decimal, hexadecimal and binary.
   *
   * The string is walked by code point, and the reported numbers are read
   * with codePointAt so that they always describe the whole `char` entry
   * (a character outside the BMP is reported once, with its real code point,
   * instead of with the value of its leading surrogate).
   *
   * @param str Text to describe.
   * @returns One `{ char, decimal, hex, binary }` record per character;
   *          `hex` is upper-case, `binary` is left-padded to at least 8 digits.
   */
  static stringToASCII(str) {
    return Array.from(str, char => {
      const code = char.codePointAt(0);
      return {
        char,
        decimal: code,
        hex: code.toString(16).toUpperCase(),
        binary: code.toString(2).padStart(8, '0')
      };
    });
  }

  /**
   * @param codes Array of character codes.
   * @returns The string formed by the corresponding characters, in order.
   */
  static asciiToString(codes) {
    return codes.map(code => String.fromCharCode(code)).join('');
  }

  /**
   * @returns true when every code unit of `str` is in 0x00-0x7F
   *          (the empty string counts as ASCII).
   */
  static isASCII(str) {
    return /^[\x00-\x7F]*$/.test(str);
  }

  /**
   * Upper-cases a single ASCII letter by subtracting 32 from its code.
   * Anything that is not a-z is returned unchanged.
   */
  static toUpperCase(char) {
    const code = char.charCodeAt(0);
    if (code >= 97 && code <= 122) {
      return String.fromCharCode(code - 32);
    }
    return char;
  }

  /**
   * Lower-cases a single ASCII letter by adding 32 to its code.
   * Anything that is not A-Z is returned unchanged.
   */
  static toLowerCase(char) {
    const code = char.charCodeAt(0);
    if (code >= 65 && code <= 90) {
      return String.fromCharCode(code + 32);
    }
    return char;
  }
}
// Usage example: print the per-character table, then rebuild a string from codes.
console.log(ASCIIConverter.stringToASCII('Hello'));
// [
// { char: 'H', decimal: 72, hex: '48', binary: '01001000' },
// { char: 'e', decimal: 101, hex: '65', binary: '01100101' },
// ...
// ]
console.log(ASCIIConverter.asciiToString([72, 101, 108, 108, 111]));
// "Hello"
Python ASCII 实现
class ASCIIConverter:
    """Static helpers for converting between characters and their numeric codes."""

    @staticmethod
    def char_to_code(char: str) -> int:
        """Return the code point of the single character ``char``."""
        return ord(char)

    @staticmethod
    def code_to_char(code: int) -> str:
        """Return the character whose code point is ``code``."""
        return chr(code)

    @staticmethod
    def string_to_ascii(s: str) -> list:
        """Describe every character of ``s``.

        Returns a list of dicts with the keys ``char``, ``decimal``, ``hex``
        (upper-case, no prefix) and ``binary`` (zero-padded to at least 8 digits).
        """
        return [
            {
                'char': char,
                'decimal': ord(char),
                'hex': hex(ord(char))[2:].upper(),
                'binary': bin(ord(char))[2:].zfill(8)
            }
            for char in s
        ]

    @staticmethod
    def ascii_to_string(codes: list) -> str:
        """Join the characters for each code in ``codes`` into one string."""
        return ''.join(chr(code) for code in codes)

    @staticmethod
    def is_ascii(s: str) -> bool:
        """Return True when every character of ``s`` is below code point 128."""
        return all(ord(char) < 128 for char in s)
# Usage example
print(ASCIIConverter.string_to_ascii('Hello'))
print(ASCIIConverter.ascii_to_string([72, 101, 108, 108, 111]))
Unicode 编码
Unicode 基础
Unicode 是一个字符集标准,为世界上所有字符分配唯一的码点(Code Point):
- 范围:U+0000 到 U+10FFFF
- 字符数:超过 140,000 个
- 表示:U+XXXX(十六进制)
Unicode 平面
| 平面 | 范围 | 名称 | 内容 |
|---|---|---|---|
| 0 | U+0000-U+FFFF | BMP | 常用字符 |
| 1 | U+10000-U+1FFFF | SMP | Emoji、古文字 |
| 2 | U+20000-U+2FFFF | SIP | 扩展汉字 |
| 14 | U+E0000-U+EFFFF | SSP | 特殊用途 |
Unicode 转换实现
/**
 * Static helpers for inspecting Unicode code points and their UTF-8 / UTF-16
 * encodings, and for producing / parsing `\uXXXX` style escapes.
 */
class UnicodeConverter {
  /** @returns The code point of the first character of `char`. */
  static charToCodePoint(char) {
    return char.codePointAt(0);
  }

  /** @returns The string for `codePoint` (String.fromCodePoint semantics). */
  static codePointToChar(codePoint) {
    return String.fromCodePoint(codePoint);
  }

  /**
   * Describes every code point of `str`.
   *
   * @returns One `{ char, codePoint, unicode, utf8, utf16 }` record per code
   *          point, where `unicode` is the `U+XXXX` notation and `utf8` /
   *          `utf16` are arrays of upper-case hex strings.
   */
  static stringToUnicode(str) {
    const result = [];
    for (const char of str) {
      const codePoint = char.codePointAt(0);
      result.push({
        char,
        codePoint,
        unicode: `U+${codePoint.toString(16).toUpperCase().padStart(4, '0')}`,
        utf8: this.toUTF8Bytes(codePoint),
        utf16: this.toUTF16(codePoint)
      });
    }
    return result;
  }

  /**
   * Encodes one code point as UTF-8.
   *
   * @param codePoint Integer in 0..0x10FFFF.
   * @returns The encoded bytes as two-digit upper-case hex strings.
   * @throws RangeError when `codePoint` is not an integer in 0..0x10FFFF
   *         (the bit layout below has no valid encoding for such values).
   */
  static toUTF8Bytes(codePoint) {
    if (!Number.isInteger(codePoint) || codePoint < 0 || codePoint > 0x10FFFF) {
      throw new RangeError(`Invalid code point ${codePoint}`);
    }
    const bytes = [];
    if (codePoint <= 0x7F) {
      // 0xxxxxxx
      bytes.push(codePoint);
    } else if (codePoint <= 0x7FF) {
      // 110xxxxx 10xxxxxx
      bytes.push(0xC0 | (codePoint >> 6));
      bytes.push(0x80 | (codePoint & 0x3F));
    } else if (codePoint <= 0xFFFF) {
      // 1110xxxx 10xxxxxx 10xxxxxx
      bytes.push(0xE0 | (codePoint >> 12));
      bytes.push(0x80 | ((codePoint >> 6) & 0x3F));
      bytes.push(0x80 | (codePoint & 0x3F));
    } else {
      // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      bytes.push(0xF0 | (codePoint >> 18));
      bytes.push(0x80 | ((codePoint >> 12) & 0x3F));
      bytes.push(0x80 | ((codePoint >> 6) & 0x3F));
      bytes.push(0x80 | (codePoint & 0x3F));
    }
    return bytes.map(b => b.toString(16).toUpperCase().padStart(2, '0'));
  }

  /**
   * Encodes one code point as UTF-16 code units.
   *
   * @param codePoint Integer in 0..0x10FFFF.
   * @returns One four-digit hex string for BMP code points, or the
   *          [high, low] surrogate pair for code points above U+FFFF.
   * @throws RangeError when `codePoint` is not an integer in 0..0x10FFFF.
   */
  static toUTF16(codePoint) {
    if (!Number.isInteger(codePoint) || codePoint < 0 || codePoint > 0x10FFFF) {
      throw new RangeError(`Invalid code point ${codePoint}`);
    }
    if (codePoint <= 0xFFFF) {
      return [codePoint.toString(16).toUpperCase().padStart(4, '0')];
    }
    // Surrogate pair: split the 20-bit offset into two 10-bit halves.
    const offset = codePoint - 0x10000;
    const high = 0xD800 + (offset >> 10);
    const low = 0xDC00 + (offset & 0x3FF);
    return [
      high.toString(16).toUpperCase(),
      low.toString(16).toUpperCase()
    ];
  }

  /**
   * Replaces every code point of `str` with an escape: `\uXXXX` for the BMP,
   * `\u{X...}` for code points above U+FFFF.
   */
  static escapeUnicode(str) {
    return Array.from(str)
      .map(char => {
        const code = char.codePointAt(0);
        if (code > 0xFFFF) {
          return `\\u{${code.toString(16).toUpperCase()}}`;
        }
        return `\\u${code.toString(16).toUpperCase().padStart(4, '0')}`;
      })
      .join('');
  }

  /**
   * Expands `\uXXXX` and `\u{X...}` escapes found in `str`.
   *
   * An escape whose value exceeds U+10FFFF is left in the output untouched
   * rather than letting String.fromCodePoint throw for the whole string.
   */
  static unescapeUnicode(str) {
    return str.replace(/\\u\{([0-9A-Fa-f]+)\}|\\u([0-9A-Fa-f]{4})/g,
      (match, braced, fixed) => {
        const codePoint = parseInt(braced || fixed, 16);
        return codePoint <= 0x10FFFF ? String.fromCodePoint(codePoint) : match;
      }
    );
  }
}
// Usage example: per-code-point breakdown, then escaping a mixed string.
console.log(UnicodeConverter.stringToUnicode('你好👋'));
// [
// { char: '你', codePoint: 20320, unicode: 'U+4F60', utf8: ['E4', 'BD', 'A0'], ... },
// { char: '好', codePoint: 22909, unicode: 'U+597D', utf8: ['E5', 'A5', 'BD'], ... },
// { char: '👋', codePoint: 128075, unicode: 'U+1F44B', utf8: ['F0', '9F', '91', '8B'], ... }
// ]
console.log(UnicodeConverter.escapeUnicode('Hello 世界'));
// "\u0048\u0065\u006C\u006C\u006F\u0020\u4E16\u754C"
Python Unicode 实现
class UnicodeConverter:
    """Static helpers for inspecting code points and (un)escaping Unicode text."""

    @staticmethod
    def char_to_codepoint(char: str) -> int:
        """Return the code point of the single character ``char``."""
        return ord(char)

    @staticmethod
    def codepoint_to_char(codepoint: int) -> str:
        """Return the character for ``codepoint``."""
        return chr(codepoint)

    @staticmethod
    def string_to_unicode(s: str) -> list:
        """Describe every character of ``s``.

        Each entry holds the character, its code point, its ``U+XXXX``
        notation, and the upper-case hex of its own UTF-8 and UTF-16-BE bytes.
        """
        result = []
        for char in s:
            codepoint = ord(char)
            result.append({
                'char': char,
                'codepoint': codepoint,
                'unicode': f'U+{codepoint:04X}',
                # Encode the current character, not the whole input string.
                'utf8': char.encode('utf-8').hex().upper(),
                'utf16': char.encode('utf-16-be').hex().upper()
            })
        return result

    @staticmethod
    def escape_unicode(s: str) -> str:
        """Escape every character as ``\\uXXXX`` (BMP) or ``\\UXXXXXXXX`` (above U+FFFF)."""
        return ''.join(f'\\u{ord(c):04X}' if ord(c) <= 0xFFFF
                       else f'\\U{ord(c):08X}' for c in s)

    @staticmethod
    def unescape_unicode(s: str) -> str:
        """Expand backslash escapes in ``s`` using the ``unicode_escape`` codec.

        ``unicode_escape`` interprets its input bytes as Latin-1, so the text
        is first encoded as Latin-1 with ``backslashreplace``: characters that
        do not fit are turned into escapes and survive the round trip instead
        of being decoded as mojibake.
        """
        return s.encode('latin-1', 'backslashreplace').decode('unicode_escape')
# Usage example
print(UnicodeConverter.string_to_unicode('你好'))
print(UnicodeConverter.escape_unicode('Hello 世界'))
UTF-8 编码
UTF-8 原理
UTF-8 是 Unicode 的一种变长编码方式:
| Unicode 范围 | UTF-8 字节数 | 编码格式 |
|---|---|---|
| U+0000-U+007F | 1 | 0xxxxxxx |
| U+0080-U+07FF | 2 | 110xxxxx 10xxxxxx |
| U+0800-U+FFFF | 3 | 1110xxxx 10xxxxxx 10xxxxxx |
| U+10000-U+10FFFF | 4 | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
UTF-8 编码示例
以汉字"中"(U+4E2D)为例:
1. 码点:0x4E2D = 0100 1110 0010 1101
2. 范围:U+0800-U+FFFF,需要3字节
3. 模板:1110xxxx 10xxxxxx 10xxxxxx
4. 填充:
- 1110 0100 (E4)
- 10 111000 (B8)
- 10 101101 (AD)
5. 结果:E4 B8 AD
UTF-8 编解码实现
/**
 * Hand-written UTF-8 encoder/decoder plus hex-string helpers.
 */
class UTF8Codec {
  /**
   * Encodes a string as UTF-8.
   *
   * Lone surrogates (a code point in D800-DFFF that is not part of a pair)
   * have no UTF-8 encoding; they are encoded as U+FFFD instead of producing
   * an invalid byte sequence.
   *
   * @param str Text to encode.
   * @returns A Uint8Array holding the UTF-8 bytes.
   */
  static encode(str) {
    const bytes = [];
    for (const char of str) {
      let codePoint = char.codePointAt(0);
      if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
        codePoint = 0xFFFD;
      }
      if (codePoint <= 0x7F) {
        bytes.push(codePoint);
      } else if (codePoint <= 0x7FF) {
        bytes.push(0xC0 | (codePoint >> 6));
        bytes.push(0x80 | (codePoint & 0x3F));
      } else if (codePoint <= 0xFFFF) {
        bytes.push(0xE0 | (codePoint >> 12));
        bytes.push(0x80 | ((codePoint >> 6) & 0x3F));
        bytes.push(0x80 | (codePoint & 0x3F));
      } else {
        bytes.push(0xF0 | (codePoint >> 18));
        bytes.push(0x80 | ((codePoint >> 12) & 0x3F));
        bytes.push(0x80 | ((codePoint >> 6) & 0x3F));
        bytes.push(0x80 | (codePoint & 0x3F));
      }
    }
    return new Uint8Array(bytes);
  }

  /**
   * Decodes UTF-8 bytes into a string.
   *
   * Well-formed input decodes exactly. Each malformed sequence (invalid lead
   * byte, missing or wrong continuation byte, overlong form, encoded
   * surrogate, value above U+10FFFF) yields one U+FFFD instead of a wrong
   * character or an exception.
   *
   * @param bytes Array-like of byte values.
   * @returns The decoded string.
   */
  static decode(bytes) {
    const REPLACEMENT = '\uFFFD';
    let result = '';
    let i = 0;
    while (i < bytes.length) {
      const lead = bytes[i];
      if ((lead & 0x80) === 0) {
        result += String.fromCharCode(lead);
        i += 1;
        continue;
      }

      // Work out how many continuation bytes the lead byte announces and the
      // smallest code point that legitimately needs that many bytes.
      let needed = 0;
      let codePoint = 0;
      let minimum = 0;
      if ((lead & 0xE0) === 0xC0) {
        needed = 1;
        codePoint = lead & 0x1F;
        minimum = 0x80;
      } else if ((lead & 0xF0) === 0xE0) {
        needed = 2;
        codePoint = lead & 0x0F;
        minimum = 0x800;
      } else if ((lead & 0xF8) === 0xF0) {
        needed = 3;
        codePoint = lead & 0x07;
        minimum = 0x10000;
      } else {
        // Stray continuation byte or a lead byte UTF-8 never uses.
        result += REPLACEMENT;
        i += 1;
        continue;
      }

      // Consume continuation bytes, stopping at the first byte that is not
      // of the form 10xxxxxx (it starts the next sequence).
      let consumed = 0;
      while (
        consumed < needed &&
        i + 1 + consumed < bytes.length &&
        (bytes[i + 1 + consumed] & 0xC0) === 0x80
      ) {
        codePoint = (codePoint << 6) | (bytes[i + 1 + consumed] & 0x3F);
        consumed += 1;
      }

      const wellFormed =
        consumed === needed &&
        codePoint >= minimum &&
        codePoint <= 0x10FFFF &&
        !(codePoint >= 0xD800 && codePoint <= 0xDFFF);
      result += wellFormed ? String.fromCodePoint(codePoint) : REPLACEMENT;
      i += 1 + consumed;
    }
    return result;
  }

  /**
   * @param bytes Iterable of byte values.
   * @returns Space-separated, two-digit, upper-case hex, e.g. "E4 BD A0".
   */
  static toHexString(bytes) {
    return Array.from(bytes)
      .map(b => b.toString(16).toUpperCase().padStart(2, '0'))
      .join(' ');
  }

  /**
   * Parses whitespace-separated hex tokens into bytes.
   *
   * @param hexStr Text such as "E4 BD A0"; empty tokens are skipped.
   * @returns A Uint8Array with one element per token.
   */
  static fromHexString(hexStr) {
    const bytes = hexStr.split(/\s+/)
      .filter(s => s.length > 0)
      .map(s => parseInt(s, 16));
    return new Uint8Array(bytes);
  }
}
// Usage example: encode to UTF-8, show the bytes as hex, then decode back.
const encoded = UTF8Codec.encode('你好世界');
console.log(UTF8Codec.toHexString(encoded));
// "E4 BD A0 E5 A5 BD E4 B8 96 E7 95 8C"
const decoded = UTF8Codec.decode(encoded);
console.log(decoded);
// "你好世界"
HTML 实体编码
什么是 HTML 实体?
HTML 实体是用于在 HTML 中表示特殊字符的编码方式:
<!-- 命名实体 -->
&lt; → <
&gt; → >
&amp; → &
&quot; → "
&nbsp; → 不换行空格
<!-- 数字实体 -->
&#60; → < (十进制)
&#x3C; → < (十六进制)
为什么需要 HTML 实体?
- 避免解析错误:`<` 和 `>` 会被解析为标签
- 防止 XSS 攻击:转义用户输入
- 显示特殊字符:版权符号 ©、商标 ™ 等
HTML 实体编码实现
class HTMLEntityEncoder {
static namedEntities = {
'&': '&',
'<': '<',
'>': '>',
'"': '"',
"'": ''',
'/': '/',
'`': '`',
'=': '='
};
static reverseEntities = {
'amp': '&',
'lt': '<',
'gt': '>',
'quot': '"',
'apos': "'",
'nbsp': '\u00A0',
'copy': '©',
'reg': '®',
'trade': '™',
'euro': '€',
'pound': '£',
'yen': '¥',
'cent': '¢'
};
static encode(str, options = {}) {
const { mode = 'named', encodeAll = false } = options;
return str.replace(/[&<>"'`=\/]|[^\x00-\x7F]/g, char => {
if (this.namedEntities[char]) {
return this.namedEntities[char];
}
if (encodeAll || char.charCodeAt(0) > 127) {
const code = char.codePointAt(0);
return mode === 'hex'
? `&#x${code.toString(16).toUpperCase()};`
: `&#${code};`;
}
return char;
});
}
static decode(str) {
return str
.replace(/&([a-zA-Z]+);/g, (match, name) => {
return this.reverseEntities[name.toLowerCase()] || match;
})
.replace(/&#(\d+);/g, (match, code) => {
return String.fromCodePoint(parseInt(code, 10));
})
.replace(/&#x([0-9A-Fa-f]+);/g, (match, code) => {
return String.fromCodePoint(parseInt(code, 16));
});
}
static encodeForAttribute(str) {
return str.replace(/[&<>"']/g, char => this.namedEntities[char]);
}
static encodeForHTML(str) {
return str.replace(/[&<>]/g, char => this.namedEntities[char]);
}
static encodeNonASCII(str) {
return str.replace(/[^\x00-\x7F]/g, char => {
return `&#${char.codePointAt(0)};`;
});
}
}
// Usage example
console.log(HTMLEntityEncoder.encode('<script>alert("XSS")</script>'));
// "&lt;script&gt;alert(&quot;XSS&quot;)&lt;&#x2F;script&gt;"
console.log(HTMLEntityEncoder.encode('版权 © 2024', { encodeAll: true }));
// "&#29256;&#26435; &#169; 2024"
// The input to decode() must contain character references to demonstrate anything.
console.log(HTMLEntityEncoder.decode('&lt;div&gt;Hello&lt;/div&gt;'));
// "<div>Hello</div>"
Python HTML 实体实现
import html
import re
class HTMLEntityEncoder:
    """Thin wrappers around the standard ``html`` module plus numeric-reference helpers."""

    @staticmethod
    def encode(s: str, quote: bool = True) -> str:
        """Escape ``s`` with ``html.escape``; ``quote`` is forwarded unchanged."""
        return html.escape(s, quote=quote)

    @staticmethod
    def decode(s: str) -> str:
        """Expand character references in ``s`` with ``html.unescape``."""
        return html.unescape(s)

    @staticmethod
    def encode_non_ascii(s: str) -> str:
        """Replace each character above code point 127 with a decimal reference."""
        return ''.join(
            f'&#{ord(c)};' if ord(c) > 127 else c
            for c in s
        )

    @staticmethod
    def encode_all(s: str, use_hex: bool = False) -> str:
        """Replace every character with a numeric reference.

        ``use_hex=True`` produces ``&#xHH;`` (upper-case hex); the default
        produces decimal ``&#DD;``.
        """
        if use_hex:
            return ''.join(f'&#x{ord(c):X};' for c in s)
        return ''.join(f'&#{ord(c)};' for c in s)
# Usage example
print(HTMLEntityEncoder.encode('<script>alert("XSS")</script>'))
# &lt;script&gt;alert(&quot;XSS&quot;)&lt;/script&gt;
# The input to decode() must contain character references to demonstrate anything.
print(HTMLEntityEncoder.decode('&lt;div&gt;'))
# <div>
常用 HTML 实体对照表
| 字符 | 命名实体 | 十进制 | 十六进制 | 描述 |
|---|---|---|---|---|
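| < | `&lt;` | `&#60;` | `&#x3C;` | 小于号 |
| > | `&gt;` | `&#62;` | `&#x3E;` | 大于号 |
| & | `&amp;` | `&#38;` | `&#x26;` | 和号 |
| " | `&quot;` | `&#34;` | `&#x22;` | 双引号 |
| ' | `&apos;` | `&#39;` | `&#x27;` | 单引号 |
| © | `&copy;` | `&#169;` | `&#xA9;` | 版权 |
| ® | `&reg;` | `&#174;` | `&#xAE;` | 注册商标 |
| ™ | `&trade;` | `&#8482;` | `&#x2122;` | 商标 |
| € | `&euro;` | `&#8364;` | `&#x20AC;` | 欧元 |
| £ | `&pound;` | `&#163;` | `&#xA3;` | 英镑 |
| ¥ | `&yen;` | `&#165;` | `&#xA5;` | 日元/人民币 |
| (空格) | `&nbsp;` | `&#160;` | `&#xA0;` | 不换行空格 |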
URL 编码
URL 编码原理
URL 编码(Percent-encoding)用于在 URL 中安全传输特殊字符:
空格 → %20 或 +
中文 → UTF-8 字节的十六进制
URL 编码实现
/**
 * Static helpers around the built-in percent-encoding functions.
 */
class URLEncoder {
  /** Percent-encodes a single URL component (encodeURIComponent). */
  static encode(str) {
    return encodeURIComponent(str);
  }

  /** Decodes a percent-encoded component (decodeURIComponent). */
  static decode(str) {
    return decodeURIComponent(str);
  }

  /** Percent-encodes a complete URI, keeping its delimiters (global encodeURI). */
  static encodeURI(str) {
    return encodeURI(str);
  }

  /** Decodes a complete URI (global decodeURI). */
  static decodeURI(str) {
    return decodeURI(str);
  }

  /**
   * Serialises an object as `key=value&key=value`, percent-encoding both
   * keys and values.
   *
   * @param params Plain object whose own enumerable entries are serialised.
   * @returns The query string without a leading `?`.
   */
  static encodeQueryParam(params) {
    return Object.entries(params)
      .map(([key, value]) =>
        `${encodeURIComponent(key)}=${encodeURIComponent(value)}`
      )
      .join('&');
  }

  /**
   * Parses a query string into an object of decoded keys and values.
   *
   * - A leading `?` is ignored.
   * - Empty segments (empty input, `a=1&&b=2`, trailing `&`) are skipped.
   * - Each pair is split at the FIRST `=` only, so a value may itself
   *   contain `=`; a pair without `=` gets the value ''.
   * - `+` is read as a space before percent-decoding, the form-encoding
   *   convention for query strings (a literal plus arrives as `%2B`).
   *
   * @param queryString Text such as "?a=1&b=2".
   * @returns Object mapping each decoded key to its decoded value; when a
   *          key repeats, the last occurrence wins.
   * @throws URIError when a key or value contains a malformed percent escape.
   */
  static decodeQueryParam(queryString) {
    const params = {};
    const pairs = queryString.replace(/^\?/, '').split('&');
    for (const pair of pairs) {
      if (pair === '') {
        continue;
      }
      const separator = pair.indexOf('=');
      const rawKey = separator === -1 ? pair : pair.slice(0, separator);
      const rawValue = separator === -1 ? '' : pair.slice(separator + 1);
      const key = decodeURIComponent(rawKey.replace(/\+/g, ' '));
      params[key] = decodeURIComponent(rawValue.replace(/\+/g, ' '));
    }
    return params;
  }

  /**
   * Like encode(), but additionally percent-encodes `! ' ( ) *`, which
   * encodeURIComponent leaves untouched.
   */
  static encodeRFC3986(str) {
    return encodeURIComponent(str)
      .replace(/[!'()*]/g, c => `%${c.charCodeAt(0).toString(16).toUpperCase()}`);
  }
}
// Usage example: encode one component, then build a query string from an object.
console.log(URLEncoder.encode('你好 世界'));
// "%E4%BD%A0%E5%A5%BD%20%E4%B8%96%E7%95%8C"
console.log(URLEncoder.encodeQueryParam({
name: '张三',
message: 'Hello World!'
}));
// "name=%E5%BC%A0%E4%B8%89&message=Hello%20World!"
实际应用场景
1. XSS 防护
/**
 * Escapes `input` for insertion into HTML by delegating to
 * HTMLEntityEncoder.encode with its default options.
 */
function sanitizeHTML(input) {
  const escaped = HTMLEntityEncoder.encode(input);
  return escaped;
}
function createSafeElement(tag, text) {
const element = document.createElement(tag);
element.textContent = text; // 自动转义
return element;
}
// Unsafe: the raw input is parsed as HTML.
element.innerHTML = userInput; // ❌ XSS risk
// Safe alternatives
element.textContent = userInput; // ✅ assigned as plain text, not parsed as HTML
element.innerHTML = sanitizeHTML(userInput); // ✅ escaped manually before insertion
2. 国际化文本处理
/**
 * Returns `str` in Unicode Normalization Form C.
 *
 * The four forms accepted by String.prototype.normalize are:
 *   NFD  - canonical decomposition
 *   NFC  - canonical decomposition followed by composition
 *   NFKD - compatibility decomposition
 *   NFKC - compatibility decomposition followed by composition
 */
function normalizeText(str) {
  const composed = str.normalize('NFC');
  return composed;
}
/**
 * Locale-aware comparison of two strings via String.prototype.localeCompare.
 *
 * @param locale BCP 47 tag forwarded to localeCompare; defaults to 'zh-CN'.
 * @returns A negative number, zero or a positive number, as localeCompare does.
 */
function compareStrings(a, b, locale = 'zh-CN') {
  const ordering = a.localeCompare(b, locale);
  return ordering;
}
/**
 * Full-width to half-width conversion.
 * Characters U+FF01-U+FF5E are shifted down by 0xFEE0 onto ASCII 0x21-0x7E,
 * and the ideographic space U+3000 becomes an ordinary space.
 */
function toHalfWidth(str) {
  const shiftDown = wide => String.fromCharCode(wide.charCodeAt(0) - 0xFEE0);
  const narrowed = str.replace(/[\uFF01-\uFF5E]/g, shiftDown);
  return narrowed.replace(/\u3000/g, ' ');
}
/**
 * Half-width to full-width conversion.
 * ASCII 0x21-0x7E is shifted up by 0xFEE0 onto U+FF01-U+FF5E, and every
 * ordinary space becomes the ideographic space U+3000.
 */
function toFullWidth(str) {
  const shiftUp = narrow => String.fromCharCode(narrow.charCodeAt(0) + 0xFEE0);
  const widened = str.replace(/[\x21-\x7E]/g, shiftUp);
  return widened.replace(/ /g, '\u3000');
}
3. 文件编码检测与转换
async function detectEncoding(file) {
const buffer = await file.arrayBuffer();
const bytes = new Uint8Array(buffer);
// 检测 BOM
if (bytes[0] === 0xEF && bytes[1] === 0xBB && bytes[2] === 0xBF) {
return 'UTF-8';
}
if (bytes[0] === 0xFF && bytes[1] === 0xFE) {
return 'UTF-16LE';
}
if (bytes[0] === 0xFE && bytes[1] === 0xFF) {
return 'UTF-16BE';
}
// 尝试 UTF-8 解码
try {
new TextDecoder('utf-8', { fatal: true }).decode(bytes);
return 'UTF-8';
} catch {
return 'unknown';
}
}
/**
 * Decodes the content of `file` as `fromEncoding` and re-encodes it.
 *
 * NOTE: `toEncoding` is accepted but never read. The output is produced by
 * TextEncoder, which always emits UTF-8.
 *
 * @param file Object exposing `arrayBuffer()` that resolves to the content.
 * @param fromEncoding Label passed to the TextDecoder constructor.
 * @param toEncoding Unused.
 * @returns Promise of a Uint8Array containing the text as UTF-8.
 */
async function convertEncoding(file, fromEncoding, toEncoding) {
  const raw = await file.arrayBuffer();
  const text = new TextDecoder(fromEncoding).decode(raw);
  return new TextEncoder().encode(text); // always UTF-8
}
4. 邮件编码
/**
 * Quoted-printable style escaping of `str`.
 *
 * Printable ASCII (0x20-0x7E) other than `=` is kept as is. Every other
 * character is converted to its UTF-8 bytes and each byte is written as
 * `=HH` (two upper-case hex digits), since quoted-printable escapes bytes
 * and a UTF-16 code unit such as 0x4F60 is not one.
 *
 * The pattern uses the `u` flag so that a character outside the BMP is
 * encoded as one 4-byte sequence rather than as two separate surrogates.
 *
 * Limitation: no soft line breaks are inserted and CR/LF are escaped like
 * any other control character.
 *
 * @param str Text to escape.
 * @returns The escaped text.
 */
function encodeQuotedPrintable(str) {
  const utf8 = new TextEncoder();
  return str.replace(/[^\x20-\x7E]|=/gu, char =>
    Array.from(
      utf8.encode(char),
      byte => `=${byte.toString(16).toUpperCase().padStart(2, '0')}`
    ).join('')
  );
}
/**
 * Wraps `str` as a MIME encoded-word: `=?UTF-8?B?<base64 of the UTF-8 bytes>?=`.
 *
 * btoa only accepts characters up to 0xFF, so the text is first turned into
 * a string with one character per UTF-8 byte (encodeURIComponent + unescape).
 */
function encodeBase64MIME(str) {
  const utf8ByteString = unescape(encodeURIComponent(str));
  const payload = btoa(utf8ByteString);
  return '=?UTF-8?B?' + payload + '?=';
}
// Usage example
console.log(encodeBase64MIME('你好'));
// "=?UTF-8?B?5L2g5aW9?="
常见问题与解决方案
1. 乱码问题
// Problem: a UTF-8 file is opened as GBK.
// Fix: decode with the correct encoding (`buffer` stands for the file's bytes).
const decoder = new TextDecoder('utf-8');
const text = decoder.decode(buffer);
// Problem: garbled text coming from the database.
// Fix: make sure the connection uses the same encoding as the data, e.g.
// SET NAMES utf8mb4;
2. Emoji 处理
// Problem: .length does not count what a reader sees as characters.
// (The numbers below assume the literal is the family emoji joined with zero-width joiners.)
'👨👩👧👦'.length; // 11 (UTF-16 code units, not the visible character count)
// Spread / Array.from iterate by code point, which is closer but still not 1.
[...'👨👩👧👦'].length; // 7 (code points of the ZWJ sequence)
/**
 * Counts user-perceived characters (grapheme clusters) in `str` using
 * Intl.Segmenter with the 'en' locale and grapheme granularity.
 */
function getCharacterCount(str) {
  const graphemes = new Intl.Segmenter('en', { granularity: 'grapheme' }).segment(str);
  let count = 0;
  for (const _segment of graphemes) {
    count += 1;
  }
  return count;
}
3. 代理对问题
// Problem: handling characters outside the BMP.
const emoji = '😀';
emoji.length; // 2 (a surrogate pair)
emoji.charCodeAt(0); // 55357 (high surrogate)
emoji.charCodeAt(1); // 56832 (low surrogate)
// Fix: work with code points instead of code units.
emoji.codePointAt(0); // 128512 (the actual code point)
String.fromCodePoint(128512); // '😀'
总结
字符编码是计算机处理文本的基础,核心要点:
- ASCII:基础编码,仅支持英文
- Unicode:统一字符集,为所有字符分配码点
- UTF-8:变长编码,兼容 ASCII,最广泛使用
- HTML 实体:在 HTML 中安全显示特殊字符
- URL 编码:在 URL 中安全传输特殊字符
如需快速进行编码转换,可以使用我们的在线工具:
- HTML 实体编码工具 - HTML 实体编解码
- ASCII Unicode 转换器 - 字符编码转换
- URL 编码工具 - URL 编解码
相关资源
- Base64 编码工具 - Base64 编解码
- 进制转换器 - 数字进制转换
- JSON 转义工具 - JSON 字符串转义