文本对比是软件开发中的核心功能,从 Git 版本控制到代码审查,从文档比较到数据同步,都离不开高效的 Diff 算法。本文将深入讲解文本对比的原理和实现方法。
文本对比基础
什么是 Diff?
Diff(差异)是指两个文本之间的不同之处。Diff 算法的目标是找出将文本 A 转换为文本 B 所需的最小编辑操作:
文本 A: "Hello World"
文本 B: "Hello Diff World"
差异: 在 "Hello " 后插入 "Diff "
编辑操作类型
| 操作 | 符号 | 描述 |
|---|---|---|
| 插入 | + | 添加新内容 |
| 删除 | - | 移除内容 |
| 保持 | = | 内容不变 |
| 替换 | ~ | 删除后插入(可分解为删除+插入) |
应用场景
- 版本控制:Git、SVN 等
- 代码审查:Pull Request 对比
- 文档协作:Google Docs、Notion
- 数据同步:增量更新
- 拼写检查:编辑距离计算
最长公共子序列 (LCS)
LCS 原理
最长公共子序列是 Diff 算法的基础。LCS 找出两个序列中最长的公共部分(不要求连续):
序列 A: A B C D E F
序列 B: A C D F
LCS: A C D F(长度 4)
动态规划解法
LCS 问题的经典解法是动态规划:
状态转移方程:
如果 A[i] == B[j]:
dp[i][j] = dp[i-1][j-1] + 1
否则:
dp[i][j] = max(dp[i-1][j], dp[i][j-1])
JavaScript 实现
class LCS {
static compute(a, b) {
const m = a.length;
const n = b.length;
// 创建 DP 表
const dp = Array(m + 1).fill(null)
.map(() => Array(n + 1).fill(0));
// 填充 DP 表
for (let i = 1; i <= m; i++) {
for (let j = 1; j <= n; j++) {
if (a[i - 1] === b[j - 1]) {
dp[i][j] = dp[i - 1][j - 1] + 1;
} else {
dp[i][j] = Math.max(dp[i - 1][j], dp[i][j - 1]);
}
}
}
return dp[m][n];
}
static getSequence(a, b) {
const m = a.length;
const n = b.length;
const dp = Array(m + 1).fill(null)
.map(() => Array(n + 1).fill(0));
for (let i = 1; i <= m; i++) {
for (let j = 1; j <= n; j++) {
if (a[i - 1] === b[j - 1]) {
dp[i][j] = dp[i - 1][j - 1] + 1;
} else {
dp[i][j] = Math.max(dp[i - 1][j], dp[i][j - 1]);
}
}
}
// 回溯获取 LCS
const lcs = [];
let i = m, j = n;
while (i > 0 && j > 0) {
if (a[i - 1] === b[j - 1]) {
lcs.unshift(a[i - 1]);
i--;
j--;
} else if (dp[i - 1][j] > dp[i][j - 1]) {
i--;
} else {
j--;
}
}
return lcs;
}
}
// 使用示例
console.log(LCS.compute('ABCDEF', 'ACDF')); // 4
console.log(LCS.getSequence('ABCDEF', 'ACDF')); // ['A', 'C', 'D', 'F']
Python 实现
class LCS:
@staticmethod
def compute(a: str, b: str) -> int:
m, n = len(a), len(b)
dp = [[0] * (n + 1) for _ in range(m + 1)]
for i in range(1, m + 1):
for j in range(1, n + 1):
if a[i - 1] == b[j - 1]:
dp[i][j] = dp[i - 1][j - 1] + 1
else:
dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
return dp[m][n]
@staticmethod
def get_sequence(a: str, b: str) -> list:
m, n = len(a), len(b)
dp = [[0] * (n + 1) for _ in range(m + 1)]
for i in range(1, m + 1):
for j in range(1, n + 1):
if a[i - 1] == b[j - 1]:
dp[i][j] = dp[i - 1][j - 1] + 1
else:
dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
# 回溯
lcs = []
i, j = m, n
while i > 0 and j > 0:
if a[i - 1] == b[j - 1]:
lcs.append(a[i - 1])
i -= 1
j -= 1
elif dp[i - 1][j] > dp[i][j - 1]:
i -= 1
else:
j -= 1
return lcs[::-1]
# 使用示例
print(LCS.compute('ABCDEF', 'ACDF')) # 4
print(LCS.get_sequence('ABCDEF', 'ACDF')) # ['A', 'C', 'D', 'F']
Myers 差分算法
Myers 算法原理
Myers 算法是 Git 使用的 Diff 算法,时间复杂度为 O(ND),其中 N 是两个序列长度之和,D 是编辑距离。
核心思想:在编辑图上寻找从 (0,0) 到 (M,N) 的最短路径。
编辑图示例 (A="ABCABBA", B="CBABAC"):
C B A B A C
0 1 2 3 4 5 6
A 1 \
B 2 \
C 3 \
A 4 \
B 5 \
B 6 \
A 7 \
对角线移动 = 匹配(免费)
水平移动 = 插入
垂直移动 = 删除
Myers 算法实现
class MyersDiff {
static diff(a, b) {
const n = a.length;
const m = b.length;
const max = n + m;
const v = new Map();
v.set(1, 0);
const trace = [];
// 前向搜索
for (let d = 0; d <= max; d++) {
trace.push(new Map(v));
for (let k = -d; k <= d; k += 2) {
let x;
if (k === -d || (k !== d && v.get(k - 1) < v.get(k + 1))) {
x = v.get(k + 1); // 向下移动
} else {
x = v.get(k - 1) + 1; // 向右移动
}
let y = x - k;
// 沿对角线移动(匹配)
while (x < n && y < m && a[x] === b[y]) {
x++;
y++;
}
v.set(k, x);
if (x >= n && y >= m) {
return this.backtrack(trace, a, b, n, m);
}
}
}
return [];
}
static backtrack(trace, a, b, n, m) {
const result = [];
let x = n;
let y = m;
for (let d = trace.length - 1; d >= 0; d--) {
const v = trace[d];
const k = x - y;
let prevK;
if (k === -d || (k !== d && v.get(k - 1) < v.get(k + 1))) {
prevK = k + 1;
} else {
prevK = k - 1;
}
const prevX = v.get(prevK);
const prevY = prevX - prevK;
// 对角线移动(匹配)
while (x > prevX && y > prevY) {
result.unshift({ type: 'equal', value: a[x - 1] });
x--;
y--;
}
if (d > 0) {
if (x === prevX) {
result.unshift({ type: 'insert', value: b[y - 1] });
y--;
} else {
result.unshift({ type: 'delete', value: a[x - 1] });
x--;
}
}
}
return result;
}
}
// 使用示例
const diff = MyersDiff.diff('ABCABBA', 'CBABAC');
console.log(diff);
// [
// { type: 'delete', value: 'A' },
// { type: 'delete', value: 'B' },
// { type: 'equal', value: 'C' },
// { type: 'insert', value: 'B' },
// { type: 'equal', value: 'A' },
// { type: 'equal', value: 'B' },
// { type: 'delete', value: 'B' },
// { type: 'equal', value: 'A' },
// { type: 'insert', value: 'C' }
// ]
行级文本对比
行级 Diff 实现
class LineDiff {
static diff(textA, textB) {
const linesA = textA.split('\n');
const linesB = textB.split('\n');
// 使用 LCS 找出公共行
const lcs = this.lcsLines(linesA, linesB);
const result = [];
let i = 0, j = 0, k = 0;
while (i < linesA.length || j < linesB.length) {
if (k < lcs.length && i < linesA.length && linesA[i] === lcs[k]) {
if (j < linesB.length && linesB[j] === lcs[k]) {
result.push({ type: 'equal', line: linesA[i], lineA: i + 1, lineB: j + 1 });
i++;
j++;
k++;
} else {
result.push({ type: 'insert', line: linesB[j], lineB: j + 1 });
j++;
}
} else if (i < linesA.length) {
result.push({ type: 'delete', line: linesA[i], lineA: i + 1 });
i++;
} else if (j < linesB.length) {
result.push({ type: 'insert', line: linesB[j], lineB: j + 1 });
j++;
}
}
return result;
}
static lcsLines(a, b) {
const m = a.length;
const n = b.length;
const dp = Array(m + 1).fill(null)
.map(() => Array(n + 1).fill(0));
for (let i = 1; i <= m; i++) {
for (let j = 1; j <= n; j++) {
if (a[i - 1] === b[j - 1]) {
dp[i][j] = dp[i - 1][j - 1] + 1;
} else {
dp[i][j] = Math.max(dp[i - 1][j], dp[i][j - 1]);
}
}
}
const lcs = [];
let i = m, j = n;
while (i > 0 && j > 0) {
if (a[i - 1] === b[j - 1]) {
lcs.unshift(a[i - 1]);
i--;
j--;
} else if (dp[i - 1][j] > dp[i][j - 1]) {
i--;
} else {
j--;
}
}
return lcs;
}
static toUnifiedDiff(diff, contextLines = 3) {
const hunks = [];
let currentHunk = null;
let contextBuffer = [];
for (let i = 0; i < diff.length; i++) {
const item = diff[i];
if (item.type !== 'equal') {
if (!currentHunk) {
currentHunk = {
oldStart: item.lineA || item.lineB,
newStart: item.lineB || item.lineA,
oldLines: [],
newLines: [],
lines: [...contextBuffer.slice(-contextLines)]
};
}
if (item.type === 'delete') {
currentHunk.lines.push(`-${item.line}`);
} else {
currentHunk.lines.push(`+${item.line}`);
}
contextBuffer = [];
} else {
if (currentHunk) {
contextBuffer.push(` ${item.line}`);
if (contextBuffer.length <= contextLines) {
currentHunk.lines.push(` ${item.line}`);
} else {
hunks.push(currentHunk);
currentHunk = null;
contextBuffer = [` ${item.line}`];
}
} else {
contextBuffer.push(` ${item.line}`);
if (contextBuffer.length > contextLines) {
contextBuffer.shift();
}
}
}
}
if (currentHunk) {
hunks.push(currentHunk);
}
return hunks;
}
}
// 使用示例
const textA = `function hello() {
console.log("Hello");
return true;
}`;
const textB = `function hello() {
console.log("Hello World");
console.log("Welcome");
return true;
}`;
const diff = LineDiff.diff(textA, textB);
console.log(diff);
生成 Unified Diff 格式
class UnifiedDiff {
static generate(fileA, fileB, textA, textB) {
const diff = LineDiff.diff(textA, textB);
const hunks = LineDiff.toUnifiedDiff(diff);
let output = `--- ${fileA}\n+++ ${fileB}\n`;
for (const hunk of hunks) {
const oldCount = hunk.lines.filter(l => !l.startsWith('+')).length;
const newCount = hunk.lines.filter(l => !l.startsWith('-')).length;
output += `@@ -${hunk.oldStart},${oldCount} +${hunk.newStart},${newCount} @@\n`;
output += hunk.lines.join('\n') + '\n';
}
return output;
}
}
// 使用示例
const unifiedDiff = UnifiedDiff.generate('a.js', 'b.js', textA, textB);
console.log(unifiedDiff);
// --- a.js
// +++ b.js
// @@ -1,4 +1,5 @@
// function hello() {
// - console.log("Hello");
// + console.log("Hello World");
// + console.log("Welcome");
// return true;
// }
字符级对比
单词级 Diff
class WordDiff {
static diff(textA, textB) {
const wordsA = this.tokenize(textA);
const wordsB = this.tokenize(textB);
const lcs = this.lcs(wordsA, wordsB);
const result = [];
let i = 0, j = 0, k = 0;
while (i < wordsA.length || j < wordsB.length) {
if (k < lcs.length && i < wordsA.length && wordsA[i] === lcs[k]) {
if (j < wordsB.length && wordsB[j] === lcs[k]) {
result.push({ type: 'equal', value: wordsA[i] });
i++;
j++;
k++;
} else {
result.push({ type: 'insert', value: wordsB[j] });
j++;
}
} else if (i < wordsA.length) {
result.push({ type: 'delete', value: wordsA[i] });
i++;
} else if (j < wordsB.length) {
result.push({ type: 'insert', value: wordsB[j] });
j++;
}
}
return result;
}
static tokenize(text) {
return text.match(/\S+|\s+/g) || [];
}
static lcs(a, b) {
const m = a.length;
const n = b.length;
const dp = Array(m + 1).fill(null)
.map(() => Array(n + 1).fill(0));
for (let i = 1; i <= m; i++) {
for (let j = 1; j <= n; j++) {
if (a[i - 1] === b[j - 1]) {
dp[i][j] = dp[i - 1][j - 1] + 1;
} else {
dp[i][j] = Math.max(dp[i - 1][j], dp[i][j - 1]);
}
}
}
const lcs = [];
let i = m, j = n;
while (i > 0 && j > 0) {
if (a[i - 1] === b[j - 1]) {
lcs.unshift(a[i - 1]);
i--;
j--;
} else if (dp[i - 1][j] > dp[i][j - 1]) {
i--;
} else {
j--;
}
}
return lcs;
}
static toHTML(diff) {
return diff.map(item => {
switch (item.type) {
case 'delete':
return `<del class="diff-delete">${this.escapeHTML(item.value)}</del>`;
case 'insert':
return `<ins class="diff-insert">${this.escapeHTML(item.value)}</ins>`;
default:
return this.escapeHTML(item.value);
}
}).join('');
}
static escapeHTML(str) {
return str
.replace(/&/g, '&')
.replace(/</g, '<')
.replace(/>/g, '>');
}
}
// 使用示例
const diff = WordDiff.diff(
'The quick brown fox jumps over the lazy dog',
'The quick red fox leaps over the lazy cat'
);
console.log(WordDiff.toHTML(diff));
// The quick <del>brown</del><ins>red</ins> fox <del>jumps</del><ins>leaps</ins> over the lazy <del>dog</del><ins>cat</ins>
编辑距离 (Levenshtein Distance)
编辑距离原理
编辑距离是将一个字符串转换为另一个字符串所需的最少编辑操作数:
"kitten" → "sitting"
1. kitten → sitten (替换 k → s)
2. sitten → sittin (替换 e → i)
3. sittin → sitting (插入 g)
编辑距离 = 3
编辑距离实现
class LevenshteinDistance {
static compute(a, b) {
const m = a.length;
const n = b.length;
const dp = Array(m + 1).fill(null)
.map(() => Array(n + 1).fill(0));
// 初始化
for (let i = 0; i <= m; i++) dp[i][0] = i;
for (let j = 0; j <= n; j++) dp[0][j] = j;
// 填充
for (let i = 1; i <= m; i++) {
for (let j = 1; j <= n; j++) {
if (a[i - 1] === b[j - 1]) {
dp[i][j] = dp[i - 1][j - 1];
} else {
dp[i][j] = 1 + Math.min(
dp[i - 1][j], // 删除
dp[i][j - 1], // 插入
dp[i - 1][j - 1] // 替换
);
}
}
}
return dp[m][n];
}
static similarity(a, b) {
const maxLen = Math.max(a.length, b.length);
if (maxLen === 0) return 1;
const distance = this.compute(a, b);
return 1 - distance / maxLen;
}
static getOperations(a, b) {
const m = a.length;
const n = b.length;
const dp = Array(m + 1).fill(null)
.map(() => Array(n + 1).fill(0));
for (let i = 0; i <= m; i++) dp[i][0] = i;
for (let j = 0; j <= n; j++) dp[0][j] = j;
for (let i = 1; i <= m; i++) {
for (let j = 1; j <= n; j++) {
if (a[i - 1] === b[j - 1]) {
dp[i][j] = dp[i - 1][j - 1];
} else {
dp[i][j] = 1 + Math.min(
dp[i - 1][j],
dp[i][j - 1],
dp[i - 1][j - 1]
);
}
}
}
// 回溯获取操作序列
const operations = [];
let i = m, j = n;
while (i > 0 || j > 0) {
if (i > 0 && j > 0 && a[i - 1] === b[j - 1]) {
operations.unshift({ type: 'keep', char: a[i - 1] });
i--;
j--;
} else if (i > 0 && j > 0 && dp[i][j] === dp[i - 1][j - 1] + 1) {
operations.unshift({ type: 'replace', from: a[i - 1], to: b[j - 1] });
i--;
j--;
} else if (j > 0 && dp[i][j] === dp[i][j - 1] + 1) {
operations.unshift({ type: 'insert', char: b[j - 1] });
j--;
} else {
operations.unshift({ type: 'delete', char: a[i - 1] });
i--;
}
}
return operations;
}
}
// 使用示例
console.log(LevenshteinDistance.compute('kitten', 'sitting')); // 3
console.log(LevenshteinDistance.similarity('hello', 'hallo')); // 0.8
console.log(LevenshteinDistance.getOperations('kitten', 'sitting'));
实际应用
1. 代码对比组件
class CodeDiffViewer {
constructor(container) {
this.container = container;
}
render(codeA, codeB, options = {}) {
const { mode = 'split', language = 'javascript' } = options;
const diff = LineDiff.diff(codeA, codeB);
if (mode === 'split') {
return this.renderSplitView(diff);
} else {
return this.renderUnifiedView(diff);
}
}
renderSplitView(diff) {
let leftHTML = '<div class="diff-left">';
let rightHTML = '<div class="diff-right">';
for (const item of diff) {
const lineNum = item.lineA || item.lineB || '';
const escapedLine = this.escapeHTML(item.line);
switch (item.type) {
case 'delete':
leftHTML += `<div class="diff-line diff-delete">
<span class="line-num">${item.lineA}</span>
<span class="line-content">-${escapedLine}</span>
</div>`;
rightHTML += `<div class="diff-line diff-empty"></div>`;
break;
case 'insert':
leftHTML += `<div class="diff-line diff-empty"></div>`;
rightHTML += `<div class="diff-line diff-insert">
<span class="line-num">${item.lineB}</span>
<span class="line-content">+${escapedLine}</span>
</div>`;
break;
default:
leftHTML += `<div class="diff-line">
<span class="line-num">${item.lineA}</span>
<span class="line-content"> ${escapedLine}</span>
</div>`;
rightHTML += `<div class="diff-line">
<span class="line-num">${item.lineB}</span>
<span class="line-content"> ${escapedLine}</span>
</div>`;
}
}
leftHTML += '</div>';
rightHTML += '</div>';
return `<div class="diff-container split">${leftHTML}${rightHTML}</div>`;
}
renderUnifiedView(diff) {
let html = '<div class="diff-unified">';
for (const item of diff) {
const escapedLine = this.escapeHTML(item.line);
const lineA = item.lineA || '';
const lineB = item.lineB || '';
let className = 'diff-line';
let prefix = ' ';
if (item.type === 'delete') {
className += ' diff-delete';
prefix = '-';
} else if (item.type === 'insert') {
className += ' diff-insert';
prefix = '+';
}
html += `<div class="${className}">
<span class="line-num-old">${lineA}</span>
<span class="line-num-new">${lineB}</span>
<span class="line-content">${prefix}${escapedLine}</span>
</div>`;
}
html += '</div>';
return html;
}
escapeHTML(str) {
return str
.replace(/&/g, '&')
.replace(/</g, '<')
.replace(/>/g, '>');
}
}
2. 实时协作同步
class CollaborativeSync {
constructor() {
this.baseText = '';
this.pendingChanges = [];
}
applyChange(change) {
const { position, deleteCount, insertText } = change;
const before = this.baseText.slice(0, position);
const after = this.baseText.slice(position + deleteCount);
this.baseText = before + insertText + after;
return this.baseText;
}
computeChanges(oldText, newText) {
const diff = MyersDiff.diff(oldText, newText);
const changes = [];
let position = 0;
let currentChange = null;
for (const item of diff) {
if (item.type === 'equal') {
if (currentChange) {
changes.push(currentChange);
currentChange = null;
}
position++;
} else if (item.type === 'delete') {
if (!currentChange) {
currentChange = { position, deleteCount: 0, insertText: '' };
}
currentChange.deleteCount++;
position++;
} else if (item.type === 'insert') {
if (!currentChange) {
currentChange = { position, deleteCount: 0, insertText: '' };
}
currentChange.insertText += item.value;
}
}
if (currentChange) {
changes.push(currentChange);
}
return changes;
}
transformChange(changeA, changeB) {
// 操作转换 (OT) - 简化版本
if (changeA.position <= changeB.position) {
const offset = changeA.insertText.length - changeA.deleteCount;
return {
...changeB,
position: changeB.position + offset
};
}
return changeB;
}
}
3. 文档版本历史
class VersionHistory {
constructor() {
this.versions = [];
this.currentIndex = -1;
}
addVersion(content, metadata = {}) {
// 移除当前位置之后的版本
this.versions = this.versions.slice(0, this.currentIndex + 1);
const version = {
id: Date.now(),
content,
timestamp: new Date(),
...metadata
};
// 计算与上一版本的差异
if (this.versions.length > 0) {
const prevContent = this.versions[this.versions.length - 1].content;
version.diff = LineDiff.diff(prevContent, content);
version.stats = this.computeStats(version.diff);
}
this.versions.push(version);
this.currentIndex = this.versions.length - 1;
return version;
}
computeStats(diff) {
let additions = 0;
let deletions = 0;
for (const item of diff) {
if (item.type === 'insert') additions++;
if (item.type === 'delete') deletions++;
}
return { additions, deletions };
}
getVersion(index) {
return this.versions[index];
}
compareVersions(indexA, indexB) {
const versionA = this.versions[indexA];
const versionB = this.versions[indexB];
return LineDiff.diff(versionA.content, versionB.content);
}
undo() {
if (this.currentIndex > 0) {
this.currentIndex--;
return this.versions[this.currentIndex];
}
return null;
}
redo() {
if (this.currentIndex < this.versions.length - 1) {
this.currentIndex++;
return this.versions[this.currentIndex];
}
return null;
}
}
性能优化
1. 空间优化
// 使用滚动数组优化 LCS 空间复杂度
function lcsOptimized(a, b) {
const m = a.length;
const n = b.length;
// 只保留两行
let prev = Array(n + 1).fill(0);
let curr = Array(n + 1).fill(0);
for (let i = 1; i <= m; i++) {
for (let j = 1; j <= n; j++) {
if (a[i - 1] === b[j - 1]) {
curr[j] = prev[j - 1] + 1;
} else {
curr[j] = Math.max(prev[j], curr[j - 1]);
}
}
[prev, curr] = [curr, prev];
}
return prev[n];
}
2. 分块处理
function diffLargeFiles(textA, textB, chunkSize = 1000) {
const linesA = textA.split('\n');
const linesB = textB.split('\n');
const results = [];
for (let i = 0; i < Math.max(linesA.length, linesB.length); i += chunkSize) {
const chunkA = linesA.slice(i, i + chunkSize).join('\n');
const chunkB = linesB.slice(i, i + chunkSize).join('\n');
const diff = LineDiff.diff(chunkA, chunkB);
results.push(...diff);
}
return results;
}
总结
文本对比算法是软件开发的基础设施,核心要点:
- LCS 算法:找出最长公共子序列,是 Diff 的基础
- Myers 算法:Git 使用的高效 Diff 算法
- 编辑距离:衡量两个字符串的相似度
- 行级/字符级:不同粒度的对比需求
如需快速进行文本对比,可以使用我们的在线工具:
相关资源
- JSON 对比工具 - JSON 结构对比
- Markdown 编辑器 - 实时预览编辑器
- 正则表达式测试 - 正则匹配测试