Text comparison is a core functionality in software development. From Git version control to code review, from document comparison to data synchronization, efficient diff algorithms are essential. This article provides an in-depth explanation of text comparison principles and implementations.
Text Diff Basics
What is Diff?
Diff (difference) refers to the differences between two texts. The goal of a diff algorithm is to find the minimum edit operations needed to transform text A into text B:
Text A: "Hello World"
Text B: "Hello Diff World"
Difference: Insert "Diff " after "Hello "
Edit Operation Types
| Operation | Symbol | Description |
|---|---|---|
| Insert | + | Add new content |
| Delete | - | Remove content |
| Keep | = | Content unchanged |
| Replace | ~ | Delete then insert |
Use Cases
- Version Control: Git, SVN, etc.
- Code Review: Pull Request comparisons
- Document Collaboration: Google Docs, Notion
- Data Synchronization: Incremental updates
- Spell Checking: Edit distance calculation
Longest Common Subsequence (LCS)
LCS Principles
The Longest Common Subsequence is the foundation of diff algorithms. LCS finds the longest common part of two sequences (not necessarily contiguous):
Sequence A: A B C D E F
Sequence B: A C D F
LCS: A C D F (length 4)
Dynamic Programming Solution
The classic solution for LCS uses dynamic programming:
State transition:
If A[i] == B[j]:
dp[i][j] = dp[i-1][j-1] + 1
Else:
dp[i][j] = max(dp[i-1][j], dp[i][j-1])
JavaScript Implementation
class LCS {
static compute(a, b) {
const m = a.length;
const n = b.length;
const dp = Array(m + 1).fill(null)
.map(() => Array(n + 1).fill(0));
for (let i = 1; i <= m; i++) {
for (let j = 1; j <= n; j++) {
if (a[i - 1] === b[j - 1]) {
dp[i][j] = dp[i - 1][j - 1] + 1;
} else {
dp[i][j] = Math.max(dp[i - 1][j], dp[i][j - 1]);
}
}
}
return dp[m][n];
}
static getSequence(a, b) {
const m = a.length;
const n = b.length;
const dp = Array(m + 1).fill(null)
.map(() => Array(n + 1).fill(0));
for (let i = 1; i <= m; i++) {
for (let j = 1; j <= n; j++) {
if (a[i - 1] === b[j - 1]) {
dp[i][j] = dp[i - 1][j - 1] + 1;
} else {
dp[i][j] = Math.max(dp[i - 1][j], dp[i][j - 1]);
}
}
}
const lcs = [];
let i = m, j = n;
while (i > 0 && j > 0) {
if (a[i - 1] === b[j - 1]) {
lcs.unshift(a[i - 1]);
i--;
j--;
} else if (dp[i - 1][j] > dp[i][j - 1]) {
i--;
} else {
j--;
}
}
return lcs;
}
}
// Usage
console.log(LCS.compute('ABCDEF', 'ACDF')); // 4
console.log(LCS.getSequence('ABCDEF', 'ACDF')); // ['A', 'C', 'D', 'F']
Python Implementation
class LCS:
@staticmethod
def compute(a: str, b: str) -> int:
m, n = len(a), len(b)
dp = [[0] * (n + 1) for _ in range(m + 1)]
for i in range(1, m + 1):
for j in range(1, n + 1):
if a[i - 1] == b[j - 1]:
dp[i][j] = dp[i - 1][j - 1] + 1
else:
dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
return dp[m][n]
@staticmethod
def get_sequence(a: str, b: str) -> list:
m, n = len(a), len(b)
dp = [[0] * (n + 1) for _ in range(m + 1)]
for i in range(1, m + 1):
for j in range(1, n + 1):
if a[i - 1] == b[j - 1]:
dp[i][j] = dp[i - 1][j - 1] + 1
else:
dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
lcs = []
i, j = m, n
while i > 0 and j > 0:
if a[i - 1] == b[j - 1]:
lcs.append(a[i - 1])
i -= 1
j -= 1
elif dp[i - 1][j] > dp[i][j - 1]:
i -= 1
else:
j -= 1
return lcs[::-1]
# Usage
print(LCS.compute('ABCDEF', 'ACDF')) # 4
print(LCS.get_sequence('ABCDEF', 'ACDF')) # ['A', 'C', 'D', 'F']
Myers Diff Algorithm
Myers Algorithm Principles
Myers algorithm is the diff algorithm used by Git, with time complexity O(ND), where N is the sum of both sequence lengths and D is the edit distance.
Core idea: Find the shortest path from (0,0) to (M,N) on an edit graph.
Edit graph example (A="ABCABBA", B="CBABAC"):
C B A B A C
0 1 2 3 4 5 6
A 1 \
B 2 \
C 3 \
A 4 \
B 5 \
B 6 \
A 7 \
Diagonal move = Match (free)
Horizontal move = Insert
Vertical move = Delete
Myers Algorithm Implementation
class MyersDiff {
static diff(a, b) {
const n = a.length;
const m = b.length;
const max = n + m;
const v = new Map();
v.set(1, 0);
const trace = [];
for (let d = 0; d <= max; d++) {
trace.push(new Map(v));
for (let k = -d; k <= d; k += 2) {
let x;
if (k === -d || (k !== d && v.get(k - 1) < v.get(k + 1))) {
x = v.get(k + 1);
} else {
x = v.get(k - 1) + 1;
}
let y = x - k;
while (x < n && y < m && a[x] === b[y]) {
x++;
y++;
}
v.set(k, x);
if (x >= n && y >= m) {
return this.backtrack(trace, a, b, n, m);
}
}
}
return [];
}
static backtrack(trace, a, b, n, m) {
const result = [];
let x = n;
let y = m;
for (let d = trace.length - 1; d >= 0; d--) {
const v = trace[d];
const k = x - y;
let prevK;
if (k === -d || (k !== d && v.get(k - 1) < v.get(k + 1))) {
prevK = k + 1;
} else {
prevK = k - 1;
}
const prevX = v.get(prevK);
const prevY = prevX - prevK;
while (x > prevX && y > prevY) {
result.unshift({ type: 'equal', value: a[x - 1] });
x--;
y--;
}
if (d > 0) {
if (x === prevX) {
result.unshift({ type: 'insert', value: b[y - 1] });
y--;
} else {
result.unshift({ type: 'delete', value: a[x - 1] });
x--;
}
}
}
return result;
}
}
// Usage
const diff = MyersDiff.diff('ABCABBA', 'CBABAC');
console.log(diff);
Line-Level Text Diff
Line Diff Implementation
class LineDiff {
static diff(textA, textB) {
const linesA = textA.split('\n');
const linesB = textB.split('\n');
const lcs = this.lcsLines(linesA, linesB);
const result = [];
let i = 0, j = 0, k = 0;
while (i < linesA.length || j < linesB.length) {
if (k < lcs.length && i < linesA.length && linesA[i] === lcs[k]) {
if (j < linesB.length && linesB[j] === lcs[k]) {
result.push({ type: 'equal', line: linesA[i], lineA: i + 1, lineB: j + 1 });
i++;
j++;
k++;
} else {
result.push({ type: 'insert', line: linesB[j], lineB: j + 1 });
j++;
}
} else if (i < linesA.length) {
result.push({ type: 'delete', line: linesA[i], lineA: i + 1 });
i++;
} else if (j < linesB.length) {
result.push({ type: 'insert', line: linesB[j], lineB: j + 1 });
j++;
}
}
return result;
}
static lcsLines(a, b) {
const m = a.length;
const n = b.length;
const dp = Array(m + 1).fill(null)
.map(() => Array(n + 1).fill(0));
for (let i = 1; i <= m; i++) {
for (let j = 1; j <= n; j++) {
if (a[i - 1] === b[j - 1]) {
dp[i][j] = dp[i - 1][j - 1] + 1;
} else {
dp[i][j] = Math.max(dp[i - 1][j], dp[i][j - 1]);
}
}
}
const lcs = [];
let i = m, j = n;
while (i > 0 && j > 0) {
if (a[i - 1] === b[j - 1]) {
lcs.unshift(a[i - 1]);
i--;
j--;
} else if (dp[i - 1][j] > dp[i][j - 1]) {
i--;
} else {
j--;
}
}
return lcs;
}
}
// Usage
const textA = `function hello() {
console.log("Hello");
return true;
}`;
const textB = `function hello() {
console.log("Hello World");
console.log("Welcome");
return true;
}`;
const diff = LineDiff.diff(textA, textB);
console.log(diff);
Edit Distance (Levenshtein Distance)
Edit Distance Principles
Edit distance is the minimum number of edit operations needed to transform one string into another:
"kitten" → "sitting"
1. kitten → sitten (replace k → s)
2. sitten → sittin (replace e → i)
3. sittin → sitting (insert g)
Edit distance = 3
Edit Distance Implementation
class LevenshteinDistance {
static compute(a, b) {
const m = a.length;
const n = b.length;
const dp = Array(m + 1).fill(null)
.map(() => Array(n + 1).fill(0));
for (let i = 0; i <= m; i++) dp[i][0] = i;
for (let j = 0; j <= n; j++) dp[0][j] = j;
for (let i = 1; i <= m; i++) {
for (let j = 1; j <= n; j++) {
if (a[i - 1] === b[j - 1]) {
dp[i][j] = dp[i - 1][j - 1];
} else {
dp[i][j] = 1 + Math.min(
dp[i - 1][j],
dp[i][j - 1],
dp[i - 1][j - 1]
);
}
}
}
return dp[m][n];
}
static similarity(a, b) {
const maxLen = Math.max(a.length, b.length);
if (maxLen === 0) return 1;
const distance = this.compute(a, b);
return 1 - distance / maxLen;
}
}
// Usage
console.log(LevenshteinDistance.compute('kitten', 'sitting')); // 3
console.log(LevenshteinDistance.similarity('hello', 'hallo')); // 0.8
Word-Level Diff
class WordDiff {
static diff(textA, textB) {
const wordsA = this.tokenize(textA);
const wordsB = this.tokenize(textB);
const lcs = this.lcs(wordsA, wordsB);
const result = [];
let i = 0, j = 0, k = 0;
while (i < wordsA.length || j < wordsB.length) {
if (k < lcs.length && i < wordsA.length && wordsA[i] === lcs[k]) {
if (j < wordsB.length && wordsB[j] === lcs[k]) {
result.push({ type: 'equal', value: wordsA[i] });
i++;
j++;
k++;
} else {
result.push({ type: 'insert', value: wordsB[j] });
j++;
}
} else if (i < wordsA.length) {
result.push({ type: 'delete', value: wordsA[i] });
i++;
} else if (j < wordsB.length) {
result.push({ type: 'insert', value: wordsB[j] });
j++;
}
}
return result;
}
static tokenize(text) {
return text.match(/\S+|\s+/g) || [];
}
static lcs(a, b) {
const m = a.length;
const n = b.length;
const dp = Array(m + 1).fill(null)
.map(() => Array(n + 1).fill(0));
for (let i = 1; i <= m; i++) {
for (let j = 1; j <= n; j++) {
if (a[i - 1] === b[j - 1]) {
dp[i][j] = dp[i - 1][j - 1] + 1;
} else {
dp[i][j] = Math.max(dp[i - 1][j], dp[i][j - 1]);
}
}
}
const lcs = [];
let i = m, j = n;
while (i > 0 && j > 0) {
if (a[i - 1] === b[j - 1]) {
lcs.unshift(a[i - 1]);
i--;
j--;
} else if (dp[i - 1][j] > dp[i][j - 1]) {
i--;
} else {
j--;
}
}
return lcs;
}
static toHTML(diff) {
return diff.map(item => {
switch (item.type) {
case 'delete':
return `<del class="diff-delete">${this.escapeHTML(item.value)}</del>`;
case 'insert':
return `<ins class="diff-insert">${this.escapeHTML(item.value)}</ins>`;
default:
return this.escapeHTML(item.value);
}
}).join('');
}
static escapeHTML(str) {
return str
.replace(/&/g, '&')
.replace(/</g, '<')
.replace(/>/g, '>');
}
}
// Usage
const diff = WordDiff.diff(
'The quick brown fox jumps over the lazy dog',
'The quick red fox leaps over the lazy cat'
);
console.log(WordDiff.toHTML(diff));
Performance Optimization
Space Optimization
function lcsOptimized(a, b) {
const m = a.length;
const n = b.length;
let prev = Array(n + 1).fill(0);
let curr = Array(n + 1).fill(0);
for (let i = 1; i <= m; i++) {
for (let j = 1; j <= n; j++) {
if (a[i - 1] === b[j - 1]) {
curr[j] = prev[j - 1] + 1;
} else {
curr[j] = Math.max(prev[j], curr[j - 1]);
}
}
[prev, curr] = [curr, prev];
}
return prev[n];
}
Chunked Processing
function diffLargeFiles(textA, textB, chunkSize = 1000) {
const linesA = textA.split('\n');
const linesB = textB.split('\n');
const results = [];
for (let i = 0; i < Math.max(linesA.length, linesB.length); i += chunkSize) {
const chunkA = linesA.slice(i, i + chunkSize).join('\n');
const chunkB = linesB.slice(i, i + chunkSize).join('\n');
const diff = LineDiff.diff(chunkA, chunkB);
results.push(...diff);
}
return results;
}
Summary
Text diff algorithms are fundamental infrastructure in software development. Key points:
- LCS Algorithm: Find the longest common subsequence, the foundation of diff
- Myers Algorithm: Efficient diff algorithm used by Git
- Edit Distance: Measure similarity between two strings
- Line/Character Level: Different granularity for different needs
For quick text comparison, try our online tools:
- Text Diff Tool - Line and character level comparison
- Code Diff Tool - Code comparison with syntax highlighting
Related Resources
- JSON Diff Tool - JSON structure comparison
- Markdown Editor - Live preview editor
- Regex Tester - Regular expression testing