Skip to content

Commit 025e5c4

Browse files
Merge pull request #491 from GridProtectionAlliance/optimize-levenshtein
GSF.Core: Optimize Levenshtein distance computations
2 parents ee9f5fd + 6ed05d6 commit 025e5c4

File tree

1 file changed

+43
-18
lines changed

1 file changed

+43
-18
lines changed

Source/Libraries/GSF.Core/FuzzyStrings/LevenshteinDistance.cs

Lines changed: 43 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -32,29 +32,54 @@ public static partial class ComparisonMetrics
3232
{
3333
/// <summary>
/// Computes the Levenshtein (edit) distance between two strings: the minimum number of
/// single-character insertions, deletions and substitutions required to transform
/// <paramref name="source"/> into <paramref name="target"/>.
/// </summary>
/// <param name="source">First string to compare.</param>
/// <param name="target">Second string to compare.</param>
/// <returns>
/// The edit distance between the two strings; zero when they are identical, and the
/// length of the other string when one of them is empty.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="source"/> or <paramref name="target"/> is <c>null</c>.
/// </exception>
public static int LevenshteinDistance(this string source, string target)
{
    if (source == null)
        throw new ArgumentNullException(nameof(source));

    if (target == null)
        throw new ArgumentNullException(nameof(target));

    // The distance is symmetric, so order the strings such that source is the
    // shorter one. This yields a wider matrix M with fewer rows, and therefore
    // fewer row swaps, without changing the result.
    if (source.Length > target.Length)
    {
        string longer = source;
        source = target;
        target = longer;
    }

    // source is the shorter string here; if it is empty the distance is simply
    // the number of characters in target and no working rows are needed.
    if (source.Length == 0)
        return target.Length;

    // Given a matrix M where M[i,j] is the Levenshtein distance of the
    // first i characters in source and the first j characters in target,
    // arr0 and arr1 represent two consecutive rows of matrix M
    int rows = source.Length + 1;
    int columns = target.Length + 1;
    int[] arr0 = new int[columns];
    int[] arr1 = new int[columns];

    // This fills in M[0] of the matrix:
    // against an empty source prefix, the distance is the number of characters taken from target
    for (int j = 0; j < columns; j++)
        arr0[j] = j;

    // In the following loop, arr0 is M[i-1] and arr1 is M[i]
    // We fill in the values for M[i] given that M[i-1] has already been filled in
    for (int i = 1; i < rows; i++)
    {
        // Fill in M[i,0]:
        // against an empty target prefix, the distance is the number of characters taken from source
        arr1[0] = i;

        // The source character is the same for the whole row, so read it once
        char sourceChar = source[i - 1];

        for (int j = 1; j < columns; j++)
        {
            int distance = sourceChar == target[j - 1] ? 0 : 1;

            // M[i,j] = min(M[i-1,j] + 1,
            //              M[i,j-1] + 1,
            //              M[i-1,j-1] + distance)
            //
            // Math.Min on plain ints keeps this innermost loop free of any
            // dependency on the project's Common.Min helper.
            arr1[j] = Math.Min(Math.Min(arr0[j] + 1, arr1[j - 1] + 1), arr0[j - 1] + distance);
        }

        // Move M[i] into arr0 for the next iteration
        // We no longer need M[i-1] so we reuse that array for arr1 in the next iteration
        int[] temp = arr0;
        arr0 = arr1;
        arr1 = temp;
    }

    // After the final swap, arr0 has the final row of the matrix M[rows-1]
    // The last column contains the distance for all characters in source and all characters in target
    // aka M[i,j] where i == source.Length and j == target.Length
    return arr0[columns - 1];
}
5984

6085
public static double NormalizedLevenshteinDistance(this string source, string target)

0 commit comments

Comments
 (0)