@@ -32,29 +32,54 @@ public static partial class ComparisonMetrics
3232 {
/// <summary>
/// Computes the Levenshtein (edit) distance between two strings: the minimum number of
/// single-character insertions, deletions and substitutions needed to turn one string
/// into the other. Characters are compared as individual <c>char</c> values.
/// </summary>
/// <param name="source">The first string to compare. Must not be null.</param>
/// <param name="target">The second string to compare. Must not be null.</param>
/// <returns>The edit distance between the two strings; 0 when they are identical.</returns>
/// <exception cref="System.ArgumentNullException">
/// <paramref name="source"/> or <paramref name="target"/> is null.
/// </exception>
public static int LevenshteinDistance(this string source, string target)
{
    // An extension method can be invoked on a null receiver, so validate both arguments
    // explicitly instead of failing later with a NullReferenceException.
    if (source == null)
    {
        throw new System.ArgumentNullException("source");
    }
    if (target == null)
    {
        throw new System.ArgumentNullException("target");
    }

    // The distance is symmetric, so make source the shorter string. That gives a wider
    // matrix with fewer rows, and therefore fewer row swaps in the loop below.
    if (source.Length > target.Length)
    {
        string swapped = source;
        source = target;
        target = swapped;
    }

    // If the shorter string is empty, the distance is the length of the other one.
    // Returning here avoids allocating the row buffers.
    if (source.Length == 0)
    {
        return target.Length;
    }

    // Conceptually M[i, j] is the distance between the first i characters of source and
    // the first j characters of target. Only two consecutive rows are kept in memory.
    int rows = source.Length + 1;
    int columns = target.Length + 1;
    int[] previousRow = new int[columns];
    int[] currentRow = new int[columns];

    // Row M[0]: turning an empty prefix of source into the first j characters of target
    // takes j insertions.
    for (int j = 0; j < columns; j++)
    {
        previousRow[j] = j;
    }

    // Invariant inside the loop: previousRow holds M[i - 1], currentRow is being filled as M[i].
    for (int i = 1; i < rows; i++)
    {
        // Column M[i, 0]: turning the first i characters of source into an empty string
        // takes i deletions.
        currentRow[0] = i;

        for (int j = 1; j < columns; j++)
        {
            int substitutionCost = source[i - 1] == target[j - 1] ? 0 : 1;

            // M[i, j] = min(M[i - 1, j] + 1, M[i, j - 1] + 1, M[i - 1, j - 1] + cost)
            int deletion = previousRow[j] + 1;
            int insertion = currentRow[j - 1] + 1;
            int substitution = previousRow[j - 1] + substitutionCost;

            currentRow[j] = System.Math.Min(System.Math.Min(deletion, insertion), substitution);
        }

        // M[i] becomes the previous row for the next iteration; the old M[i - 1] buffer
        // is no longer needed and is reused as the next current row.
        int[] recycled = previousRow;
        previousRow = currentRow;
        currentRow = recycled;
    }

    // After the final swap, previousRow holds the last row of M. Its last cell is the
    // distance between all of source and all of target.
    return previousRow[columns - 1];
}
5984
6085 public static double NormalizedLevenshteinDistance ( this string source , string target )
0 commit comments