Skip to content

Commit d8dc29f

Browse files
committed
initial import
1 parent 6a42fe7 commit d8dc29f

File tree

6 files changed

+303
-46
lines changed

6 files changed

+303
-46
lines changed

README.md

Lines changed: 78 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,80 @@
1-
string-similarity
2-
=================
1+
#string-similarity
2+
3+
A library implementing different string similarity algorithms.
4+
5+
Currently implemented:
6+
- Levenshtein edit distance;
7+
- Jaro-Winkler similarity;
8+
- Longest Common Subsequence edit distance;
9+
- n-Gram distance.
10+
11+
## Download
12+
See [releases](https://github.com/tdebatty/java-string-similarity/releases).
13+
14+
## Levenshtein
15+
The Levenshtein distance between two words is the minimum number of single-character edits (insertions, deletions or substitutions) required to change one word into the other.
16+
17+
```java
18+
import info.debatty.stringsimilarity.*;
19+
20+
public class MyApp {
21+
22+
public static void main (String[] args) {
23+
Levenshtein l = new Levenshtein();
24+
25+
System.out.println(l.distanceAbsolute("My string", "My $tring"));
26+
System.out.println(l.distance("My string", "My $tring"));
27+
System.out.println(l.similarity("My string", "My $tring"));
28+
}
29+
}
30+
```
31+
32+
## Jaro-Winkler
33+
Jaro-Winkler is a string edit distance that was developed in the area of record linkage (duplicate detection) (Winkler, 1990). The Jaro–Winkler distance metric is designed for, and best suited to, short strings such as person names, and to detecting typos.
34+
35+
```java
36+
import info.debatty.stringsimilarity.*;
37+
38+
public class MyApp {
39+
40+
41+
public static void main(String[] args) {
42+
JaroWinkler jw = new JaroWinkler();
43+
44+
System.out.println(jw.distance("My string", "My $tring"));
45+
System.out.println(jw.similarity("My string", "My $tring"));
46+
}
47+
}
48+
```
49+
50+
51+
## Longest Common Subsequence
52+
53+
The longest common subsequence (LCS) problem consists in finding the longest subsequence common to two (or more) sequences. It differs from problems of finding common substrings: unlike substrings, subsequences are not required to occupy consecutive positions within the original sequences.
54+
55+
It is used by the diff utility, by Git for reconciling multiple changes, etc.
56+
57+
The LCS distance between Strings X (length n) and Y (length m) is n + m - 2 |LCS(X, Y)|
58+
min = 0
59+
max = n + m
60+
61+
LCS distance is equivalent to Levenshtein distance when only insertions and deletions are allowed (no substitutions), or when the cost of a substitution is double the cost of an insertion or deletion.
62+
63+
This class currently implements the dynamic programming approach, which has a space requirement of O(m * n).
64+
65+
```java
66+
import info.debatty.stringsimilarity.*;
67+
68+
public class MyApp {
69+
public static void main(String[] args) {
70+
LongestCommonSubsequence lcs = new LongestCommonSubsequence();
71+
72+
System.out.println(lcs.length("AGCAT", "GAC"));
73+
System.out.println(lcs.distanceAbsolute("AGCAT", "GAC"));
74+
System.out.println(lcs.distance("AGCAT", "GAC"));
75+
}
76+
}
77+
```
78+
379

4-
JAR implementing various string similarity and distance algorthms (Levenshtein, weighted levenshtein, jaro, jaro-winkler,...)
580

6-
Download JAR and documentation: http://www.debatty.info

string-similarity/src/info/debatty/stringsimilarity/JaroWinkler.java renamed to src/info/debatty/stringsimilarity/JaroWinkler.java

Lines changed: 63 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,23 @@
66
*
77
* @author tibo
88
*/
9-
public class JaroWinkler {
9+
public class JaroWinkler implements StringSimilarityInterface {
10+
11+
12+
public static void main(String[] args) {
13+
JaroWinkler jw = new JaroWinkler();
14+
15+
System.out.println(jw.distance("My string", "My $tring"));
16+
System.out.println(jw.similarity("My string", "My $tring"));
17+
}
1018

1119
/**
12-
* The Jaro–Winkler distance is in fact a measure of similarity between two
13-
* strings: 0 means no similarity and 1 is an exact match.
20+
* Jaro-Winkler is a string edit distance that was developed in the area of
21+
* record linkage (duplicate detection) (Winkler, 1990).
22+
*
23+
* The Jaro–Winkler distance metric is designed and best suited for short
24+
* strings such as person names, and to detect typos.
25+
*
1426
* http://en.wikipedia.org/wiki/Jaro-Winkler_distance
1527
*
1628
* @param s0
@@ -19,11 +31,57 @@ public class JaroWinkler {
1931
*/
2032
public static double Similarity(String s0, String s1) {
2133
JaroWinkler jw = new JaroWinkler();
22-
return jw.sim(s0, s1);
34+
return jw.similarity(s0, s1);
2335
}
2436

37+
private double threshold = 0.7;
38+
39+
public JaroWinkler() {
40+
41+
}
2542

26-
private float threshold = 0.7f;
43+
public JaroWinkler(double threshold) {
44+
this.setThreshold(threshold);
45+
}
46+
47+
@Override
48+
public double similarity(String s1, String s2) {
49+
int[] mtp = matches(s1, s2);
50+
float m = mtp[0];
51+
if (m == 0) {
52+
return 0f;
53+
}
54+
float j = ((m / s1.length() + m / s2.length() + (m - mtp[1]) / m)) / 3;
55+
float jw = j < getThreshold() ? j : j + Math.min(0.1f, 1f / mtp[3]) * mtp[2]
56+
* (1 - j);
57+
return jw;
58+
}
59+
60+
@Override
61+
public double distance(String s1, String s2) {
62+
return 1.0 - similarity(s1, s2);
63+
}
64+
65+
/**
66+
* Sets the threshold used to determine when Winkler bonus should be used.
67+
* Set to a negative value to get the Jaro distance.
68+
* Default value is 0.7
69+
*
70+
* @param threshold the new value of the threshold
71+
*/
72+
public final void setThreshold(double threshold) {
73+
this.threshold = threshold;
74+
}
75+
76+
/**
77+
* Returns the current value of the threshold used for adding the Winkler
78+
* bonus. The default value is 0.7.
79+
*
80+
* @return the current value of the threshold
81+
*/
82+
public double getThreshold() {
83+
return threshold;
84+
}
2785

2886
private int[] matches(String s1, String s2) {
2987
String max, min;
@@ -81,37 +139,4 @@ private int[] matches(String s1, String s2) {
81139
}
82140
return new int[]{matches, transpositions / 2, prefix, max.length()};
83141
}
84-
85-
public float sim(String s1, String s2) {
86-
int[] mtp = matches(s1, s2);
87-
float m = mtp[0];
88-
if (m == 0) {
89-
return 0f;
90-
}
91-
float j = ((m / s1.length() + m / s2.length() + (m - mtp[1]) / m)) / 3;
92-
float jw = j < getThreshold() ? j : j + Math.min(0.1f, 1f / mtp[3]) * mtp[2]
93-
* (1 - j);
94-
return jw;
95-
}
96-
97-
/**
98-
* Sets the threshold used to determine when Winkler bonus should be used.
99-
* Set to a negative value to get the Jaro distance.
100-
*
101-
* @param threshold the new value of the threshold
102-
*/
103-
public void setThreshold(float threshold) {
104-
this.threshold = threshold;
105-
}
106-
107-
/**
108-
* Returns the current value of the threshold used for adding the Winkler
109-
* bonus. The default value is 0.7.
110-
*
111-
* @return the current value of the threshold
112-
*/
113-
public float getThreshold() {
114-
return threshold;
115-
}
116-
117142
}

string-similarity/src/info/debatty/stringsimilarity/Levenshtein.java renamed to src/info/debatty/stringsimilarity/Levenshtein.java

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,38 @@
11
package info.debatty.stringsimilarity;
22

33
/**
4-
*
5-
* @author tibo
4+
* The Levenshtein distance between two words is the minimum number of
5+
* single-character edits (insertions, deletions or substitutions) required to
6+
* change one word into the other.
7+
*
8+
* @author Thibault Debatty
69
*/
7-
public class Levenshtein {
10+
public class Levenshtein implements StringSimilarityInterface {
11+
12+
public static void main (String[] args) {
13+
Levenshtein l = new Levenshtein();
14+
15+
System.out.println(l.distanceAbsolute("My string", "My $tring"));
16+
System.out.println(l.distance("My string", "My $tring"));
17+
System.out.println(l.similarity("My string", "My $tring"));
18+
}
19+
20+
public static int Distance(String s1, String s2) {
21+
Levenshtein l = new Levenshtein();
22+
return l.distanceAbsolute(s1, s2);
23+
}
824

25+
@Override
26+
public double distance(String s1, String s2) {
27+
return ((double) distanceAbsolute(s1, s2)) / Math.max(s1.length(), s2.length());
28+
29+
}
30+
31+
@Override
32+
public double similarity(String s1, String s2) {
33+
return 1.0 - distance(s1, s2);
34+
}
35+
936
/**
1037
* The Levenshtein distance, or edit distance, between two words is the
1138
* minimum number of single-character edits (i.e. insertions, deletions or
@@ -25,7 +52,7 @@ public class Levenshtein {
2552
* @param s1
2653
* @return
2754
*/
28-
public static int Distance(String s0, String s1) {
55+
public int distanceAbsolute(String s0, String s1) {
2956
int len0 = s0.length() + 1;
3057
int len1 = s1.length() + 1;
3158

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
package info.debatty.stringsimilarity;
2+
3+
/**
4+
* The longest common subsequence (LCS) problem consists in finding the
5+
* longest subsequence common to two (or more) sequences. It differs from
6+
* problems of finding common substrings: unlike substrings, subsequences are
7+
* not required to occupy consecutive positions within the original sequences.
8+
*
9+
* It is used by the diff utility, by Git for reconciling multiple changes, etc.
10+
*
11+
* The LCS distance between Strings X (length n) and Y (length m) is
12+
* n + m - 2 |LCS(X, Y)|
13+
* min = 0
14+
* max = n + m
15+
*
16+
* LCS distance is equivalent to Levenshtein distance, when only insertion and
17+
* deletion are allowed (no substitution), or when the cost of a substitution
18+
* is double the cost of an insertion or deletion.
19+
*
20+
* ! This class currently implements the dynamic programming approach, which
21+
* has a space requirement O(m * n)!
22+
*
23+
* @author tibo
24+
*/
25+
public class LongestCommonSubsequence implements StringSimilarityInterface {
26+
27+
/**
28+
* @param args the command line arguments
29+
*/
30+
public static void main(String[] args) {
31+
LongestCommonSubsequence lcs = new LongestCommonSubsequence();
32+
33+
System.out.println(lcs.length("AGCAT", "GAC"));
34+
System.out.println(lcs.distanceAbsolute("AGCAT", "GAC"));
35+
System.out.println(lcs.distance("AGCAT", "GAC"));
36+
}
37+
38+
public static int Distance(String s1, String s2) {
39+
LongestCommonSubsequence lcs = new LongestCommonSubsequence();
40+
return lcs.distanceAbsolute(s1, s2);
41+
}
42+
43+
@Override
44+
public double similarity(String s1, String s2) {
45+
return 1.0 - distance(s1, s2);
46+
}
47+
48+
@Override
49+
public double distance(String s1, String s2) {
50+
return ((double) distanceAbsolute(s1, s2)) / (s1.length() + s2.length());
51+
}
52+
53+
54+
public int distanceAbsolute(String s1, String s2) {
55+
return s1.length() + s2.length() - 2 * length(s1, s2);
56+
}
57+
58+
public int length(String s1, String s2) {
59+
/* function LCSLength(X[1..m], Y[1..n])
60+
C = array(0..m, 0..n)
61+
62+
for i := 0..m
63+
C[i,0] = 0
64+
65+
for j := 0..n
66+
C[0,j] = 0
67+
68+
for i := 1..m
69+
for j := 1..n
70+
if X[i] = Y[j]
71+
C[i,j] := C[i-1,j-1] + 1
72+
else
73+
C[i,j] := max(C[i,j-1], C[i-1,j])
74+
return C[m,n]
75+
*/
76+
int m = s1.length();
77+
int n = s2.length();
78+
char[] X = s1.toCharArray();
79+
char[] Y = s2.toCharArray();
80+
81+
int[][] C = new int[m+1][n+1];
82+
83+
for (int i = 0; i <= m; i++) {
84+
C[i][0] = 0;
85+
}
86+
87+
for (int j = 0; j <= n; j++) {
88+
C[0][j] = 0;
89+
}
90+
91+
for (int i = 1; i <=m ; i++) {
92+
for (int j = 1; j <= n; j++) {
93+
if (X[i-1] == Y[j-1]) {
94+
C[i][j] = C[i-1][j-1] + 1;
95+
96+
} else {
97+
C[i][j] = Math.max(C[i][j-1], C[i-1][j]);
98+
}
99+
}
100+
}
101+
102+
return C[m][n];
103+
104+
}
105+
106+
}
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
2+
package info.debatty.stringsimilarity;
3+
4+
/**
5+
*
6+
* @author tibo
7+
*/
8+
public interface StringSimilarityInterface {

    /**
     * Measures how alike two strings are.
     *
     * @param s1 the first string
     * @param s2 the second string
     * @return similarity between 0 (completely different) and 1 (s1 = s2)
     */
    double similarity(String s1, String s2);

    /**
     * Measures how far apart two strings are.
     * Generally, distance = 1 - similarity.
     * Some implementations can also provide a method distanceAbsolute.
     *
     * @param s1 the first string
     * @param s2 the second string
     * @return distance between 0 (s1 = s2) and 1 (completely different)
     */
    double distance(String s1, String s2);
}

0 commit comments

Comments
 (0)