Skip to content

Commit 4c1aadf

Browse files
author
tibo
committed
N-gram distance
1 parent 797a7e2 commit 4c1aadf

File tree

1 file changed

+100
-0
lines changed
  • string-similarity/src/info/debatty/stringsimilarity

1 file changed

+100
-0
lines changed
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
package info.debatty.stringsimilarity;
2+
3+
/**
 * N-gram based comparison of two strings.
 *
 * Reference paper:
 * http://webdocs.cs.ualberta.ca/~kondrak/papers/spire05.pdf
 *
 * <p>NOTE: despite their name, the {@code Distance} methods of this class
 * return a normalized <em>similarity</em>: 1.0 for identical strings and 0.0
 * for strings that have nothing in common.
 *
 * @author tibo
 */
public class NGram {

    /**
     * Compares two strings using n-grams of size 2.
     *
     * @param s0 first string, must not be null
     * @param s1 second string, must not be null
     * @return a score between 0.0 (nothing in common) and 1.0 (identical)
     * @throws NullPointerException if either argument is null
     */
    public static double Distance(String s0, String s1) {
        return Distance(s0, s1, 2);
    }

    /**
     * Compares two strings using n-grams of size {@code n}.
     *
     * <p>Three cases are handled:
     * <ul>
     * <li>at least one string is empty: 1 if both are empty, 0 otherwise;</li>
     * <li>at least one string is shorter than {@code n}: the fraction of
     *     positions holding the same character, relative to the longer
     *     string;</li>
     * <li>otherwise: an edit-distance style dynamic program over n-grams,
     *     normalized by the longer length and subtracted from 1.</li>
     * </ul>
     *
     * @param s0 first string, must not be null
     * @param s1 second string, must not be null
     * @param n n-gram size (only ever called with 2 from within this class)
     * @return a score between 0.0 and 1.0
     * @throws NullPointerException if either string is null
     */
    private static double Distance(String s0, String s1, int n) {
        if (s0 == null) {
            throw new NullPointerException("s0 must not be null");
        }
        if (s1 == null) {
            throw new NullPointerException("s1 must not be null");
        }

        final int sl = s0.length();
        final int tl = s1.length();

        if (sl == 0 || tl == 0) {
            return (sl == tl) ? 1 : 0;
        }

        if (sl < n || tl < n) {
            // Too short to build a full n-gram: count positional matches.
            int matches = 0;
            for (int i = 0, ni = Math.min(sl, tl); i < ni; i++) {
                if (s0.charAt(i) == s1.charAt(i)) {
                    matches++;
                }
            }
            // float arithmetic is kept on purpose so results stay identical
            // to the historical ones.
            return (float) matches / Math.max(sl, tl);
        }

        float[] previous = new float[sl + 1]; // cost row for j - 1
        float[] current = new float[sl + 1];  // cost row being filled for j

        for (int i = 0; i <= sl; i++) {
            previous[i] = i;
        }

        for (int j = 1; j <= tl; j++) {
            current[0] = j;
            for (int i = 1; i <= sl; i++) {
                float ec = ngramCost(s0, i, s1, j, n);
                // minimum of: cell to the left + 1, cell above + 1,
                // diagonal cell + n-gram substitution cost
                current[i] = Math.min(
                        Math.min(current[i - 1] + 1, previous[i] + 1),
                        previous[i - 1] + ec);
            }
            // the row just computed becomes the 'previous' row
            float[] swap = previous;
            previous = current;
            current = swap;
        }

        // after the final swap, 'previous' holds the most recent row
        return 1.0 - (previous[sl] / Math.max(tl, sl));
    }

    /**
     * Substitution cost between the n-gram of {@code s0} ending at character
     * {@code i} and the n-gram of {@code s1} ending at character {@code j}
     * (both 1-based). N-grams that start before the beginning of a string are
     * virtually left-padded.
     *
     * <p>Padding is recognized by position (negative index), not by a
     * sentinel character value, so strings that contain {@code '\0'} are
     * compared like any other string. Positions where both n-grams are padded
     * are left out of the average. Since {@code i >= 1}, the last position is
     * never padded on the {@code s0} side, so the divisor is always at least 1.
     */
    private static float ngramCost(String s0, int i, String s1, int j, int n) {
        int cost = 0;
        int compared = n;
        for (int k = 0; k < n; k++) {
            int si = i - n + k; // index into s0, negative means padding
            int ti = j - n + k; // index into s1, negative means padding
            boolean sPad = si < 0;
            boolean tPad = ti < 0;
            if (sPad && tPad) {
                compared--; // discount matches on the padding
            } else if (sPad || tPad || s0.charAt(si) != s1.charAt(ti)) {
                cost++;
            }
        }
        return (float) cost / compared;
    }
}

0 commit comments

Comments
 (0)