Skip to content

Commit 7bd67ed

Browse files
committed
Refactored to use profiles
1 parent 7e9a333 commit 7bd67ed

File tree

7 files changed

+176
-232
lines changed

7 files changed

+176
-232
lines changed

README.md

Lines changed: 31 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -84,10 +84,10 @@ public class MyApp {
8484
public static void main(String[] args) {
8585
Damerau d = new Damerau();
8686

87-
// One substitution
87+
// One transposition
8888
System.out.println(d.absoluteDistance("ABCDEF", "ABDCEF"));
8989

90-
// Substitution of 2 characters that are far from each other
90+
// Transposition of 2 characters that are far from each other
9191
// => 1 deletion + 1 insertion
9292
System.out.println(d.absoluteDistance("ABCDEF", "BCDAEF"));
9393

@@ -155,12 +155,34 @@ public class MyApp {
155155
}
156156
```
157157

158+
## N-Gram similarity (Kondrak)
159+
160+
N-Gram Similarity as defined by Kondrak, "N-Gram Similarity and Distance", String Processing and Information Retrieval, Lecture Notes in Computer Science Volume 3772, 2005, pp 115-126.
161+
162+
http://webdocs.cs.ualberta.ca/~kondrak/papers/spire05.pdf
163+
164+
The algorithm uses affixing with special character '\n' to increase the weight of first characters. The normalization is achieved by dividing the total similarity score by the original length of the longer word.
165+
166+
```java
167+
import info.debatty.java.stringsimilarity.*;
168+
169+
public class MyApp {
170+
171+
public static void main(String[] args) {
172+
NGram twogram = new NGram(2);
173+
174+
// Should be 0.41666
175+
System.out.println(twogram.distance("ABCD", "ABTUIO"));
176+
}
177+
}
178+
```
179+
158180
## Q-Gram
159181

160182
Q-gram similarity and distance, as defined by Ukkonen in "Approximate string-matching with q-grams and maximal matches"
161183
http://www.sciencedirect.com/science/article/pii/0304397592901434
162184

163-
The distance between two strings is defined as the L1 norm of the difference of their profiles (the number of occurences of each k-shingle). Q-gram distance is a lower bound on Levenshtein distance, but can be computed in O(|A| + |B|), where Levenshtein requires O(|A|.|B|)
185+
The distance between two strings is defined as the L1 norm of the difference of their profiles (the number of occurrences of each n-gram). Q-gram distance is a lower bound on Levenshtein distance, but can be computed in O(|A| + |B|), where Levenshtein requires O(|A|.|B|)
164186

165187
```java
166188
import info.debatty.java.stringsimilarity.*;
@@ -182,27 +204,13 @@ public class MyApp {
182204
}
183205
```
184206

185-
## N-Gram similarity (Kondrak)
186-
187-
N-Gram Similarity as defined by Kondrak, "N-Gram Similarity and Distance", String Processing and Information Retrieval, Lecture Notes in Computer Science Volume 3772, 2005, pp 115-126.
188-
189-
http://webdocs.cs.ualberta.ca/~kondrak/papers/spire05.pdf
190-
191-
The algorithm uses affixing with special character '\n' two increase the weight of first characters. The normalization is achieved by dividing the total similarity score the original length of the longer word.
192-
193-
```java
194-
import info.debatty.java.stringsimilarity.*;
207+
## Cosine similarity
208+
Like Q-Gram similarity, the profile of each input string is first computed (the number of occurrences of each n-gram). The two input strings are thus considered as vectors in the space of n-grams. The similarity between the two strings is the cosine of the angle between these two vectors, and is computed as V1 . V2 / (|V1| * |V2|)
195209

196-
public class MyApp {
197-
198-
public static void main(String[] args) {
199-
NGram twogram = new NGram(2);
200-
201-
// Should be 0.41666
202-
System.out.println(twogram.distance("ABCD", "ABTUIO"));
203-
}
204-
}
205-
```
210+
## Jaccard index
211+
Like Q-Gram similarity, the input strings are first converted into sets of n-grams (sequences of n characters, also called k-shingles), but this time the cardinality of each n-gram is not taken into account. Each input string is simply a set of n-grams. The Jaccard index is then computed as |A inter B| / |A union B|.
206212

213+
## Sorensen-Dice coefficient
214+
Similar to Jaccard index, but this time the similarity is computed as 2 * |A inter B| / (|A| + |B|).
207215

208216

src/main/java/info/debatty/java/stringsimilarity/Cosine.java

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@
2424

2525
package info.debatty.java.stringsimilarity;
2626

27+
import java.util.HashMap;
28+
import java.util.HashSet;
29+
import java.util.Set;
30+
2731
/**
2832
* Implements Cosine Similarity.
2933
* The strings are first transformed in vectors of occurences of k-shingles
@@ -84,14 +88,10 @@ public double similarity(String s1, String s2) {
8488
}
8589

8690
KShingling ks = new KShingling(this.k);
87-
ks.parse(s1);
88-
ks.parse(s2);
89-
90-
int[] v1 = ks.profileOf(s1);
91-
int[] v2 = ks.profileOf(s2);
92-
93-
return dotProduct(v1, v2) / (norm(v1) * norm(v2));
91+
HashMap<String, Integer> profile1 = ks.getProfile(s1);
92+
HashMap<String, Integer> profile2 = ks.getProfile(s2);
9493

94+
return dotProduct(profile1, profile2) / (norm(profile1) * norm(profile2));
9595
}
9696

9797
public double distance(String s1, String s2) {
@@ -100,24 +100,31 @@ public double distance(String s1, String s2) {
100100

101101
/**
102102
* Compute the norm L2 : sqrt(Sum_i( v_i^2))
103-
* @param v
103+
* @param profile
104104
* @return L2 norm
105105
*/
106-
protected static double norm(int[] v) {
106+
protected static double norm(HashMap<String, Integer> profile) {
107107
double agg = 0;
108108

109-
for (int i = 0; i < v.length; i++) {
110-
agg += (v[i] * v[i]);
109+
for (int v : profile.values()) {
110+
agg += v * v;
111111
}
112112

113113
return Math.sqrt(agg);
114114
}
115115

116-
protected static double dotProduct(int[] v1, int[] v2) {
116+
protected static double dotProduct(HashMap<String, Integer> profile1,
117+
HashMap<String, Integer> profile2) {
118+
117119
double agg = 0;
120+
Set<String> union = new HashSet<String>();
121+
union.addAll(profile1.keySet());
122+
union.addAll(profile2.keySet());
118123

119-
for (int i = 0; i < v1.length; i++) {
120-
agg += (v1[i] * v2[i]);
124+
for (String key : union) {
125+
int v1 = profile1.containsKey(key) ? profile1.get(key) : 0;
126+
int v2 = profile2.containsKey(key) ? profile2.get(key) : 0;
127+
agg += v1 * v2;
121128
}
122129

123130
return agg;

src/main/java/info/debatty/java/stringsimilarity/Jaccard.java

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@
2424

2525
package info.debatty.java.stringsimilarity;
2626

27+
import java.util.HashMap;
28+
import java.util.HashSet;
29+
import java.util.Set;
30+
2731
/**
2832
*
2933
* @author Thibault Debatty
@@ -62,26 +66,22 @@ public Jaccard() {
6266

6367
public double similarity(String s1, String s2) {
6468
KShingling ks = new KShingling(this.k);
65-
ks.parse(s1);
66-
ks.parse(s2);
67-
68-
boolean[] v1 = ks.booleanVectorOf(s1);
69-
boolean[] v2 = ks.booleanVectorOf(s2);
69+
return similarity(ks.getProfile(s1), ks.getProfile(s2));
70+
}
71+
72+
public double similarity(HashMap<String,Integer> profile1,
73+
HashMap<String,Integer> profile2) {
74+
Set<String> set1 = profile1.keySet();
75+
Set<String> set2 = profile2.keySet();
7076

71-
int inter = 0;
72-
int union = 0;
73-
for (int i = 0; i < v1.length; i++) {
74-
if (v1[i] || v2[i]) {
75-
union++;
76-
77-
if (v1[i] && v2[i]) {
78-
inter++;
79-
}
80-
}
81-
}
77+
Set union = new HashSet();
78+
union.addAll(set1);
79+
union.addAll(set2);
8280

83-
return (double) inter / union;
81+
Set inter = new HashSet(set1);
82+
inter.retainAll(set2);
8483

84+
return (double) inter.size() / union.size();
8585
}
8686

8787
public double distance(String s1, String s2) {

0 commit comments

Comments
 (0)