Skip to content

Commit d595d11

Browse files
committed
fixes issue #9 (int multiplication overflow)
1 parent ee2b6e5 commit d595d11

File tree

4 files changed

+63410
-10
lines changed

4 files changed

+63410
-10
lines changed

src/main/java/info/debatty/java/stringsimilarity/Cosine.java

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -49,13 +49,19 @@ public Cosine(final int k) {
4949
}
5050

5151
/**
52-
*
52+
* Create a Cosine instance by invoking the superclass no-argument constructor.
5353
*/
5454
public Cosine() {
5555
super();
5656
}
5757

58-
public double similarity(String s1, String s2) {
58+
/**
59+
* Compute the cosine similarity between strings.
60+
* @param s1
61+
* @param s2
62+
* @return
63+
*/
64+
public final double similarity(final String s1, final String s2) {
5965

6066
if (s1.length() < k || s2.length() < k) {
6167
return 0;
@@ -64,7 +70,8 @@ public double similarity(String s1, String s2) {
6470
int[] profile1 = ks.getArrayProfile(s1);
6571
int[] profile2 = ks.getArrayProfile(s2);
6672

67-
return dotProduct(profile1, profile2) / (norm(profile1) * norm(profile2));
73+
return dotProduct(profile1, profile2)
74+
/ (norm(profile1) * norm(profile2));
6875
}
6976

7077
/**
@@ -73,29 +80,39 @@ public double similarity(String s1, String s2) {
7380
* @param profile
7481
* @return L2 norm
7582
*/
76-
protected static double norm(int[] profile) {
83+
protected static double norm(final int[] profile) {
7784
double agg = 0;
7885

7986
for (int v : profile) {
80-
agg += v * v;
87+
agg += 1.0 * v * v;
8188
}
8289

8390
return Math.sqrt(agg);
8491
}
8592

86-
protected static double dotProduct(int[] profile1, int[] profile2) {
93+
protected static double dotProduct(
94+
final int[] profile1, final int[] profile2) {
95+
96+
// Make a copy to ensure both profiles have the same size
97+
// this is actually quite dirty and should be corrected!
8798
int length = Math.max(profile1.length, profile2.length);
88-
profile1 = java.util.Arrays.copyOf(profile1, length);
89-
profile2 = java.util.Arrays.copyOf(profile2, length);
99+
int[] copy1 = java.util.Arrays.copyOf(profile1, length);
100+
int[] copy2 = java.util.Arrays.copyOf(profile2, length);
90101

91102
double agg = 0;
92103
for (int i = 0; i < length; i++) {
93-
agg += profile1[i] * profile2[i];
104+
agg += 1.0 * copy1[i] * copy2[i];
94105
}
95106
return agg;
96107
}
97108

98-
public double distance(String s1, String s2) {
109+
/**
110+
* Return 1.0 - similarity.
111+
* @param s1
112+
* @param s2
113+
* @return
114+
*/
115+
public double distance(final String s1, final String s2) {
99116
return 1.0 - similarity(s1, s2);
100117
}
101118

src/test/java/info/debatty/java/stringsimilarity/CosineTest.java

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,12 @@
2424

2525
package info.debatty.java.stringsimilarity;
2626

27+
import java.io.BufferedReader;
28+
import java.io.File;
29+
import java.io.FileReader;
30+
import java.io.IOException;
31+
import java.io.InputStream;
32+
import java.io.InputStreamReader;
2733
import org.junit.Test;
2834
import static org.junit.Assert.*;
2935

@@ -54,4 +60,38 @@ public final void testSmallString() {
5460
double result = instance.similarity("AB", "ABCE");
5561
assertEquals(0.0, result, 0.00001);
5662
}
63+
64+
@Test
65+
public final void testLargeString() throws IOException {
66+
67+
System.out.println("Test with large strings");
68+
Cosine cos = new Cosine();
69+
70+
// read from 2 text files
71+
String string1 = readResourceFile("71816-2.txt");
72+
String string2 = readResourceFile("11328-1.txt");
73+
double similarity = cos.similarity(string1, string2);
74+
75+
assertEquals(0.8115, similarity, 0.001);
76+
}
77+
78+
private static String readResourceFile(String file) throws IOException {
79+
80+
InputStream stream = Thread.currentThread()
81+
.getContextClassLoader()
82+
.getResourceAsStream(file);
83+
84+
BufferedReader reader = new BufferedReader(new InputStreamReader(stream));
85+
StringBuilder string_builder = new StringBuilder();
86+
String ls = System.getProperty("line.separator");
87+
String line = null;
88+
89+
while (( line = reader.readLine() ) != null ) {
90+
string_builder.append(line);
91+
string_builder.append(ls);
92+
}
93+
94+
string_builder.deleteCharAt(string_builder.length() - 1);
95+
return string_builder.toString();
96+
}
5797
}

0 commit comments

Comments
 (0)