tdebatty
diff --git a/‎README.md
Lines changed: 31 additions & 23 deletions b/‎README.md
Lines changed: 31 additions & 23 deletions
diff --git a/‎src/main/java/info/debatty/java/stringsimilarity/Cosine.java
Lines changed: 21 additions & 14 deletions b/‎src/main/java/info/debatty/java/stringsimilarity/Cosine.java
Lines changed: 21 additions & 14 deletions
diff --git a/‎src/main/java/info/debatty/java/stringsimilarity/Jaccard.java
Lines changed: 17 additions & 17 deletions b/‎src/main/java/info/debatty/java/stringsimilarity/Jaccard.java
Lines changed: 17 additions & 17 deletions
@@ -84,10 +84,10 @@ public class MyApp {
     public static void main(String[] args) {
         Damerau d = new Damerau();
 
-        // One substitution
+        // One transposition
         System.out.println(d.absoluteDistance("ABCDEF", "ABDCEF"));
 
-        // Substitution of 2 characters that are far from each other
+        // Transposition of 2 characters that are far from each other
         // => 1 deletion + 1 insertion
         System.out.println(d.absoluteDistance("ABCDEF", "BCDAEF"));
 
@@ -155,12 +155,34 @@ public class MyApp {
 }
 ```
 
+## N-Gram similarity (Kondrak)
+
+N-Gram Similarity as defined by Kondrak, "N-Gram Similarity and Distance", String Processing and Information Retrieval, Lecture Notes in Computer Science Volume 3772, 2005, pp 115-126.
+
+http://webdocs.cs.ualberta.ca/~kondrak/papers/spire05.pdf
+
+The algorithm uses affixing with special character '\n' to increase the weight of first characters. The normalization is achieved by dividing the total similarity score by the original length of the longer word.
+
+```java
+import info.debatty.java.stringsimilarity.*;
+
+public class MyApp {
+
+    public static void main(String[] args) {
+        NGram twogram = new NGram(2);
+
+        // Should be 0.41666
+        System.out.println(twogram.distance("ABCD", "ABTUIO"));
+    }
+}
+```
+
 ## Q-Gram
 
 Q-gram similarity and distance, as defined by Ukkonen in "Approximate string-matching with q-grams and maximal matches"
 http://www.sciencedirect.com/science/article/pii/0304397592901434
 
-The distance between two strings is defined as the L1 norm of the difference of their profiles (the number of occurences of each k-shingle). Q-gram distance is a lower bound on Levenshtein distance, but can be computed in O(|A| + |B|), where Levenshtein requires O(|A|.|B|)
+The distance between two strings is defined as the L1 norm of the difference of their profiles (the number of occurrences of each n-gram). Q-gram distance is a lower bound on Levenshtein distance, but can be computed in O(|A| + |B|), where Levenshtein requires O(|A|.|B|)
 
 ```java
 import info.debatty.java.stringsimilarity.*;
@@ -182,27 +204,13 @@ public class MyApp {
 }
 ```
 
-## N-Gram similarity (Kondrak)
-
-N-Gram Similarity as defined by Kondrak, "N-Gram Similarity and Distance", String Processing and Information Retrieval, Lecture Notes in Computer Science Volume 3772, 2005, pp 115-126.
-
-http://webdocs.cs.ualberta.ca/~kondrak/papers/spire05.pdf
-
-The algorithm uses affixing with special character '\n' two increase the weight of first characters. The normalization is achieved by dividing the total similarity score the original length of the longer word.
-
-```java
-import info.debatty.java.stringsimilarity.*;
+## Cosine similarity
+Like Q-Gram similarity, the profile of each input string is first computed (the number of occurrences of each n-gram). The two input strings are thus considered as vectors in the space of n-grams. The similarity between the two strings is the cosine of the angle between these two vectors, and is computed as V1 . V2 / (|V1| * |V2|).
 
-public class MyApp {
-
-    public static void main(String[] args) {
-        NGram twogram = new NGram(2);
-
-        // Should be 0.41666
-        System.out.println(twogram.distance("ABCD", "ABTUIO"));
-    }
-}
-```
+## Jaccard index
+Like Q-Gram similarity, the input strings are first converted into sets of n-grams (sequences of n characters, also called k-shingles), but this time the cardinality of each n-gram is not taken into account. Each input string is simply a set of n-grams. The Jaccard index is then computed as |A inter B| / |A union B|.
 
+## Sorensen-Dice coefficient
+Similar to the Jaccard index, but this time the similarity is computed as 2 * |A inter B| / (|A| + |B|).
 
 
@@ -24,6 +24,10 @@
 
 package info.debatty.java.stringsimilarity;
 
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Set;
+
 /**
  * Implements Cosine Similarity.
  * The strings are first transformed into vectors of occurrences of k-shingles 
@@ -84,14 +88,10 @@ public double similarity(String s1, String s2) {
         }
 
         KShingling ks = new KShingling(this.k);
-        ks.parse(s1);
-        ks.parse(s2);
-        
-        int[] v1 = ks.profileOf(s1);
-        int[] v2 = ks.profileOf(s2);
-        
-        return dotProduct(v1, v2) / (norm(v1) * norm(v2));
+        HashMap<String, Integer> profile1 = ks.getProfile(s1);
+        HashMap<String, Integer> profile2 = ks.getProfile(s2);
 
+        return dotProduct(profile1, profile2) / (norm(profile1) * norm(profile2));   
     }
 
     public double distance(String s1, String s2) {
@@ -100,24 +100,31 @@ public double distance(String s1, String s2) {
 
     /**
      * Compute the norm L2 : sqrt(Sum_i( v_i^2))
-     * @param v
+     * @param profile
      * @return L2 norm
      */
-    protected static double norm(int[] v) {
+    protected static double norm(HashMap<String, Integer> profile) {
         double agg = 0;
 
-        for (int i = 0; i < v.length; i++) {
-            agg += (v[i] * v[i]);
+        for (int v : profile.values()) {
+            agg += v * v;
         }
 
         return Math.sqrt(agg);
     }
 
-    protected static double dotProduct(int[] v1, int[] v2) {
+    protected static double dotProduct(HashMap<String, Integer> profile1,
+            HashMap<String, Integer> profile2) {
+        
         double agg = 0;
+        Set<String> union = new HashSet<String>();
+        union.addAll(profile1.keySet());
+        union.addAll(profile2.keySet());
 
-        for (int i = 0; i < v1.length; i++) {
-            agg += (v1[i] * v2[i]);
+        for (String key : union) {
+            int v1 = profile1.containsKey(key) ? profile1.get(key) : 0;
+            int v2 = profile2.containsKey(key) ? profile2.get(key) : 0;
+            agg += v1 * v2;
         }
 
         return agg;
 
@@ -24,6 +24,10 @@
 
 package info.debatty.java.stringsimilarity;
 
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Set;
+
 /**
  * 
  * @author Thibault Debatty
@@ -62,26 +66,22 @@ public Jaccard() {
 
     public double similarity(String s1, String s2) {
         KShingling ks = new KShingling(this.k);
-        ks.parse(s1);
-        ks.parse(s2);
-        
-        boolean[] v1 = ks.booleanVectorOf(s1);
-        boolean[] v2 = ks.booleanVectorOf(s2);
+        return similarity(ks.getProfile(s1), ks.getProfile(s2));
+    }
+    
+    public double similarity(HashMap<String,Integer> profile1,
+            HashMap<String,Integer> profile2) {
+        Set<String> set1 = profile1.keySet();
+        Set<String> set2 = profile2.keySet();
 
-        int inter = 0;
-        int union = 0;
-        for (int i = 0; i < v1.length; i++) {
-            if (v1[i] || v2[i]) {
-                union++;
-                
-                if (v1[i] && v2[i]) {
-                    inter++;
-                }
-            }
-        }
+        Set union = new HashSet();
+        union.addAll(set1);
+        union.addAll(set2);
 
-        return (double) inter / union;
+        Set inter = new HashSet(set1);
+        inter.retainAll(set2);
 
+        return (double) inter.size() / union.size();
     }
 
     public double distance(String s1, String s2) {