Repository: commons-text
Updated Branches:
  refs/heads/NEW-METRICS 0404dbf4b -> ff1959c84


Initial implementation of the cosine distance for strings (not sequences)


Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/ff1959c8
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/ff1959c8
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/ff1959c8

Branch: refs/heads/NEW-METRICS
Commit: ff1959c84dce2eac6f2e8432623d2a2a270a5f32
Parents: 0404dbf
Author: Bruno P. Kinoshita <ki...@apache.org>
Authored: Sat Feb 21 22:22:28 2015 -0200
Committer: Bruno P. Kinoshita <ki...@apache.org>
Committed: Sat Feb 21 22:22:28 2015 -0200

----------------------------------------------------------------------
 .../text/similarity/CosineSimilarity.java       | 48 +++++++++++++++++-
 .../text/similarity/CosineSimilarityTest.java   | 51 ++++++++++++++++++++
 2 files changed, 98 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/commons-text/blob/ff1959c8/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java b/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java
index 4589c2d..ca9d087 100644
--- a/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java
+++ b/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java
@@ -16,6 +16,52 @@
  */
 package org.apache.commons.text.similarity;
 
-public class CosineSimilarity {
+/**
+ * <p>Measures the Cosine similarity of two CharSequences. It treats the
+ * CharSequences as two vectors of an inner product space, where the numeric
+ * value of the character at each index is the component for that dimension,
+ * and compares the angle between them.</p>
+ *
+ * <p>
+ * For further explanation about the Cosine Similarity, take a look at its
+ * Wikipedia page at http://en.wikipedia.org/wiki/Cosine_similarity.
+ * </p>
+ *
+ * @since 0.1
+ */
+public class CosineSimilarity implements StringMetric<Double> {
+
+    /**
+     * Computes the cosine similarity of two CharSequences.
+     *
+     * <p>The dot product only covers the indexes present in both inputs
+     * (see {@link #dot(CharSequence, CharSequence)}), while each magnitude
+     * is computed over the whole of its own input.</p>
+     *
+     * @param left left string, must not be {@code null}
+     * @param right right string, must not be {@code null}
+     * @return the cosine similarity, or {@code 0.0} when either input has a
+     *         zero magnitude (it is empty or contains only NUL characters),
+     *         in which case the angle is undefined
+     * @throws IllegalArgumentException if either parameter is {@code null}
+     */
+    @Override
+    public Double compare(CharSequence left, CharSequence right) {
+        if (left == null || right == null) {
+            throw new IllegalArgumentException(
+                    "String parameters must not be null");
+        }
+        final double leftNorm = Math.sqrt(sumOfSquares(left));
+        final double rightNorm = Math.sqrt(sumOfSquares(right));
+        // A zero magnitude would make the division below yield NaN.
+        if (leftNorm == 0.0d || rightNorm == 0.0d) {
+            return 0.0d;
+        }
+        return dot(left, right) / (leftNorm * rightNorm);
+    }
+
+    /**
+     * Computes the dot product of two CharSequences. It ignores remaining
+     * characters. It means that if a string is longer than other, then a
+     * smaller part of it will be used to compute the dot product.
+     *
+     * @param left left string
+     * @param right right string
+     * @return the dot product
+     */
+    protected long dot(CharSequence left, CharSequence right) {
+        final int overlap = Math.min(left.length(), right.length());
+        long dotProduct = 0;
+        for (int i = 0; i < overlap; ++i) {
+            // Widen before multiplying: the product of two chars can
+            // exceed Integer.MAX_VALUE and would wrap in int arithmetic.
+            dotProduct += (long) left.charAt(i) * right.charAt(i);
+        }
+        return dotProduct;
+    }
+
+    /**
+     * Sums the squared numeric values of every character in a sequence.
+     *
+     * @param sequence the characters to accumulate
+     * @return the sum of squares, {@code 0.0} for an empty sequence
+     */
+    private static double sumOfSquares(CharSequence sequence) {
+        double sum = 0.0d;
+        for (int i = 0; i < sequence.length(); ++i) {
+            final double value = sequence.charAt(i);
+            sum += value * value;
+        }
+        return sum;
+    }
 
 }

http://git-wip-us.apache.org/repos/asf/commons-text/blob/ff1959c8/src/test/java/org/apache/commons/text/similarity/CosineSimilarityTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/text/similarity/CosineSimilarityTest.java b/src/test/java/org/apache/commons/text/similarity/CosineSimilarityTest.java
new file mode 100644
index 0000000..aa08057
--- /dev/null
+++ b/src/test/java/org/apache/commons/text/similarity/CosineSimilarityTest.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import static org.junit.Assert.assertEquals;
+
+import java.math.BigDecimal;
+import java.math.RoundingMode;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Unit tests for {@link org.apache.commons.text.similarity.CosineSimilarity}.
+ */
+public class CosineSimilarityTest {
+
+    private static CosineSimilarity cosineSimilarity;
+
+    @BeforeClass
+    public static void setUp() {
+        cosineSimilarity = new CosineSimilarity();
+    }
+
+    /**
+     * Checks the similarity of a string against its own prefix (in both
+     * argument orders) and against itself, rounded to two decimal places.
+     */
+    @Test
+    public void testCosineSimilarity() {
+        assertEquals(Double.valueOf(0.62d),
+                roundValue(cosineSimilarity.compare("ABCDE", "AB")));
+        assertEquals(Double.valueOf(0.62d),
+                roundValue(cosineSimilarity.compare("AB", "ABCDE")));
+        assertEquals(Double.valueOf(1.00d),
+                roundValue(cosineSimilarity.compare("AB", "AB")));
+    }
+
+    /** A null left argument must be rejected. */
+    @Test(expected = IllegalArgumentException.class)
+    public void testCosineSimilarityNullLeft() {
+        cosineSimilarity.compare(null, "AB");
+    }
+
+    /** A null right argument must be rejected. */
+    @Test(expected = IllegalArgumentException.class)
+    public void testCosineSimilarityNullRight() {
+        cosineSimilarity.compare("AB", null);
+    }
+
+    // --- Utility methods
+
+    /**
+     * Rounds a value half-up to two decimal places so that results can be
+     * compared against short literal expectations.
+     *
+     * @param value the value to round
+     * @return the rounded value
+     */
+    private Double roundValue(Double value) {
+        return BigDecimal.valueOf(value)
+                .setScale(2, RoundingMode.HALF_UP)
+                .doubleValue();
+    }
+
+}

Reply via email to