Repository: commons-text
Updated Branches:
  refs/heads/NEW-METRICS 0404dbf4b -> ff1959c84
Initial implementation of the cosine distance for strings (not sequences)

Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/ff1959c8
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/ff1959c8
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/ff1959c8

Branch: refs/heads/NEW-METRICS
Commit: ff1959c84dce2eac6f2e8432623d2a2a270a5f32
Parents: 0404dbf
Author: Bruno P. Kinoshita <ki...@apache.org>
Authored: Sat Feb 21 22:22:28 2015 -0200
Committer: Bruno P. Kinoshita <ki...@apache.org>
Committed: Sat Feb 21 22:22:28 2015 -0200

----------------------------------------------------------------------
 .../text/similarity/CosineSimilarity.java | 48 +++++++++++++++++-
 .../text/similarity/CosineSimilarityTest.java | 51 ++++++++++++++++++++
 2 files changed, 98 insertions(+), 1 deletion(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/commons-text/blob/ff1959c8/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java b/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java
index 4589c2d..ca9d087 100644
--- a/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java
+++ b/src/main/java/org/apache/commons/text/similarity/CosineSimilarity.java
@@ -16,6 +16,52 @@
  */
 package org.apache.commons.text.similarity;
 
-public class CosineSimilarity {
+/**
+ * <p>Measures the Cosine similarity of two CharSequences. It treats the CharSequences as
+ * two vectors of an inner product space and compares the angle between them.</p>
+ *
+ * <p>
+ * For further explanation about the Cosine Similarity, take a look at its
+ * Wikipedia page at http://en.wikipedia.org/wiki/Cosine_similarity.
+ * </p>
+ *
+ * @since 0.1
+ */
+public class CosineSimilarity implements StringMetric<Double> {
+
+    @Override
+    public Double compare(CharSequence left, CharSequence right) {
+        if (left == null || right == null) {
+            throw new IllegalArgumentException("String parameters must not be null");
+        }
+        long dotProduct = dot(left, right);
+        double d1 = 0.0d;
+        for (int i = 0; i < left.length(); ++i) {
+            d1 += Math.pow(((int) left.charAt(i)), 2);
+        }
+        double d2 = 0.0d;
+        for (int i = 0; i < right.length(); ++i) {
+            d2 += Math.pow(((int) right.charAt(i)), 2);
+        }
+        double cosineSimilarity = dotProduct / (double) (Math.sqrt(d1) * Math.sqrt(d2));
+        return cosineSimilarity;
+    }
+
+    /**
+     * Computes the dot product of two CharSequences. It ignores remaining characters. It means
+     * that if a string is longer than other, then a smaller part of it will be used to compute
+     * the dot product.
+     *
+     * @param left left string
+     * @param right right string
+     * @return the dot product
+     */
+    protected long dot(CharSequence left, CharSequence right) {
+        long dotProduct = 0;
+        for (int i = 0; i < left.length() && i < right.length(); ++i) {
+            dotProduct += (((int) left.charAt(i)) * ((int) right.charAt(i)));
+        }
+        return dotProduct;
+    }
 
 }

http://git-wip-us.apache.org/repos/asf/commons-text/blob/ff1959c8/src/test/java/org/apache/commons/text/similarity/CosineSimilarityTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/text/similarity/CosineSimilarityTest.java b/src/test/java/org/apache/commons/text/similarity/CosineSimilarityTest.java
new file mode 100644
index 0000000..aa08057
--- /dev/null
+++ b/src/test/java/org/apache/commons/text/similarity/CosineSimilarityTest.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import static org.junit.Assert.assertEquals;
+
+import java.math.BigDecimal;
+import java.math.RoundingMode;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Unit tests for {@link org.apache.commons.text.similarity.CosineSimilarity}.
+ */
+public class CosineSimilarityTest {
+
+    private static CosineSimilarity cosineSimilarity;
+
+    @BeforeClass
+    public static void setUp() {
+        cosineSimilarity = new CosineSimilarity();
+    }
+
+    @Test
+    public void testCosineSimilarity() {
+        assertEquals(Double.valueOf(0.62d), roundValue(cosineSimilarity.compare("ABCDE", "AB")));
+        assertEquals(Double.valueOf(1.00d), roundValue(cosineSimilarity.compare("AB", "AB")));
+    }
+
+    // --- Utility methods
+
+    private Double roundValue(Double value) {
+        return (Double) new BigDecimal(value).setScale(2, RoundingMode.HALF_UP).doubleValue();
+    }
+
+}