Repository: commons-text Updated Branches: refs/heads/master a5ac07106 -> 6280d46c5
[SANDBOX-492] Create StringDistanceFrom class that contains a StringMetric and the "left" side string. This would have a method that accepts the "right" side string to test. This fixes #3 from GitHub. Thanks to Jonathan Baker. Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/6280d46c Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/6280d46c Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/6280d46c Branch: refs/heads/master Commit: 6280d46c5d5e87e0491573efa969fcf4a6397cd7 Parents: a5ac071 Author: j--baker <j--ba...@users.noreply.github.com> Authored: Wed Mar 4 15:20:18 2015 -0500 Committer: Bruno P. Kinoshita <brunodepau...@yahoo.com.br> Committed: Mon Apr 13 00:48:45 2015 +1200 ---------------------------------------------------------------------- src/changes/changes.xml | 1 + .../text/similarity/StringMetricFrom.java | 111 +++++++++++++++++++ .../ParameterizedStringMetricFromTest.java | 92 +++++++++++++++ .../text/similarity/StringMetricFromTest.java | 71 ++++++++++++ 4 files changed, 275 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/commons-text/blob/6280d46c/src/changes/changes.xml ---------------------------------------------------------------------- diff --git a/src/changes/changes.xml b/src/changes/changes.xml index 4ac588a..45ac073 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -22,6 +22,7 @@ <body> <release version="1.0" date="tba" description="tba"> + <action issue="SANDBOX-492" type="fix" dev="kinow" due-to="Jonathan Baker">Create StringDistanceFrom class that contains a StringMetric and the "left" side string. 
This would have a method that accepts the "right" side string to test.</action> <action issue="SANDBOX-490" type="add" dev="kinow">Add Cosine Similarity and Cosine Distance</action> <action issue="SANDBOX-493" type="fix" dev="kinow" due-to="Jonathan Baker">Change (R) StringMetric.compare(CS left, CS right) to "apply" so that it is consistent with BiFunction.</action> <action issue="SANDBOX-491" type="fix" dev="kinow" due-to="Jonathan Baker">Allow extra information (e.g. Levenshtein threshold) to be stored as (final) fields in the StringMetric instance.</action> http://git-wip-us.apache.org/repos/asf/commons-text/blob/6280d46c/src/main/java/org/apache/commons/text/similarity/StringMetricFrom.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/StringMetricFrom.java b/src/main/java/org/apache/commons/text/similarity/StringMetricFrom.java new file mode 100644 index 0000000..3b2a871 --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/StringMetricFrom.java @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.commons.text.similarity; + +/** + * <p> + * This stores a {@link StringMetric} implementation and a {@link CharSequence} "left" string. + * The {@link #apply(CharSequence right)} method accepts the "right" string and invokes the + * comparison function for the pair of strings. + * </p> + * + * <p> + * The following is an example which finds the most similar string: + * </p> + * <pre> + * StringMetric<Integer> metric = new LevenshteinDistance(); + * String target = "Apache"; + * StringMetricFrom<Integer> metricFrom = + * new StringMetricFrom<Integer>(metric, target); + * String mostSimilar = null; + * Integer shortestDistance = null; + * + * for (String test : new String[] { "Appaloosa", "a patchy", "apple" }) { + * Integer distance = metricFrom.apply(test); + * if (shortestDistance == null || distance < shortestDistance) { + * shortestDistance = distance; + * mostSimilar = test; + * } + * } + * + * System.out.println("The string most similar to \"" + target + "\" " + * + "is \"" + mostSimilar + "\" because " + * + "its distance is only " + shortestDistance + "."); + * </pre> + * + * @param <R> This is the type of similarity score used by the StringMetric function. + */ +public class StringMetricFrom<R> { + + /** + * String metric. + */ + private final StringMetric<R> metric; + /** + * Left parameter used in distance function. + */ + private final CharSequence left; + + /** + * <p>This accepts the metric implementation and the "left" string.</p> + * + * @param metric This may not be null. + * @param left This may be null here, + * but the StringMetric#apply(CharSequence left, CharSequence right) + * implementation may not accept nulls. 
+ */ + public StringMetricFrom(final StringMetric<R> metric, final CharSequence left) { + if (metric == null) { + throw new IllegalArgumentException("The metric may not be null."); + } + + this.metric = metric; + this.left = left; + } + + /** + * <p> + * This compares "left" field against the "right" parameter + * using the "metric" implementation. + * </p> + * + * @param right the second CharSequence + * @return the similarity score between two CharSequences + */ + public R apply(CharSequence right) { + return metric.apply(left, right); + } + + /** + * Gets the left parameter. + * + * @return the left parameter + */ + public CharSequence getLeft() { + return left; + } + + /** + * Gets the metric. + * + * @return the metric + */ + public StringMetric<R> getMetric() { + return metric; + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/6280d46c/src/test/java/org/apache/commons/text/similarity/ParameterizedStringMetricFromTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/commons/text/similarity/ParameterizedStringMetricFromTest.java b/src/test/java/org/apache/commons/text/similarity/ParameterizedStringMetricFromTest.java new file mode 100644 index 0000000..36c03bb --- /dev/null +++ b/src/test/java/org/apache/commons/text/similarity/ParameterizedStringMetricFromTest.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +import static org.hamcrest.core.IsEqual.equalTo; +import static org.junit.Assert.assertThat; + +import java.util.Arrays; + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; + +/** + * Unit tests for {@link org.apache.commons.text.similarity.StringMetricFrom}. + * + * @param <R> The {@link StringMetric} return type. + */ +@RunWith(Parameterized.class) +public class ParameterizedStringMetricFromTest<R> { + + private final StringMetric<R> metric; + private final CharSequence left; + private final CharSequence right; + private final R distance; + + public ParameterizedStringMetricFromTest( + final StringMetric<R> metric, + final CharSequence left, final CharSequence right, + final R distance) { + + this.metric = metric; + this.left = left; + this.right = right; + this.distance = distance; + } + + @Parameters + public static Iterable<Object[]> parameters() { + return Arrays.asList( new Object[][] { + + /* TODO: When SANDBOX-491 is ready, add a few FuzzyScore tests. 
*/ + + { new HammingDistance(), "Sam I am.", "Ham I am.", 1 }, + { new HammingDistance(), "Japtheth, Ham, Shem", "Japtheth, HAM, Shem", 2 }, + { new HammingDistance(), "Hamming", "Hamming", 0 }, + + { new JaroWrinklerDistance(), "elephant", "hippo", 0.44 }, + { new JaroWrinklerDistance(), "hippo", "elephant", 0.44 }, + { new JaroWrinklerDistance(), "hippo", "zzzzzzzz", 0.0 }, + + /* TODO: When SANDBOX-491 is ready, add a few limited/threshold tests. */ + { new LevenshteinDistance(), "Apache", "a patchy", 4 }, + { new LevenshteinDistance(), "go", "no go", 3 }, + { new LevenshteinDistance(), "go", "go", 0 }, + + { + new StringMetric<Boolean>() { + public Boolean apply(CharSequence left, CharSequence right) { + return left == right || (left != null && left.equals(right)); + } + }, + "Bob's your uncle.", + "Every good boy does fine.", + false + } + + } ); + } + + @Test + public void test() { + StringMetricFrom<R> metricFrom = new StringMetricFrom<R>(metric, left); + assertThat(metricFrom.apply(right), equalTo(distance)); + } + +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/6280d46c/src/test/java/org/apache/commons/text/similarity/StringMetricFromTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/commons/text/similarity/StringMetricFromTest.java b/src/test/java/org/apache/commons/text/similarity/StringMetricFromTest.java new file mode 100644 index 0000000..e268366 --- /dev/null +++ b/src/test/java/org/apache/commons/text/similarity/StringMetricFromTest.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +import static org.hamcrest.core.IsEqual.equalTo; +import static org.junit.Assert.assertThat; + +import org.junit.Test; + +/** + * Unit tests for {@link org.apache.commons.text.similarity.StringMetricFrom}. + */ +public class StringMetricFromTest { + + @Test + public void testEquivalence() { + StringMetric<Integer> metric = new LevenshteinDistance(); + String left = "Apache"; + String right = "a patchy"; + Integer distance = 4; + StringMetricFrom<Integer> metricFrom = new StringMetricFrom<Integer>(metric, left); + + assertThat(metricFrom.apply(right), equalTo(distance)); + assertThat(metricFrom.apply(right), equalTo(metric.apply(left, right))); + } + + @Test + public void testJavadocExample() { + StringMetric<Integer> metric = new LevenshteinDistance(); + String target = "Apache"; + StringMetricFrom<Integer> metricFrom = + new StringMetricFrom<Integer>(metric, target); + String mostSimilar = null; + Integer shortestDistance = null; + + for (String test : new String[] { "Appaloosa", "a patchy", "apple" }) { + Integer distance = metricFrom.apply(test); + if (shortestDistance == null || distance < shortestDistance) { + shortestDistance = distance; + mostSimilar = test; + } + } + + System.out.println("The string most similar to \"" + target + "\" " + + "is \"" + mostSimilar + "\" because " + + "its distance is only " + shortestDistance + "."); + + assertThat(mostSimilar, equalTo("a patchy")); + assertThat(shortestDistance, equalTo(4)); + } + + @Test(expected = IllegalArgumentException.class) + public void 
testMissingMetric() { + new StringMetricFrom<Number>(null, "no go"); + } + +}