Repository: commons-text Updated Branches: refs/heads/master d9d330542 -> 75cdc00af
SANDBOX-491: Allow extra information (e.g. Levenshtein threshold) to be stored as (final) fields in the StringMetric instance. This fixes #1 from github. Thanks to Jonathan Baker. Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/75cdc00a Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/75cdc00a Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/75cdc00a Branch: refs/heads/master Commit: 75cdc00afc674aca39a4fca4c1745d029da43648 Parents: d9d3305 Author: Bruno P. Kinoshita <brunodepau...@yahoo.com.br> Authored: Thu Mar 19 22:28:44 2015 -0300 Committer: Bruno P. Kinoshita <brunodepau...@yahoo.com.br> Committed: Thu Mar 19 22:30:02 2015 -0300 ---------------------------------------------------------------------- src/changes/changes.xml | 1 + .../commons/text/similarity/FuzzyScore.java | 36 +-- .../text/similarity/LevenshteinDistance.java | 229 +++++++++++++++---- .../commons/text/similarity/FuzzyScoreTest.java | 47 ++-- .../similarity/JaroWrinklerDistanceTest.java | 2 +- .../similarity/LevenshteinDistanceTest.java | 113 ++------- .../ParameterizedLevenshteinDistanceTest.java | 125 ++++++++++ 7 files changed, 372 insertions(+), 181 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/commons-text/blob/75cdc00a/src/changes/changes.xml ---------------------------------------------------------------------- diff --git a/src/changes/changes.xml b/src/changes/changes.xml index 2f87c53..fa6417f 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -22,6 +22,7 @@ <body> <release version="1.0" date="tba" description="tba"> + <action issue="SANDBOX-491" type="fix" dev="kinow" due-to="Jonathan Baker">Allow extra information (e.g. Levenshtein threshold) to be stored as (final) fields in the StringMetric instance.</action> <action issue="SANDBOX-486" type="add" dev="kinow">Port Myers algorithm from [collections]</action> <action issue="SANDBOX-485" type="add" dev="kinow">Add Hamming distance</action> <action issue="SANDBOX-483" type="add" dev="kinow" due-to="britter">Incorporate String algorithms from Commons Lang</action> http://git-wip-us.apache.org/repos/asf/commons-text/blob/75cdc00a/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java b/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java index 3e72d05..3cf6df2 100644 --- a/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java +++ b/src/main/java/org/apache/commons/text/similarity/FuzzyScore.java @@ -33,21 +33,22 @@ import java.util.Locale; */ public class FuzzyScore implements StringMetric<Integer> { + private final Locale locale; + + /** - * <p> - * Find the Fuzzy Score which indicates the similarity score between two - * Strings. This method uses the default locale. - * </p> + * <p>This returns a {@link Locale}-specific {@link FuzzyScore}.</p> * - * @param term a full term that should be matched against, must not be null - * @param query the query that will be matched against a term, must not be - * null - * @return result score - * @throws IllegalArgumentException if either String input {@code null} + * @param locale The string matching logic is case insensitive. + A {@link Locale} is necessary to normalize both Strings to lower case. + * @throws IllegalArgumentException + * This is thrown if the {@link Locale} parameter is {@code null}. */ - @Override - public Integer compare(CharSequence term, CharSequence query) { - return compare(term, query, Locale.getDefault()); + public FuzzyScore(final Locale locale) { + if (locale == null) { + throw new IllegalArgumentException("Locale must not be null"); + } + this.locale = locale; } /** @@ -70,17 +71,14 @@ public class FuzzyScore implements StringMetric<Integer> { * @param term a full term that should be matched against, must not be null * @param query the query that will be matched against a term, must not be * null - * @param locale This string matching logic is case insensitive. A locale is - * necessary to normalize both Strings to lower case. * @return result score * @throws IllegalArgumentException if either String input {@code null} or * Locale input {@code null} */ - public Integer compare(CharSequence term, CharSequence query, Locale locale) { + @Override + public Integer compare(CharSequence term, CharSequence query) { if (term == null || query == null) { throw new IllegalArgumentException("Strings must not be null"); - } else if (locale == null) { - throw new IllegalArgumentException("Locale must not be null"); } // fuzzy logic is case insensitive. We normalize the Strings to lower @@ -130,4 +128,8 @@ public class FuzzyScore implements StringMetric<Integer> { return score; } + public Locale getLocale() { + return locale; + } + } http://git-wip-us.apache.org/repos/asf/commons-text/blob/75cdc00a/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java index 4bc2e8a..38e5f14 100644 --- a/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java +++ b/src/main/java/org/apache/commons/text/similarity/LevenshteinDistance.java @@ -33,27 +33,52 @@ import java.util.Arrays; */ public class LevenshteinDistance implements StringMetric<Integer> { + private static final LevenshteinDistance DEFAULT_INSTANCE = new LevenshteinDistance(); + + private final Integer threshold; + /** - * Find the Levenshtein distance between two Strings. - * - * <p>A higher score indicates a greater distance.</p> - * * <p> - * The previous implementation of the Levenshtein distance algorithm was - * from <a - * href="http://www.merriampark.com/ld.htm">http://www.merriampark.com - * /ld.htm</a> + * This returns the default instance that uses a version + * of the algorithm that does not use a threshold parameter. * </p> * + * @see {@link #getDefaultInstance()} + */ + public LevenshteinDistance() { + this(null); + } + + /** * <p> - * Chas Emerick has written an implementation in Java, which avoids an - * OutOfMemoryError which can occur when my Java implementation is used with - * very large strings.<br> - * This implementation of the Levenshtein distance algorithm is from <a - * href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ - * ldjava.htm</a> + * If the threshold is not null, distance calculations will be limited to a maximum length. + * If the threshold is null, the unlimited version of the algorithm will be used. * </p> * + * @param threshold + * If this is null then distances calculations will not be limited. + * This may not be negative. + */ + public LevenshteinDistance(final Integer threshold) { + if (threshold != null && threshold < 0) { + throw new IllegalArgumentException("Threshold must not be negative"); + } + this.threshold = threshold; + } + + /** + * <p>Find the Levenshtein distance between two Strings.</p> + * + * <p>A higher score indicates a greater distance.</p> + * + * <p>The previous implementation of the Levenshtein distance algorithm + * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p> + * + * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError + * which can occur when my Java implementation is used with very large strings.<br> + * This implementation of the Levenshtein distance algorithm + * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p> + * * <pre> * distance.compare(null, *) = IllegalArgumentException * distance.compare(*, null) = IllegalArgumentException @@ -70,12 +95,23 @@ public class LevenshteinDistance implements StringMetric<Integer> { * * @param left the first string, must not be null * @param right the second string, must not be null - * @return result distance + * @return result distance, or -1 * @throws IllegalArgumentException if either String input {@code null} */ - @Override public Integer compare(CharSequence left, CharSequence right) { - return compare(left, right, Integer.MAX_VALUE); + if (threshold != null) { + return limitedCompare(left, right, threshold); + } else { + return unlimitedCompare(left, right); + } + } + + public static LevenshteinDistance getDefaultInstance() { + return DEFAULT_INSTANCE; + } + + public Integer getThreshold() { + return threshold; } /** @@ -91,27 +127,25 @@ public class LevenshteinDistance implements StringMetric<Integer> { * </p> * * <pre> - * distance.compare(null, *, *) = IllegalArgumentException - * distance.compare(*, null, *) = IllegalArgumentException - * distance.compare(*, *, -1) = IllegalArgumentException - * distance.compare("","", 0) = 0 - * distance.compare("aaapppp", "", 8) = 7 - * distance.compare("aaapppp", "", 7) = 7 - * distance.compare("aaapppp", "", 6)) = -1 - * distance.compare("elephant", "hippo", 7) = 7 - * distance.compare("elephant", "hippo", 6) = -1 - * distance.compare("hippo", "elephant", 7) = 7 - * distance.compare("hippo", "elephant", 6) = -1 + * limitedCompare(null, *, *) = IllegalArgumentException + * limitedCompare(*, null, *) = IllegalArgumentException + * limitedCompare(*, *, -1) = IllegalArgumentException + * limitedCompare("","", 0) = 0 + * limitedCompare("aaapppp", "", 8) = 7 + * limitedCompare("aaapppp", "", 7) = 7 + * limitedCompare("aaapppp", "", 6)) = -1 + * limitedCompare("elephant", "hippo", 7) = 7 + * limitedCompare("elephant", "hippo", 6) = -1 + * limitedCompare("hippo", "elephant", 7) = 7 + * limitedCompare("hippo", "elephant", 6) = -1 * </pre> * * @param left the first string, must not be null * @param right the second string, must not be null * @param threshold the target threshold, must not be negative - * @return result distance - * @throws IllegalArgumentException if either String input {@code null} or - * negative threshold + * @return result distance, or -1 */ - public Integer compare(CharSequence left, CharSequence right, int threshold) { + private static int limitedCompare(CharSequence left, CharSequence right, int threshold) { if (left == null || right == null) { throw new IllegalArgumentException("Strings must not be null"); } @@ -124,7 +158,7 @@ public class LevenshteinDistance implements StringMetric<Integer> { * equal to the threshold value, returning -1 if it's greater. The * advantage is performance: unbounded distance is O(nm), but a bound of * k allows us to reduce it to O(km) time by only computing a diagonal - * stripe of width 2k + 1 of the cost table. It is also possible to use + * stripe of width 2k + 1 of the cost table. It is also possible to use* this to compute the unbounded Levenshtein distance by starting the * this to compute the unbounded Levenshtein distance by starting the * threshold at 1 and doubling each time until the distance is found; * this is O(dm), where d is the distance. @@ -143,8 +177,16 @@ public class LevenshteinDistance implements StringMetric<Integer> { * and our threshold is 1. In this case we're going to walk a stripe of * length 3. The matrix would look like so: * - * 1 2 3 4 5 1 |#|#| | | | 2 |#|#|#| | | 3 | |#|#|#| | 4 | | |#|#|#| 5 | - * | | |#|#| 6 | | | | |#| 7 | | | | | | + * <pre> + * 1 2 3 4 5 + * 1 |#|#| | | | + * 2 |#|#|#| | | + * 3 | |#|#|#| | + * 4 | | |#|#|#| + * 5 | | | |#|#| + * 6 | | | | |#| + * 7 | | | | | | + * </pre> * * Note how the stripe leads off the table as there is no possible way * to turn a string of length 5 into one of length 7 in edit distance of @@ -161,8 +203,8 @@ public class LevenshteinDistance implements StringMetric<Integer> { * some discussion. */ - int n = left.length(); // length of s - int m = right.length(); // length of t + int n = left.length(); // length of left + int m = right.length(); // length of right // if one string is empty, the edit distance is necessarily the length // of the other @@ -197,7 +239,7 @@ public class LevenshteinDistance implements StringMetric<Integer> { // iterates through t for (int j = 1; j <= m; j++) { - final char t_j = right.charAt(j - 1); // jth character of t + final char right_j = right.charAt(j - 1); // jth character of right d[0] = j; // compute stripe indices, constrain to array size @@ -218,7 +260,7 @@ public class LevenshteinDistance implements StringMetric<Integer> { // iterates through [min, max] in s for (int i = min; i <= max; i++) { - if (left.charAt(i - 1) == t_j) { + if (left.charAt(i - 1) == right_j) { // diagonally left and up d[i] = p[i - 1]; } else { @@ -243,4 +285,113 @@ public class LevenshteinDistance implements StringMetric<Integer> { return -1; } + /** + * <p>Find the Levenshtein distance between two Strings.</p> + * + * <p>A higher score indicates a greater distance.</p> + * + * <p>The previous implementation of the Levenshtein distance algorithm + * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p> + * + * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError + * which can occur when my Java implementation is used with very large strings.<br> + * This implementation of the Levenshtein distance algorithm + * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p> + * + * <pre> + * unlimitedCompare(null, *) = IllegalArgumentException + * unlimitedCompare(*, null) = IllegalArgumentException + * unlimitedCompare("","") = 0 + * unlimitedCompare("","a") = 1 + * unlimitedCompare("aaapppp", "") = 7 + * unlimitedCompare("frog", "fog") = 1 + * unlimitedCompare("fly", "ant") = 3 + * unlimitedCompare("elephant", "hippo") = 7 + * unlimitedCompare("hippo", "elephant") = 7 + * unlimitedCompare("hippo", "zzzzzzzz") = 8 + * unlimitedCompare("hello", "hallo") = 1 + * </pre> + * + * @param left the first String, must not be null + * @param right the second String, must not be null + * @return result distance, or -1 + * @throws IllegalArgumentException if either String input {@code null} + */ + private static int unlimitedCompare(CharSequence left, CharSequence right) { + if (left == null || right == null) { + throw new IllegalArgumentException("Strings must not be null"); + } + + /* + The difference between this impl. and the previous is that, rather + than creating and retaining a matrix of size s.length() + 1 by t.length() + 1, + we maintain two single-dimensional arrays of length s.length() + 1. The first, d, + is the 'current working' distance array that maintains the newest distance cost + counts as we iterate through the characters of String s. Each time we increment + the index of String t we are comparing, d is copied to p, the second int[]. Doing so + allows us to retain the previous cost counts as required by the algorithm (taking + the minimum of the cost count to the left, up one, and diagonally up and to the left + of the current cost count being calculated). (Note that the arrays aren't really + copied anymore, just switched...this is clearly much better than cloning an array + or doing a System.arraycopy() each time through the outer loop.) + + Effectively, the difference between the two implementations is this one does not + cause an out of memory condition when calculating the LD over two very large strings. + */ + + int n = left.length(); // length of left + int m = right.length(); // length of right + + if (n == 0) { + return m; + } else if (m == 0) { + return n; + } + + if (n > m) { + // swap the input strings to consume less memory + final CharSequence tmp = left; + left = right; + right = tmp; + n = m; + m = right.length(); + } + + int p[] = new int[n + 1]; //'previous' cost array, horizontally + int d[] = new int[n + 1]; // cost array, horizontally + int _d[]; //placeholder to assist in swapping p and d + + // indexes into strings left and right + int i; // iterates through left + int j; // iterates through right + + char right_j; // jth character of right + + int cost; // cost + + for (i = 0; i <= n; i++) { + p[i] = i; + } + + for (j = 1; j <= m; j++) { + right_j = right.charAt(j - 1); + d[0] = j; + + for (i = 1; i <= n; i++) { + cost = left.charAt(i - 1) == right_j ? 0 : 1; + // minimum of cell to the left+1, to the top+1, diagonally left and up +cost + d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost); + } + + // copy current distance counts to 'previous row' distance counts + _d = p; + p = d; + d = _d; + } + + // our last action in the above loop was to switch d and p, so p now + // actually has the most recent cost counts + return p[n]; + } + } http://git-wip-us.apache.org/repos/asf/commons-text/blob/75cdc00a/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java b/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java index b2fab14..88778fc 100644 --- a/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java +++ b/src/test/java/org/apache/commons/text/similarity/FuzzyScoreTest.java @@ -20,56 +20,45 @@ import static org.junit.Assert.assertEquals; import java.util.Locale; -import org.junit.BeforeClass; import org.junit.Test; /** - * Unit tests for {@link org.apache.commons.text.FuzzyScore}. + * Unit tests for {@link org.apache.commons.text.similarity.FuzzyScore}. */ public class FuzzyScoreTest { - private static FuzzyScore score; - - @BeforeClass - public static void setUp() { - score = new FuzzyScore(); - } + private static final FuzzyScore ENGLISH_SCORE = new FuzzyScore(Locale.ENGLISH); @Test public void testGetFuzzyScore() throws Exception { - assertEquals(0, (int) score.compare("", "", Locale.ENGLISH)); - assertEquals(0, - (int) score.compare("Workshop", "b", Locale.ENGLISH)); - assertEquals(1, - (int) score.compare("Room", "o", Locale.ENGLISH)); - assertEquals(1, - (int) score.compare("Workshop", "w", Locale.ENGLISH)); - assertEquals(2, - (int) score.compare("Workshop", "ws", Locale.ENGLISH)); - assertEquals(4, - (int) score.compare("Workshop", "wo", Locale.ENGLISH)); - assertEquals(3, (int) score.compare( - "Apache Software Foundation", "asf", Locale.ENGLISH)); + assertEquals(0, (int) ENGLISH_SCORE.compare("", "")); + assertEquals(0, (int) ENGLISH_SCORE.compare("Workshop", "b")); + assertEquals(1, (int) ENGLISH_SCORE.compare("Room", "o")); + assertEquals(1, (int) ENGLISH_SCORE.compare("Workshop", "w")); + assertEquals(2, (int) ENGLISH_SCORE.compare("Workshop", "ws")); + assertEquals(4, (int) ENGLISH_SCORE.compare("Workshop", "wo")); + assertEquals(3, (int) ENGLISH_SCORE.compare( + "Apache Software Foundation", "asf")); } @Test(expected = IllegalArgumentException.class) - public void testGetFuzzyScore_NullNullNull() throws Exception { - score.compare(null, null, null); + public void testGetFuzzyScore_StringNullLocale() throws Exception { + ENGLISH_SCORE.compare("not null", null); } @Test(expected = IllegalArgumentException.class) - public void testGetFuzzyScore_StringNullLoclae() throws Exception { - score.compare(" ", null, Locale.ENGLISH); + public void testGetFuzzyScore_NullStringLocale() throws Exception { + ENGLISH_SCORE.compare(null, "not null"); } @Test(expected = IllegalArgumentException.class) - public void testGetFuzzyScore_NullStringLocale() throws Exception { - score.compare(null, "clear", Locale.ENGLISH); + public void testGetFuzzyScore_NullNullLocale() throws Exception { + ENGLISH_SCORE.compare(null, null); } @Test(expected = IllegalArgumentException.class) - public void testGetFuzzyScore_StringStringNull() throws Exception { - score.compare(" ", "clear", null); + public void testMissingLocale() throws Exception { + FuzzyScore score = new FuzzyScore((Locale) null); } } http://git-wip-us.apache.org/repos/asf/commons-text/blob/75cdc00a/src/test/java/org/apache/commons/text/similarity/JaroWrinklerDistanceTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/commons/text/similarity/JaroWrinklerDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/JaroWrinklerDistanceTest.java index 73e18f7..7050b05 100644 --- a/src/test/java/org/apache/commons/text/similarity/JaroWrinklerDistanceTest.java +++ b/src/test/java/org/apache/commons/text/similarity/JaroWrinklerDistanceTest.java @@ -22,7 +22,7 @@ import org.junit.BeforeClass; import org.junit.Test; /** - * Unit tests for {@link org.apache.commons.text.JaroWrinklerDistance}. + * Unit tests for {@link org.apache.commons.text.similarity.JaroWrinklerDistance}. */ public class JaroWrinklerDistanceTest { http://git-wip-us.apache.org/repos/asf/commons-text/blob/75cdc00a/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java index 1d4e6fa..814677d 100644 --- a/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java +++ b/src/test/java/org/apache/commons/text/similarity/LevenshteinDistanceTest.java @@ -18,129 +18,52 @@ package org.apache.commons.text.similarity; import static org.junit.Assert.assertEquals; -import org.junit.BeforeClass; import org.junit.Test; /** - * Unit tests for {@link org.apache.commons.text.LevenshteinDistance}. + * Unit tests for {@link org.apache.commons.text.similarity.LevenshteinDistance}. */ public class LevenshteinDistanceTest { - private static LevenshteinDistance distance; - - @BeforeClass - public static void setUp() { - distance = new LevenshteinDistance(); - } + private static final LevenshteinDistance UNLIMITED_DISTANCE = new LevenshteinDistance(); @Test public void testGetLevenshteinDistance_StringString() { - assertEquals(0, (int) distance.compare("", "")); - assertEquals(1, (int) distance.compare("", "a")); - assertEquals(7, (int) distance.compare("aaapppp", "")); - assertEquals(1, (int) distance.compare("frog", "fog")); - assertEquals(3, (int) distance.compare("fly", "ant")); - assertEquals(7, (int) distance.compare("elephant", "hippo")); - assertEquals(7, (int) distance.compare("hippo", "elephant")); - assertEquals(8, (int) distance.compare("hippo", "zzzzzzzz")); - assertEquals(8, (int) distance.compare("zzzzzzzz", "hippo")); - assertEquals(1, (int) distance.compare("hello", "hallo")); + assertEquals(0, (int) UNLIMITED_DISTANCE.compare("", "")); + assertEquals(1, (int) UNLIMITED_DISTANCE.compare("", "a")); + assertEquals(7, (int) UNLIMITED_DISTANCE.compare("aaapppp", "")); + assertEquals(1, (int) UNLIMITED_DISTANCE.compare("frog", "fog")); + assertEquals(3, (int) UNLIMITED_DISTANCE.compare("fly", "ant")); + assertEquals(7, (int) UNLIMITED_DISTANCE.compare("elephant", "hippo")); + assertEquals(7, (int) UNLIMITED_DISTANCE.compare("hippo", "elephant")); + assertEquals(8, (int) UNLIMITED_DISTANCE.compare("hippo", "zzzzzzzz")); + assertEquals(8, (int) UNLIMITED_DISTANCE.compare("zzzzzzzz", "hippo")); + assertEquals(1, (int) UNLIMITED_DISTANCE.compare("hello", "hallo")); } @Test(expected = IllegalArgumentException.class) public void testGetLevenshteinDistance_NullString() throws Exception { - distance.compare("a", null); + UNLIMITED_DISTANCE.compare("a", null); } @Test(expected = IllegalArgumentException.class) public void testGetLevenshteinDistance_StringNull() throws Exception { - distance.compare(null, "a"); - } - - @Test - public void testGetLevenshteinDistance_StringStringInt() { - // empty strings - assertEquals(0, (int) distance.compare("", "", 0)); - assertEquals(7, (int) distance.compare("aaapppp", "", 8)); - assertEquals(7, (int) distance.compare("aaapppp", "", 7)); - assertEquals(-1, (int) distance.compare("aaapppp", "", 6)); - - // unequal strings, zero threshold - assertEquals(-1, (int) distance.compare("b", "a", 0)); - assertEquals(-1, (int) distance.compare("a", "b", 0)); - - // equal strings - assertEquals(0, (int) distance.compare("aa", "aa", 0)); - assertEquals(0, (int) distance.compare("aa", "aa", 2)); - - // same length - assertEquals(-1, (int) distance.compare("aaa", "bbb", 2)); - assertEquals(3, (int) distance.compare("aaa", "bbb", 3)); - - // big stripe - assertEquals(6, (int) distance.compare("aaaaaa", "b", 10)); - - // distance less than threshold - assertEquals(7, (int) distance.compare("aaapppp", "b", 8)); - assertEquals(3, (int) distance.compare("a", "bbb", 4)); - - // distance equal to threshold - assertEquals(7, (int) distance.compare("aaapppp", "b", 7)); - assertEquals(3, (int) distance.compare("a", "bbb", 3)); - - // distance greater than threshold - assertEquals(-1, (int) distance.compare("a", "bbb", 2)); - assertEquals(-1, (int) distance.compare("bbb", "a", 2)); - assertEquals(-1, (int) distance.compare("aaapppp", "b", 6)); - - // stripe runs off array, strings not similar - assertEquals(-1, (int) distance.compare("a", "bbb", 1)); - assertEquals(-1, (int) distance.compare("bbb", "a", 1)); - - // stripe runs off array, strings are similar - assertEquals(-1, (int) distance.compare("12345", "1234567", 1)); - assertEquals(-1, (int) distance.compare("1234567", "12345", 1)); - - // old getLevenshteinDistance test cases - assertEquals(1, (int) distance.compare("frog", "fog", 1)); - assertEquals(3, (int) distance.compare("fly", "ant", 3)); - assertEquals(7, (int) distance.compare("elephant", "hippo", 7)); - assertEquals(-1, (int) distance.compare("elephant", "hippo", 6)); - assertEquals(7, (int) distance.compare("hippo", "elephant", 7)); - assertEquals(-1, (int) distance.compare("hippo", "elephant", 6)); - assertEquals(8, (int) distance.compare("hippo", "zzzzzzzz", 8)); - assertEquals(8, (int) distance.compare("zzzzzzzz", "hippo", 8)); - assertEquals(1, (int) distance.compare("hello", "hallo", 1)); - - assertEquals(1, - (int) distance.compare("frog", "fog", Integer.MAX_VALUE)); - assertEquals(3, (int) distance.compare("fly", "ant", Integer.MAX_VALUE)); - assertEquals(7, - (int) distance.compare("elephant", "hippo", Integer.MAX_VALUE)); - assertEquals(7, - (int) distance.compare("hippo", "elephant", Integer.MAX_VALUE)); - assertEquals(8, - (int) distance.compare("hippo", "zzzzzzzz", Integer.MAX_VALUE)); - assertEquals(8, - (int) distance.compare("zzzzzzzz", "hippo", Integer.MAX_VALUE)); - assertEquals(1, - (int) distance.compare("hello", "hallo", Integer.MAX_VALUE)); + UNLIMITED_DISTANCE.compare(null, "a"); } @Test(expected = IllegalArgumentException.class) public void testGetLevenshteinDistance_NullStringInt() throws Exception { - distance.compare(null, "a", 0); + UNLIMITED_DISTANCE.compare(null, "a"); } @Test(expected = IllegalArgumentException.class) public void testGetLevenshteinDistance_StringNullInt() throws Exception { - distance.compare("a", null, 0); + UNLIMITED_DISTANCE.compare("a", null); } @Test(expected = IllegalArgumentException.class) - public void testGetLevenshteinDistance_StringStringNegativeInt() - throws Exception { - distance.compare("a", "a", -1); + public void testConstructorWithNegativeThreshold() throws Exception { + LevenshteinDistance distance = new LevenshteinDistance(-1); } } http://git-wip-us.apache.org/repos/asf/commons-text/blob/75cdc00a/src/test/java/org/apache/commons/text/similarity/ParameterizedLevenshteinDistanceTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/commons/text/similarity/ParameterizedLevenshteinDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/ParameterizedLevenshteinDistanceTest.java new file mode 100644 index 0000000..c6fd116 --- /dev/null +++ b/src/test/java/org/apache/commons/text/similarity/ParameterizedLevenshteinDistanceTest.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +import static org.hamcrest.core.IsEqual.equalTo; +import static org.junit.Assert.assertThat; + +import java.util.Arrays; + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; + +/** + * Unit tests for {@link org.apache.commons.text.similarity.LevenshteinDistance}. + */ +@RunWith(Parameterized.class) +public class ParameterizedLevenshteinDistanceTest { + + private final Integer distance; + private final CharSequence left; + private final CharSequence right; + private final Integer threshold; + + public ParameterizedLevenshteinDistanceTest( + final Integer threshold, + final CharSequence left, final CharSequence right, + final Integer distance) { + + this.threshold = threshold; + this.left = left; + this.right = right; + this.distance = distance; + } + + @Parameters + public static Iterable<Object[]> parameters() { + return Arrays.asList( new Object[][] { + + /* empty strings */ + { 0, "", "", 0 }, + { 8, "aaapppp", "", 7 }, + { 7, "aaapppp", "", 7 }, + { 6, "aaapppp", "", -1 }, + + /* unequal strings, zero threshold */ + { 0, "b", "a", -1 }, + { 0, "a", "b", -1 }, + + /* equal strings */ + { 0, "aa", "aa", 0 }, + { 2, "aa", "aa", 0 }, + + /* same length */ + { 2, "aaa", "bbb", -1 }, + { 3, "aaa", "bbb", 3 }, + + /* big stripe */ + { 10, "aaaaaa", "b", 6 }, + + /* distance less than threshold */ + { 8, "aaapppp", "b", 7 }, + { 4, "a", "bbb", 3 }, + + /* distance equal to threshold */ + { 7, "aaapppp", "b", 7 }, + { 3, "a", "bbb", 3 }, + + /* distance greater than threshold */ + { 2, "a", "bbb", -1 }, + { 2, "bbb", "a", -1 }, + { 6, "aaapppp", "b", -1 }, + + /* stripe runs off array, strings not similar */ + { 1, "a", "bbb", -1 }, + { 1, "bbb", "a", -1 }, + + /* stripe runs off array, strings are similar */ + { 1, "12345", "1234567", -1 }, + { 1, "1234567", "12345", -1 }, + + /* old getLevenshteinDistance test cases */ + { 1, "frog", "fog", 1 }, + { 3, "fly", "ant", 3 }, + { 7, "elephant", "hippo", 7 }, + { 6, "elephant", "hippo", -1 }, + { 7, "hippo", "elephant", 7 }, + { 6, "hippo", "elephant", -1 }, + { 8, "hippo", "zzzzzzzz", 8 }, + { 8, "zzzzzzzz", "hippo", 8 }, + { 1, "hello", "hallo", 1 }, + + { Integer.MAX_VALUE, "frog", "fog", 1 }, + { Integer.MAX_VALUE, "fly", "ant", 3 }, + { Integer.MAX_VALUE, "elephant", "hippo", 7 }, + { Integer.MAX_VALUE, "hippo", "elephant", 7 }, + { Integer.MAX_VALUE, "hippo", "zzzzzzzz", 8 }, + { Integer.MAX_VALUE, "zzzzzzzz", "hippo", 8 }, + { Integer.MAX_VALUE, "hello", "hallo", 1 } + + } ); + } + + @Test + public void test() { + LevenshteinDistance metric = new LevenshteinDistance(threshold); + assertThat(metric.compare(left, right), equalTo(distance)); + } + +}