Repository: commons-text Updated Branches: refs/heads/master 4168de224 -> 0622f5a80
TEXT-10 A more complex Levenshtein distance Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/02168c1f Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/02168c1f Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/02168c1f Branch: refs/heads/master Commit: 02168c1f6c8e8c5d231203831eccea591de4c2a4 Parents: ad2c0bf Author: drajakumar <drajaku...@paypal.com> Authored: Fri Oct 14 17:48:12 2016 +0530 Committer: drajakumar <drajaku...@paypal.com> Committed: Fri Oct 14 17:48:12 2016 +0530 ---------------------------------------------------------------------- .../similarity/LevenshteinDetailedDistance.java | 530 +++++++++++++++++++ .../text/similarity/LevenshteinResults.java | 76 +++ .../LevenshteinDetailedDistanceTest.java | 354 +++++++++++++ 3 files changed, 960 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/commons-text/blob/02168c1f/src/main/java/org/apache/commons/text/similarity/LevenshteinDetailedDistance.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/LevenshteinDetailedDistance.java b/src/main/java/org/apache/commons/text/similarity/LevenshteinDetailedDistance.java new file mode 100644 index 0000000..8511058 --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/LevenshteinDetailedDistance.java @@ -0,0 +1,530 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +import java.util.Arrays; + +/** + * An algorithm for measuring the difference between two character sequences. + * + * <p> + * This is the number of changes needed to change one sequence into another, + * where each change is a single character modification (deletion, insertion + * or substitution). + * </p> + * + */ +public class LevenshteinDetailedDistance implements EditDistance<LevenshteinResults> { + + + /** + * Default instance. + */ + private static final LevenshteinDetailedDistance DEFAULT_INSTANCE = new LevenshteinDetailedDistance(); + + /** + * Threshold. + */ + private final Integer threshold; + + /** + * <p> + * This returns the default instance that uses a version + * of the algorithm that does not use a threshold parameter. + * </p> + * + * @see LevenshteinDetailedDistance#getDefaultInstance() + */ + public LevenshteinDetailedDistance() { + this(null); + } + + /** + * <p> + * If the threshold is not null, distance calculations will be limited to a maximum length. + * If the threshold is null, the unlimited version of the algorithm will be used. + * </p> + * + * @param threshold + * If this is null then distances calculations will not be limited. + * This may not be negative. + */ + public LevenshteinDetailedDistance(final Integer threshold) { + if (threshold != null && threshold < 0) { + throw new IllegalArgumentException("Threshold must not be negative"); + } + this.threshold = threshold; + } + + /** + * <p>Find the Levenshtein distance between two Strings.</p> + * + * <p>A higher score indicates a greater distance.</p> + * + * <p>The previous implementation of the Levenshtein distance algorithm + * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p> + * + * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError + * which can occur when my Java implementation is used with very large strings.<br> + * This implementation of the Levenshtein distance algorithm + * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p> + * + * <pre> + * distance.apply(null, *) = IllegalArgumentException + * distance.apply(*, null) = IllegalArgumentException + * distance.apply("","") = 0 + * distance.apply("","a") = 1 + * distance.apply("aaapppp", "") = 7 + * distance.apply("frog", "fog") = 1 + * distance.apply("fly", "ant") = 3 + * distance.apply("elephant", "hippo") = 7 + * distance.apply("hippo", "elephant") = 7 + * distance.apply("hippo", "zzzzzzzz") = 8 + * distance.apply("hello", "hallo") = 1 + * </pre> + * + * @param left the first string, must not be null + * @param right the second string, must not be null + * @return result distance, or -1 + * @throws IllegalArgumentException if either String input {@code null} + */ + public LevenshteinResults apply(CharSequence left, CharSequence right) { + if (threshold != null) { + return limitedCompare(left, right, threshold); + } else { + return unlimitedCompare(left, right); + } + } + + /** + * Gets the default instance. + * + * @return the default instace + */ + public static LevenshteinDetailedDistance getDefaultInstance() { + return DEFAULT_INSTANCE; + } + + /** + * Gets the distance threshold. + * + * @return the distance threshold + */ + public Integer getThreshold() { + return threshold; + } + + /** + * Find the Levenshtein distance between two CharSequences if it's less than or + * equal to a given threshold. + * + * <p> + * This implementation follows from Algorithms on Strings, Trees and + * Sequences by Dan Gusfield and Chas Emerick's implementation of the + * Levenshtein distance algorithm from <a + * href="http://www.merriampark.com/ld.htm" + * >http://www.merriampark.com/ld.htm</a> + * </p> + * + * <pre> + * limitedCompare(null, *, *) = IllegalArgumentException + * limitedCompare(*, null, *) = IllegalArgumentException + * limitedCompare(*, *, -1) = IllegalArgumentException + * limitedCompare("","", 0) = 0 + * limitedCompare("aaapppp", "", 8) = 7 + * limitedCompare("aaapppp", "", 7) = 7 + * limitedCompare("aaapppp", "", 6)) = -1 + * limitedCompare("elephant", "hippo", 7) = 7 + * limitedCompare("elephant", "hippo", 6) = -1 + * limitedCompare("hippo", "elephant", 7) = 7 + * limitedCompare("hippo", "elephant", 6) = -1 + * </pre> + * + * @param left the first string, must not be null + * @param right the second string, must not be null + * @param threshold the target threshold, must not be negative + * @return result distance, or -1 + */ + private static LevenshteinResults limitedCompare(CharSequence left, CharSequence right, int threshold) { + if (left == null || right == null) { + throw new IllegalArgumentException("Strings must not be null"); + } + if (threshold < 0) { + throw new IllegalArgumentException("Threshold must not be negative"); + } + + /* + * This implementation only computes the distance if it's less than or + * equal to the threshold value, returning -1 if it's greater. The + * advantage is performance: unbounded distance is O(nm), but a bound of + * k allows us to reduce it to O(km) time by only computing a diagonal + * stripe of width 2k + 1 of the cost table. It is also possible to use + * this to compute the unbounded Levenshtein distance by starting the + * threshold at 1 and doubling each time until the distance is found; + * this is O(dm), where d is the distance. + * + * One subtlety comes from needing to ignore entries on the border of + * our stripe eg. p[] = |#|#|#|* d[] = *|#|#|#| We must ignore the entry + * to the left of the leftmost member We must ignore the entry above the + * rightmost member + * + * Another subtlety comes from our stripe running off the matrix if the + * strings aren't of the same size. Since string s is always swapped to + * be the shorter of the two, the stripe will always run off to the + * upper right instead of the lower left of the matrix. + * + * As a concrete example, suppose s is of length 5, t is of length 7, + * and our threshold is 1. In this case we're going to walk a stripe of + * length 3. The matrix would look like so: + * + * <pre> + * 1 2 3 4 5 + * 1 |#|#| | | | + * 2 |#|#|#| | | + * 3 | |#|#|#| | + * 4 | | |#|#|#| + * 5 | | | |#|#| + * 6 | | | | |#| + * 7 | | | | | | + * </pre> + * + * Note how the stripe leads off the table as there is no possible way + * to turn a string of length 5 into one of length 7 in edit distance of + * 1. + * + * Additionally, this implementation decreases memory usage by using two + * single-dimensional arrays and swapping them back and forth instead of + * allocating an entire n by m matrix. This requires a few minor + * changes, such as immediately returning when it's detected that the + * stripe has run off the matrix and initially filling the arrays with + * large values so that entries we don't compute are ignored. + * + * See Algorithms on Strings, Trees and Sequences by Dan Gusfield for + * some discussion. + */ + + int n = left.length(); // length of left + int m = right.length(); // length of right + + // if one string is empty, the edit distance is necessarily the length + // of the other + if (n == 0) { + return m <= threshold ? new LevenshteinResults(m, m, 0, 0) : new LevenshteinResults(-1, 0, 0, 0); + } else if (m == 0) { + return n <= threshold ? new LevenshteinResults(n, 0, n, 0) : new LevenshteinResults(-1, 0, 0, 0); + } + + boolean swapped = false; + if (n > m) { + // swap the two strings to consume less memory + final CharSequence tmp = left; + left = right; + right = tmp; + n = m; + m = right.length(); + swapped = true; + } + + int[] p = new int[n + 1]; // 'previous' cost array, horizontally + int[] d = new int[n + 1]; // cost array, horizontally + int[] tempD; // placeholder to assist in swapping p and d + int[][] matrix = new int[m+1][n+1]; + + //filling the first row and first column values in the matrix + for(int index = 0; index <=n; index++) { + matrix[0][index] = index; + } + for(int index = 0; index <=m; index++) { + matrix[index][0] = index; + } + + // fill in starting table values + final int boundary = Math.min(n, threshold) + 1; + for (int i = 0; i < boundary; i++) { + p[i] = i; + } + // these fills ensure that the value above the rightmost entry of our + // stripe will be ignored in following loop iterations + Arrays.fill(p, boundary, p.length, Integer.MAX_VALUE); + Arrays.fill(d, Integer.MAX_VALUE); + + // iterates through t + for (int j = 1; j <= m; j++) { + final char rightJ = right.charAt(j - 1); // jth character of right + d[0] = j; + + // compute stripe indices, constrain to array size + final int min = Math.max(1, j - threshold); + final int max = j > Integer.MAX_VALUE - threshold ? n : Math.min( + n, j + threshold); + + // the stripe may lead off of the table if s and t are of different + // sizes + if (min > max) { + return new LevenshteinResults(-1, 0, 0, 0); + } + + // ignore entry left of leftmost + if (min > 1) { + d[min - 1] = Integer.MAX_VALUE; + } + + // iterates through [min, max] in s + for (int i = min; i <= max; i++) { + if (left.charAt(i - 1) == rightJ) { + // diagonally left and up + d[i] = p[i - 1]; + } else { + // 1 + minimum of cell to the left, to the top, diagonally + // left and up + d[i] = 1 + Math.min(Math.min(d[i - 1], p[i]), p[i - 1]); + } + matrix[j][i] = d[i]; + } + + // copy current distance counts to 'previous row' distance counts + tempD = p; + p = d; + d = tempD; + } + + // if p[n] is greater than the threshold, there's no guarantee on it + // being the correct + // distance + if (p[n] <= threshold) { + return findDetailedResults(left, right, matrix, swapped); + } + return new LevenshteinResults(-1, 0, 0, 0); + } + + /** + * <p>Find the Levenshtein distance between two Strings.</p> + * + * <p>A higher score indicates a greater distance.</p> + * + * <p>The previous implementation of the Levenshtein distance algorithm + * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p> + * + * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError + * which can occur when my Java implementation is used with very large strings.<br> + * This implementation of the Levenshtein distance algorithm + * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p> + * + * <pre> + * unlimitedCompare(null, *) = IllegalArgumentException + * unlimitedCompare(*, null) = IllegalArgumentException + * unlimitedCompare("","") = 0 + * unlimitedCompare("","a") = 1 + * unlimitedCompare("aaapppp", "") = 7 + * unlimitedCompare("frog", "fog") = 1 + * unlimitedCompare("fly", "ant") = 3 + * unlimitedCompare("elephant", "hippo") = 7 + * unlimitedCompare("hippo", "elephant") = 7 + * unlimitedCompare("hippo", "zzzzzzzz") = 8 + * unlimitedCompare("hello", "hallo") = 1 + * </pre> + * + * @param left the first String, must not be null + * @param right the second String, must not be null + * @return result distance, or -1 + * @throws IllegalArgumentException if either String input {@code null} + */ + private static LevenshteinResults unlimitedCompare(CharSequence left, CharSequence right) { + if (left == null || right == null) { + throw new IllegalArgumentException("Strings must not be null"); + } + + /* + The difference between this impl. and the previous is that, rather + than creating and retaining a matrix of size s.length() + 1 by t.length() + 1, + we maintain two single-dimensional arrays of length s.length() + 1. The first, d, + is the 'current working' distance array that maintains the newest distance cost + counts as we iterate through the characters of String s. Each time we increment + the index of String t we are comparing, d is copied to p, the second int[]. Doing so + allows us to retain the previous cost counts as required by the algorithm (taking + the minimum of the cost count to the left, up one, and diagonally up and to the left + of the current cost count being calculated). (Note that the arrays aren't really + copied anymore, just switched...this is clearly much better than cloning an array + or doing a System.arraycopy() each time through the outer loop.) + + Effectively, the difference between the two implementations is this one does not + cause an out of memory condition when calculating the LD over two very large strings. + */ + + int n = left.length(); // length of left + int m = right.length(); // length of right + + if (n == 0) { + return new LevenshteinResults(m, m, 0, 0); + } else if (m == 0) { + return new LevenshteinResults(n, 0, n, 0); + } + boolean swapped = false; + if (n > m) { + // swap the input strings to consume less memory + final CharSequence tmp = left; + left = right; + right = tmp; + n = m; + m = right.length(); + swapped = true; + } + + int[] p = new int[n + 1]; //'previous' cost array, horizontally + int[] d = new int[n + 1]; // cost array, horizontally + int[] tempD; //placeholder to assist in swapping p and d + int[][] matrix = new int[m+1][n+1]; + + //filling the first row and first column values in the matrix + for(int index = 0; index <=n; index++) { + matrix[0][index] = index; + } + for(int index = 0; index <=m; index++) { + matrix[index][0] = index; + } + + // indexes into strings left and right + int i; // iterates through left + int j; // iterates through right + + char rightJ; // jth character of right + + int cost; // cost + for (i = 0; i <= n; i++) { + p[i] = i; + } + + for (j = 1; j <= m; j++) { + rightJ = right.charAt(j - 1); + d[0] = j; + + for (i = 1; i <= n; i++) { + cost = left.charAt(i - 1) == rightJ ? 0 : 1; + // minimum of cell to the left+1, to the top+1, diagonally left and up +cost + d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost); + //filling the matrix + matrix[j][i] = d[i]; + } + + // copy current distance counts to 'previous row' distance counts + tempD = p; + p = d; + d = tempD; + } + return findDetailedResults(left, right, matrix, swapped); + } + + /** + * Finds count for each of the three [insert, delete, substitute] operations + * needed. This is based on the matrix formed based on the two character + * sequence. + * + * @param left + * character sequence which need to be converted from. + * @param right + * character sequence which need to be converted to. + * @param matrix + * two dimensional array containing + * @param swapped + * tells whether the value for left character sequence and right + * character sequence were swapped to save memory + * @return result object containing the count of insert, delete and + * substitute and total count needed + */ + private static LevenshteinResults findDetailedResults(CharSequence left, CharSequence right, int[][] matrix, + boolean swapped) { + + int delCount = 0; + int addCount = 0; + int subCount = 0; + + int rowIndex = right.length(); + int columnIndex = left.length(); + + int dataAtLeft = 0; + int dataAtTop = 0; + int dataAtDiagonal = 0; + int data = 0; + boolean deleted = false; + boolean added = false; + + while (rowIndex >= 0 && columnIndex >= 0) { + + if (columnIndex == 0) { + dataAtLeft = -1; + } else { + dataAtLeft = matrix[rowIndex][columnIndex - 1]; + } + if (rowIndex == 0) { + dataAtTop = -1; + } else { + dataAtTop = matrix[rowIndex - 1][columnIndex]; + } + if (rowIndex > 0 && columnIndex > 0) { + dataAtDiagonal = matrix[rowIndex - 1][columnIndex - 1]; + } else { + dataAtDiagonal = -1; + } + if (dataAtLeft == -1 && dataAtTop == -1 && dataAtDiagonal == -1) { + break; + } + data = matrix[rowIndex][columnIndex]; + + // case in which the character at left and right are the same, + // in this case none of the counters will be incremented. + if (columnIndex > 0 && rowIndex > 0 && left.charAt(columnIndex - 1) == right.charAt(rowIndex - 1)) { + columnIndex--; + rowIndex--; + continue; + } + + // handling insert and delete cases. + deleted = false; + added = false; + if (data - 1 == dataAtLeft && (data <= dataAtDiagonal && data <= dataAtTop) + || (dataAtDiagonal == -1 && dataAtTop == -1)) { + columnIndex--; + if (swapped == true) { + addCount++; + added = true; + } else { + delCount++; + deleted = true; + } + } + else if (data - 1 == dataAtTop && (data <= dataAtDiagonal && data <= dataAtLeft) + || (dataAtDiagonal == -1 && dataAtLeft == -1)) { + rowIndex--; + if (swapped == true) { + delCount++; + deleted = true; + } else { + addCount++; + added = true; + } + } + + // substituted case + if (!added && !deleted) { + subCount++; + columnIndex--; + rowIndex--; + } + } + return new LevenshteinResults(addCount + delCount + subCount, addCount, delCount, subCount); + } +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/02168c1f/src/main/java/org/apache/commons/text/similarity/LevenshteinResults.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/text/similarity/LevenshteinResults.java b/src/main/java/org/apache/commons/text/similarity/LevenshteinResults.java new file mode 100644 index 0000000..69e7d32 --- /dev/null +++ b/src/main/java/org/apache/commons/text/similarity/LevenshteinResults.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +/** + * Container class to store Levenshtein distance between two character sequence. + * It also stores the count of inserts, deletions and substitutions needed to + * change one character sequence to other. + * + */ +public class LevenshteinResults { + + private final Integer distance; + private final Integer insertCount; + private final Integer deleteCount; + private final Integer substituteCount; + + public LevenshteinResults (final Integer distance, final Integer insertCount, final Integer deleteCount, final Integer substituteCount) { + this.distance = distance; + this.insertCount = insertCount; + this.deleteCount = deleteCount; + this.substituteCount = substituteCount; + } + + /** + * gets the distance between two character sequence. + * @return distance between two character sequence. + */ + public Integer getDistance() { + return distance; + } + + /** + * gets the number of insertion needed to change one character sequence to + * other. + * + * @return insert character count. + */ + public Integer getInsertCount() { + return insertCount; + } + + /** + * gets the number of character deletion needed to change one character + * sequence to other. + * + * @return delete character count. + */ + public Integer getDeleteCount() { + return deleteCount; + } + + /** + * get the number of character substitution needed to change one character + * sequence to other. + * + * @return substitute character count. + */ + public Integer getSubstituteCount() { + return substituteCount; + } +} http://git-wip-us.apache.org/repos/asf/commons-text/blob/02168c1f/src/test/java/org/apache/commons/text/similarity/LevenshteinDetailedDistanceTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/commons/text/similarity/LevenshteinDetailedDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/LevenshteinDetailedDistanceTest.java new file mode 100644 index 0000000..7ac3f8e --- /dev/null +++ b/src/test/java/org/apache/commons/text/similarity/LevenshteinDetailedDistanceTest.java @@ -0,0 +1,354 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.text.similarity; + +import static org.junit.Assert.assertEquals; + +import org.junit.Test; + +public class LevenshteinDetailedDistanceTest { + + private static final LevenshteinDetailedDistance UNLIMITED_DISTANCE = new LevenshteinDetailedDistance(); + + @Test + public void testGetLevenshteinDetailedDistance_StringString() { + LevenshteinResults result = UNLIMITED_DISTANCE.apply("", ""); + assertEquals(0, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(0, (int) result.getDeleteCount()); + assertEquals(0, (int) result.getSubstituteCount()); + + result = UNLIMITED_DISTANCE.apply("", "a"); + assertEquals(1, (int) result.getDistance()); + assertEquals(1, (int) result.getInsertCount()); + assertEquals(0, (int) result.getDeleteCount()); + assertEquals(0, (int) result.getSubstituteCount()); + + result = UNLIMITED_DISTANCE.apply("aaapppp", ""); + assertEquals(7, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(7, (int) result.getDeleteCount()); + assertEquals(0, (int) result.getSubstituteCount()); + + result = UNLIMITED_DISTANCE.apply("frog", "fog"); + assertEquals(1, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(1, (int) result.getDeleteCount()); + assertEquals(0, (int) result.getSubstituteCount()); + + result = UNLIMITED_DISTANCE.apply("fly", "ant"); + assertEquals(3, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(0, (int) result.getDeleteCount()); + assertEquals(3, (int) result.getSubstituteCount()); + + result = UNLIMITED_DISTANCE.apply("elephant", "hippo"); + assertEquals(7, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(3, (int) result.getDeleteCount()); + assertEquals(4, (int) result.getSubstituteCount()); + + result = UNLIMITED_DISTANCE.apply("hippo", "elephant"); + assertEquals(7, (int) result.getDistance()); + assertEquals(3, (int) result.getInsertCount()); + assertEquals(0, (int) result.getDeleteCount()); + assertEquals(4, (int) result.getSubstituteCount()); + + result = UNLIMITED_DISTANCE.apply("hippo", "zzzzzzzz"); + assertEquals(8, (int) result.getDistance()); + assertEquals(3, (int) result.getInsertCount()); + assertEquals(0, (int) result.getDeleteCount()); + assertEquals(5, (int) result.getSubstituteCount()); + + result = UNLIMITED_DISTANCE.apply("zzzzzzzz", "hippo"); + assertEquals(8, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(3, (int) result.getDeleteCount()); + assertEquals(5, (int) result.getSubstituteCount()); + + result = UNLIMITED_DISTANCE.apply("hello", "hallo"); + assertEquals(1, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(0, (int) result.getDeleteCount()); + assertEquals(1, (int) result.getSubstituteCount()); + } + + @Test(expected = IllegalArgumentException.class) + public void testGetLevenshteinDetailedDistance_NullString() throws Exception { + UNLIMITED_DISTANCE.apply("a", null); + } + + @Test(expected = IllegalArgumentException.class) + public void testGetLevenshteinDetailedDistance_StringNull() throws Exception { + UNLIMITED_DISTANCE.apply(null, "a"); + } + + @Test + public void testGetLevenshteinDetailedDistance_StringStringInt() { + + LevenshteinResults result = new LevenshteinDetailedDistance(0).apply("", ""); + + assertEquals(0, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(0, (int) result.getDeleteCount()); + assertEquals(0, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(8).apply("aaapppp", ""); + assertEquals(7, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(7, (int) result.getDeleteCount()); + assertEquals(0, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(7).apply("aaapppp", ""); + assertEquals(7, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(7, (int) result.getDeleteCount()); + assertEquals(0, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(6).apply("aaapppp", ""); + assertEquals(-1, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(0, (int) result.getDeleteCount()); + assertEquals(0, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(0).apply("b", "a"); + assertEquals(-1, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(0, (int) result.getDeleteCount()); + assertEquals(0, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(0).apply("a", "b"); + assertEquals(-1, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(0, (int) result.getDeleteCount()); + assertEquals(0, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(0).apply("aa", "aa"); + assertEquals(0, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(0, (int) result.getDeleteCount()); + assertEquals(0, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(2).apply("aa", "aa"); + assertEquals(0, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(0, (int) result.getDeleteCount()); + assertEquals(0, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(2).apply("aaa", "bbb"); + assertEquals(-1, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(0, (int) result.getDeleteCount()); + assertEquals(0, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(3).apply("aaa", "bbb"); + assertEquals(3, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(0, (int) result.getDeleteCount()); + assertEquals(3, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(10).apply("aaaaaa", "b"); + assertEquals(6, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(5, (int) result.getDeleteCount()); + assertEquals(1, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(8).apply("aaapppp", "b"); + assertEquals(7, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(6, (int) result.getDeleteCount()); + assertEquals(1, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(4).apply("a", "bbb"); + assertEquals(3, (int) result.getDistance()); + assertEquals(2, (int) result.getInsertCount()); + assertEquals(0, (int) result.getDeleteCount()); + assertEquals(1, (int) result.getSubstituteCount()); + + + result = new LevenshteinDetailedDistance(7).apply("aaapppp", "b"); + assertEquals(7, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(6, (int) result.getDeleteCount()); + assertEquals(1, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(3).apply("a", "bbb"); + assertEquals(3, (int) result.getDistance()); + assertEquals(2, (int) result.getInsertCount()); + assertEquals(0, (int) result.getDeleteCount()); + assertEquals(1, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(2).apply("a", "bbb"); + assertEquals(-1, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(0, (int) result.getDeleteCount()); + assertEquals(0, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(2).apply("bbb", "a"); + assertEquals(-1, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(0, (int) result.getDeleteCount()); + assertEquals(0, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(6).apply("aaapppp", "b"); + assertEquals(-1, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(0, (int) result.getDeleteCount()); + assertEquals(0, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(1).apply("a", "bbb"); + assertEquals(-1, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(0, (int) result.getDeleteCount()); + assertEquals(0, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(1).apply("bbb", "a"); + assertEquals(-1, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(0, (int) result.getDeleteCount()); + assertEquals(0, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(1).apply("12345", "1234567"); + assertEquals(-1, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(0, (int) result.getDeleteCount()); + assertEquals(0, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(1).apply("1234567", "12345"); + assertEquals(-1, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(0, (int) result.getDeleteCount()); + assertEquals(0, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(1).apply("frog", "fog"); + assertEquals(1, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(1, (int) result.getDeleteCount()); + assertEquals(0, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(3).apply("fly", "ant"); + assertEquals(3, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(0, (int) result.getDeleteCount()); + assertEquals(3, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(7).apply("elephant", "hippo"); + assertEquals(7, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(3, (int) result.getDeleteCount()); + assertEquals(4, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(6).apply("elephant", "hippo"); + assertEquals(-1, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(0, (int) result.getDeleteCount()); + assertEquals(0, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(7).apply("hippo", "elephant"); + assertEquals(7, (int) result.getDistance()); + assertEquals(3, (int) result.getInsertCount()); + assertEquals(0, (int) result.getDeleteCount()); + assertEquals(4, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(7).apply("hippo", "elephant"); + assertEquals(7, (int) result.getDistance()); + assertEquals(3, (int) result.getInsertCount()); + assertEquals(0, (int) result.getDeleteCount()); + assertEquals(4, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(6).apply("hippo", "elephant"); + assertEquals(-1, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(0, (int) result.getDeleteCount()); + assertEquals(0, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(8).apply("hippo", "zzzzzzzz"); + assertEquals(8, (int) result.getDistance()); + assertEquals(3, (int) result.getInsertCount()); + assertEquals(0, (int) result.getDeleteCount()); + assertEquals(5, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(8).apply("zzzzzzzz", "hippo"); + assertEquals(8, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(3, (int) result.getDeleteCount()); + assertEquals(5, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(1).apply("hello", "hallo"); + assertEquals(1, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(0, (int) result.getDeleteCount()); + assertEquals(1, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(Integer.MAX_VALUE).apply("frog", "fog"); + assertEquals(1, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(1, (int) result.getDeleteCount()); + assertEquals(0, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(Integer.MAX_VALUE).apply("fly", "ant"); + assertEquals(3, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(0, (int) result.getDeleteCount()); + assertEquals(3, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(Integer.MAX_VALUE).apply("elephant", "hippo"); + assertEquals(7, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(3, (int) result.getDeleteCount()); + assertEquals(4, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(Integer.MAX_VALUE).apply("hippo", "elephant"); + assertEquals(7, (int) result.getDistance()); + assertEquals(3, (int) result.getInsertCount()); + assertEquals(0, (int) result.getDeleteCount()); + assertEquals(4, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(Integer.MAX_VALUE).apply("hippo", "zzzzzzzz"); + assertEquals(8, (int) result.getDistance()); + assertEquals(3, (int) result.getInsertCount()); + assertEquals(0, (int) result.getDeleteCount()); + assertEquals(5, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(Integer.MAX_VALUE).apply("zzzzzzzz", "hippo"); + assertEquals(8, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(3, (int) result.getDeleteCount()); + assertEquals(5, (int) result.getSubstituteCount()); + + result = new LevenshteinDetailedDistance(Integer.MAX_VALUE).apply("hello", "hallo"); + assertEquals(1, (int) result.getDistance()); + assertEquals(0, (int) result.getInsertCount()); + assertEquals(0, (int) result.getDeleteCount()); + assertEquals(1, (int) result.getSubstituteCount()); + } + + @Test(expected = IllegalArgumentException.class) + public void testGetLevenshteinDetailedDistance_NullStringInt() throws Exception { + UNLIMITED_DISTANCE.apply(null, "a"); + } + + @Test(expected = IllegalArgumentException.class) + public void testGetLevenshteinDetailedDistance_StringNullInt() throws Exception { + UNLIMITED_DISTANCE.apply("a", null); + } + + @Test(expected = IllegalArgumentException.class) + public void testConstructorWithNegativeThreshold() throws Exception { + new LevenshteinDetailedDistance(-1); + } +}