This is an automated email from the ASF dual-hosted git repository. ggregory pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/commons-text.git
commit a28b9fa39bf83b6e1499493210eb8288a35dba1b Author: Gary D. Gregory <[email protected]> AuthorDate: Sun Jul 20 10:01:03 2025 -0400 Javadoc --- .../similarity/LevenshteinDetailedDistance.java | 275 +++++++++------------ 1 file changed, 110 insertions(+), 165 deletions(-) diff --git a/src/main/java/org/apache/commons/text/similarity/LevenshteinDetailedDistance.java b/src/main/java/org/apache/commons/text/similarity/LevenshteinDetailedDistance.java index 0e241efd..78505991 100644 --- a/src/main/java/org/apache/commons/text/similarity/LevenshteinDetailedDistance.java +++ b/src/main/java/org/apache/commons/text/similarity/LevenshteinDetailedDistance.java @@ -14,6 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.commons.text.similarity; import java.util.Arrays; @@ -22,9 +23,8 @@ import java.util.Arrays; * An algorithm for measuring the difference between two character sequences. * * <p> - * This is the number of changes needed to change one sequence into another, - * where each change is a single character modification (deletion, insertion - * or substitution). + * This is the number of changes needed to change one sequence into another, where each change is a single character modification (deletion, insertion or + * substitution). * </p> * * @since 1.0 @@ -37,39 +37,29 @@ public class LevenshteinDetailedDistance implements EditDistance<LevenshteinResu private static final LevenshteinDetailedDistance INSTANCE = new LevenshteinDetailedDistance(); /** - * Finds count for each of the three [insert, delete, substitute] operations - * needed. This is based on the matrix formed based on the two character - * sequence. + * Finds count for each of the three [insert, delete, substitute] operations needed. This is based on the matrix formed based on the two character sequence. * - * @param <E> The type of similarity score unit. - * @param left character sequence which need to be converted from - * @param right character sequence which need to be converted to - * @param matrix two dimensional array containing - * @param swapped tells whether the value for left character sequence and right - * character sequence were swapped to save memory - * @return result object containing the count of insert, delete and substitute and total count needed + * @param <E> The type of similarity score unit. + * @param left character sequence which need to be converted from. + * @param right character sequence which need to be converted to. + * @param matrix two dimensional array containing. + * @param swapped tells whether the value for left character sequence and right character sequence were swapped to save memory. + * @return result object containing the count of insert, delete and substitute and total count needed. */ - private static <E> LevenshteinResults findDetailedResults(final SimilarityInput<E> left, - final SimilarityInput<E> right, - final int[][] matrix, - final boolean swapped) { - + private static <E> LevenshteinResults findDetailedResults(final SimilarityInput<E> left, final SimilarityInput<E> right, final int[][] matrix, + final boolean swapped) { int delCount = 0; int addCount = 0; int subCount = 0; - int rowIndex = right.length(); int columnIndex = left.length(); - int dataAtLeft = 0; int dataAtTop = 0; int dataAtDiagonal = 0; int data = 0; boolean deleted = false; boolean added = false; - while (rowIndex >= 0 && columnIndex >= 0) { - if (columnIndex == 0) { dataAtLeft = -1; } else { @@ -89,7 +79,6 @@ public class LevenshteinDetailedDistance implements EditDistance<LevenshteinResu break; } data = matrix[rowIndex][columnIndex]; - // case in which the character at left and right are the same, // in this case none of the counters will be incremented. if (columnIndex > 0 && rowIndex > 0 && left.at(columnIndex - 1).equals(right.at(rowIndex - 1))) { @@ -97,12 +86,10 @@ public class LevenshteinDetailedDistance implements EditDistance<LevenshteinResu rowIndex--; continue; } - // handling insert and delete cases. deleted = false; added = false; - if (data - 1 == dataAtLeft && data <= dataAtDiagonal && data <= dataAtTop - || dataAtDiagonal == -1 && dataAtTop == -1) { // NOPMD + if (data - 1 == dataAtLeft && data <= dataAtDiagonal && data <= dataAtTop || dataAtDiagonal == -1 && dataAtTop == -1) { // NOPMD columnIndex--; if (swapped) { addCount++; @@ -111,8 +98,7 @@ public class LevenshteinDetailedDistance implements EditDistance<LevenshteinResu delCount++; deleted = true; } - } else if (data - 1 == dataAtTop && data <= dataAtDiagonal && data <= dataAtLeft - || dataAtDiagonal == -1 && dataAtLeft == -1) { // NOPMD + } else if (data - 1 == dataAtTop && data <= dataAtDiagonal && data <= dataAtLeft || dataAtDiagonal == -1 && dataAtLeft == -1) { // NOPMD rowIndex--; if (swapped) { delCount++; @@ -122,7 +108,6 @@ public class LevenshteinDetailedDistance implements EditDistance<LevenshteinResu added = true; } } - // substituted case if (!added && !deleted) { subCount++; @@ -143,15 +128,11 @@ public class LevenshteinDetailedDistance implements EditDistance<LevenshteinResu } /** - * Finds the Levenshtein distance between two CharSequences if it's less than or - * equal to a given threshold. + * Finds the Levenshtein distance between two CharSequences if it's less than or equal to a given threshold. * * <p> - * This implementation follows from Algorithms on Strings, Trees and - * Sequences by Dan Gusfield and Chas Emerick's implementation of the - * Levenshtein distance algorithm from <a - * href="http://www.merriampark.com/ld.htm" - * >http://www.merriampark.com/ld.htm</a> + * This implementation follows from Algorithms on Strings, Trees and Sequences by Dan Gusfield and Chas Emerick's implementation of the Levenshtein distance + * algorithm from <a href="http://www.merriampark.com/ld.htm" >http://www.merriampark.com/ld.htm</a> * </p> * * <pre> @@ -168,73 +149,46 @@ public class LevenshteinDetailedDistance implements EditDistance<LevenshteinResu * limitedCompare("hippo", "elephant", 6) = -1 * </pre> * - * @param <E> The type of similarity score unit. - * @param left the first CharSequence, must not be null - * @param right the second CharSequence, must not be null - * @param threshold the target threshold, must not be negative - * @return result distance, or -1 + * @param <E> The type of similarity score unit. + * @param left the first CharSequence, must not be null. + * @param right the second CharSequence, must not be null. + * @param threshold the target threshold, must not be negative. + * @return result distance, or -1. */ - private static <E> LevenshteinResults limitedCompare(SimilarityInput<E> left, SimilarityInput<E> right, final int threshold) { //NOPMD + private static <E> LevenshteinResults limitedCompare(SimilarityInput<E> left, SimilarityInput<E> right, final int threshold) { // NOPMD if (left == null || right == null) { throw new IllegalArgumentException("CharSequences must not be null"); } if (threshold < 0) { throw new IllegalArgumentException("Threshold must not be negative"); } - /* - * This implementation only computes the distance if it's less than or - * equal to the threshold value, returning -1 if it's greater. The - * advantage is performance: unbounded distance is O(nm), but a bound of - * k allows us to reduce it to O(km) time by only computing a diagonal - * stripe of width 2k + 1 of the cost table. It is also possible to use - * this to compute the unbounded Levenshtein distance by starting the - * threshold at 1 and doubling each time until the distance is found; - * this is O(dm), where d is the distance. + * This implementation only computes the distance if it's less than or equal to the threshold value, returning -1 if it's greater. The advantage is + * performance: unbounded distance is O(nm), but a bound of k allows us to reduce it to O(km) time by only computing a diagonal stripe of width 2k + 1 + * of the cost table. It is also possible to use this to compute the unbounded Levenshtein distance by starting the threshold at 1 and doubling each + * time until the distance is found; this is O(dm), where d is the distance. * - * One subtlety comes from needing to ignore entries on the border of - * our stripe eg. p[] = |#|#|#|* d[] = *|#|#|#| We must ignore the entry - * to the left of the leftmost member We must ignore the entry above the - * rightmost member + * One subtlety comes from needing to ignore entries on the border of our stripe eg. p[] = |#|#|#|* d[] = *|#|#|#| We must ignore the entry to the left + * of the leftmost member We must ignore the entry above the rightmost member * - * Another subtlety comes from our stripe running off the matrix if the - * strings aren't of the same size. Since string s is always swapped to - * be the shorter of the two, the stripe will always run off to the - * upper right instead of the lower left of the matrix. + * Another subtlety comes from our stripe running off the matrix if the strings aren't of the same size. Since string s is always swapped to be the + * shorter of the two, the stripe will always run off to the upper right instead of the lower left of the matrix. * - * As a concrete example, suppose s is of length 5, t is of length 7, - * and our threshold is 1. In this case we're going to walk a stripe of - * length 3. The matrix would look like so: + * As a concrete example, suppose s is of length 5, t is of length 7, and our threshold is 1. In this case we're going to walk a stripe of length 3. The + * matrix would look like so: * - * <pre> - * 1 2 3 4 5 - * 1 |#|#| | | | - * 2 |#|#|#| | | - * 3 | |#|#|#| | - * 4 | | |#|#|#| - * 5 | | | |#|#| - * 6 | | | | |#| - * 7 | | | | | | - * </pre> + * <pre> 1 2 3 4 5 1 |#|#| | | | 2 |#|#|#| | | 3 | |#|#|#| | 4 | | |#|#|#| 5 | | | |#|#| 6 | | | | |#| 7 | | | | | | </pre> * - * Note how the stripe leads off the table as there is no possible way - * to turn a string of length 5 into one of length 7 in edit distance of - * 1. + * Note how the stripe leads off the table as there is no possible way to turn a string of length 5 into one of length 7 in edit distance of 1. * - * Additionally, this implementation decreases memory usage by using two - * single-dimensional arrays and swapping them back and forth instead of - * allocating an entire n by m matrix. This requires a few minor - * changes, such as immediately returning when it's detected that the - * stripe has run off the matrix and initially filling the arrays with - * large values so that entries we don't compute are ignored. + * Additionally, this implementation decreases memory usage by using two single-dimensional arrays and swapping them back and forth instead of + * allocating an entire n by m matrix. This requires a few minor changes, such as immediately returning when it's detected that the stripe has run off + * the matrix and initially filling the arrays with large values so that entries we don't compute are ignored. * - * See Algorithms on Strings, Trees and Sequences by Dan Gusfield for - * some discussion. + * See Algorithms on Strings, Trees and Sequences by Dan Gusfield for some discussion. */ - int n = left.length(); // length of left int m = right.length(); // length of right - // if one string is empty, the edit distance is necessarily the length of the other if (n == 0) { return m <= threshold ? new LevenshteinResults(m, m, 0, 0) : new LevenshteinResults(-1, 0, 0, 0); @@ -242,7 +196,6 @@ public class LevenshteinDetailedDistance implements EditDistance<LevenshteinResu if (m == 0) { return n <= threshold ? new LevenshteinResults(n, 0, n, 0) : new LevenshteinResults(-1, 0, 0, 0); } - boolean swapped = false; if (n > m) { // swap the two strings to consume less memory @@ -253,20 +206,17 @@ public class LevenshteinDetailedDistance implements EditDistance<LevenshteinResu m = right.length(); swapped = true; } - int[] p = new int[n + 1]; // 'previous' cost array, horizontally int[] d = new int[n + 1]; // cost array, horizontally int[] tempD; // placeholder to assist in swapping p and d final int[][] matrix = new int[m + 1][n + 1]; - - //filling the first row and first column values in the matrix + // filling the first row and first column values in the matrix for (int index = 0; index <= n; index++) { matrix[0][index] = index; } for (int index = 0; index <= m; index++) { matrix[index][0] = index; } - // fill in starting table values final int boundary = Math.min(n, threshold) + 1; for (int i = 0; i < boundary; i++) { @@ -276,27 +226,21 @@ public class LevenshteinDetailedDistance implements EditDistance<LevenshteinResu // stripe will be ignored in following loop iterations Arrays.fill(p, boundary, p.length, Integer.MAX_VALUE); Arrays.fill(d, Integer.MAX_VALUE); - // iterates through t for (int j = 1; j <= m; j++) { final E rightJ = right.at(j - 1); // jth character of right d[0] = j; - // compute stripe indices, constrain to array size final int min = Math.max(1, j - threshold); - final int max = j > Integer.MAX_VALUE - threshold ? n : Math.min( - n, j + threshold); - + final int max = j > Integer.MAX_VALUE - threshold ? n : Math.min(n, j + threshold); // the stripe may lead off of the table if s and t are of different sizes if (min > max) { return new LevenshteinResults(-1, 0, 0, 0); } - // ignore entry left of leftmost if (min > 1) { d[min - 1] = Integer.MAX_VALUE; } - // iterates through [min, max] in s for (int i = min; i <= max; i++) { if (left.at(i - 1).equals(rightJ)) { @@ -308,13 +252,11 @@ public class LevenshteinDetailedDistance implements EditDistance<LevenshteinResu } matrix[j][i] = d[i]; } - // copy current distance counts to 'previous row' distance counts tempD = p; p = d; d = tempD; } - // if p[n] is greater than the threshold, there's no guarantee on it being the correct distance if (p[n] <= threshold) { return findDetailedResults(left, right, matrix, swapped); @@ -325,15 +267,21 @@ public class LevenshteinDetailedDistance implements EditDistance<LevenshteinResu /** * Finds the Levenshtein distance between two Strings. * - * <p>A higher score indicates a greater distance.</p> + * <p> + * A higher score indicates a greater distance. + * </p> * - * <p>The previous implementation of the Levenshtein distance algorithm - * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p> + * <p> + * The previous implementation of the Levenshtein distance algorithm was from + * <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a> + * </p> * - * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError - * which can occur when my Java implementation is used with very large strings.<br> - * This implementation of the Levenshtein distance algorithm - * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p> + * <p> + * Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError which can occur when my Java implementation is used with very large + * strings.<br> + * This implementation of the Levenshtein distance algorithm is from + * <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a> + * </p> * * <pre> * unlimitedCompare(null, *) = IllegalArgumentException @@ -349,37 +297,29 @@ public class LevenshteinDetailedDistance implements EditDistance<LevenshteinResu * unlimitedCompare("hello", "hallo") = 1 * </pre> * - * @param <E> The type of similarity score unit. - * @param left the first CharSequence, must not be null - * @param right the second CharSequence, must not be null - * @return result distance, or -1 - * @throws IllegalArgumentException if either CharSequence input is {@code null} + * @param <E> The type of similarity score unit. + * @param left the first CharSequence, must not be null. + * @param right the second CharSequence, must not be null. + * @return result distance, or -1. + * @throws IllegalArgumentException if either CharSequence input is {@code null}. */ private static <E> LevenshteinResults unlimitedCompare(SimilarityInput<E> left, SimilarityInput<E> right) { if (left == null || right == null) { throw new IllegalArgumentException("CharSequences must not be null"); } - /* - The difference between this impl. and the previous is that, rather - than creating and retaining a matrix of size s.length() + 1 by t.length() + 1, - we maintain two single-dimensional arrays of length s.length() + 1. The first, d, - is the 'current working' distance array that maintains the newest distance cost - counts as we iterate through the characters of String s. Each time we increment - the index of String t we are comparing, d is copied to p, the second int[]. Doing so - allows us to retain the previous cost counts as required by the algorithm (taking - the minimum of the cost count to the left, up one, and diagonally up and to the left - of the current cost count being calculated). (Note that the arrays aren't really - copied anymore, just switched...this is clearly much better than cloning an array - or doing a System.arraycopy() each time through the outer loop.) - - Effectively, the difference between the two implementations is this one does not - cause an out of memory condition when calculating the LD over two very large strings. + * The difference between this impl. and the previous is that, rather than creating and retaining a matrix of size s.length() + 1 by t.length() + 1, we + * maintain two single-dimensional arrays of length s.length() + 1. The first, d, is the 'current working' distance array that maintains the newest + * distance cost counts as we iterate through the characters of String s. Each time we increment the index of String t we are comparing, d is copied to + * p, the second int[]. Doing so allows us to retain the previous cost counts as required by the algorithm (taking the minimum of the cost count to the + * left, up one, and diagonally up and to the left of the current cost count being calculated). (Note that the arrays aren't really copied anymore, just + * switched...this is clearly much better than cloning an array or doing a System.arraycopy() each time through the outer loop.) + * + * Effectively, the difference between the two implementations is this one does not cause an out of memory condition when calculating the LD over two + * very large strings. */ - int n = left.length(); // length of left int m = right.length(); // length of right - if (n == 0) { return new LevenshteinResults(m, m, 0, 0); } @@ -396,12 +336,10 @@ public class LevenshteinDetailedDistance implements EditDistance<LevenshteinResu m = right.length(); swapped = true; } - int[] p = new int[n + 1]; // 'previous' cost array, horizontally int[] d = new int[n + 1]; // cost array, horizontally - int[] tempD; //placeholder to assist in swapping p and d + int[] tempD; // placeholder to assist in swapping p and d final int[][] matrix = new int[m + 1][n + 1]; - // filling the first row and first column values in the matrix for (int index = 0; index <= n; index++) { matrix[0][index] = index; @@ -409,30 +347,24 @@ public class LevenshteinDetailedDistance implements EditDistance<LevenshteinResu for (int index = 0; index <= m; index++) { matrix[index][0] = index; } - // indexes into strings left and right int i; // iterates through left int j; // iterates through right - E rightJ; // jth character of right - int cost; // cost for (i = 0; i <= n; i++) { p[i] = i; } - for (j = 1; j <= m; j++) { rightJ = right.at(j - 1); d[0] = j; - for (i = 1; i <= n; i++) { cost = left.at(i - 1).equals(rightJ) ? 0 : 1; // minimum of cell to the left+1, to the top+1, diagonally left and up +cost d[i] = Math.min(Math.min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost); - //filling the matrix + // filling the matrix matrix[j][i] = d[i]; } - // copy current distance counts to 'previous row' distance counts tempD = p; p = d; @@ -448,8 +380,7 @@ public class LevenshteinDetailedDistance implements EditDistance<LevenshteinResu /** * <p> - * This returns the default instance that uses a version - * of the algorithm that does not use a threshold parameter. + * This returns the default instance that uses a version of the algorithm that does not use a threshold parameter. * </p> * * @see LevenshteinDetailedDistance#getDefaultInstance() @@ -463,7 +394,9 @@ public class LevenshteinDetailedDistance implements EditDistance<LevenshteinResu /** * If the threshold is not null, distance calculations will be limited to a maximum length. * - * <p>If the threshold is null, the unlimited version of the algorithm will be used.</p> + * <p> + * If the threshold is null, the unlimited version of the algorithm will be used. + * </p> * * @param threshold If this is null then distances calculations will not be limited. This may not be negative. */ @@ -477,15 +410,21 @@ public class LevenshteinDetailedDistance implements EditDistance<LevenshteinResu /** * Computes the Levenshtein distance between two Strings. * - * <p>A higher score indicates a greater distance.</p> + * <p> + * A higher score indicates a greater distance. + * </p> * - * <p>The previous implementation of the Levenshtein distance algorithm - * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p> + * <p> + * The previous implementation of the Levenshtein distance algorithm was from + * <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a> + * </p> * - * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError - * which can occur when my Java implementation is used with very large strings.<br> - * This implementation of the Levenshtein distance algorithm - * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p> + * <p> + * Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError which can occur when my Java implementation is used with very large + * strings.<br> + * This implementation of the Levenshtein distance algorithm is from + * <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a> + * </p> * * <pre> * distance.apply(null, *) = IllegalArgumentException @@ -501,10 +440,10 @@ public class LevenshteinDetailedDistance implements EditDistance<LevenshteinResu * distance.apply("hello", "hallo") = 1 * </pre> * - * @param left the first input, must not be null - * @param right the second input, must not be null - * @return result distance, or -1 - * @throws IllegalArgumentException if either String input {@code null} + * @param left the first input, must not be null. + * @param right the second input, must not be null. + * @return result distance, or -1. + * @throws IllegalArgumentException if either String input {@code null}. */ @Override public LevenshteinResults apply(final CharSequence left, final CharSequence right) { @@ -514,15 +453,21 @@ public class LevenshteinDetailedDistance implements EditDistance<LevenshteinResu /** * Computes the Levenshtein distance between two Strings. * - * <p>A higher score indicates a greater distance.</p> + * <p> + * A higher score indicates a greater distance. + * </p> * - * <p>The previous implementation of the Levenshtein distance algorithm - * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p> + * <p> + * The previous implementation of the Levenshtein distance algorithm was from + * <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a> + * </p> * - * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError - * which can occur when my Java implementation is used with very large strings.<br> - * This implementation of the Levenshtein distance algorithm - * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p> + * <p> + * Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError which can occur when my Java implementation is used with very large + * strings.<br> + * This implementation of the Levenshtein distance algorithm is from + * <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a> + * </p> * * <pre> * distance.apply(null, *) = IllegalArgumentException @@ -538,11 +483,11 @@ public class LevenshteinDetailedDistance implements EditDistance<LevenshteinResu * distance.apply("hello", "hallo") = 1 * </pre> * - * @param <E> The type of similarity score unit. - * @param left the first input, must not be null - * @param right the second input, must not be null - * @return result distance, or -1 - * @throws IllegalArgumentException if either String input {@code null} + * @param <E> The type of similarity score unit. + * @param left the first input, must not be null. + * @param right the second input, must not be null. + * @return result distance, or -1. + * @throws IllegalArgumentException if either String input {@code null}. * @since 1.13.0 */ public <E> LevenshteinResults apply(final SimilarityInput<E> left, final SimilarityInput<E> right) { @@ -555,7 +500,7 @@ public class LevenshteinDetailedDistance implements EditDistance<LevenshteinResu /** * Gets the distance threshold. * - * @return The distance threshold + * @return The distance threshold. */ public Integer getThreshold() { return threshold;
