Author: bayard Date: Mon Mar 13 22:09:27 2006 New Revision: 385745 URL: http://svn.apache.org/viewcvs?rev=385745&view=rev Log: Finally applying Chas Emerick's improved getLevenshtein implementation
Modified: jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/StringUtils.java Modified: jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/StringUtils.java URL: http://svn.apache.org/viewcvs/jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/StringUtils.java?rev=385745&r1=385744&r2=385745&view=diff ============================================================================== --- jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/StringUtils.java (original) +++ jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/StringUtils.java Mon Mar 13 22:09:27 2006 @@ -4711,8 +4711,13 @@ * another, where each change is a single character modification (deletion, * insertion or substitution).</p> * - * <p>This implementation of the Levenshtein distance algorithm - * is from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p> + * <p>The previous implementation of the Levenshtein distance algorithm + * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p> + * + * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError + * which can occur when my Java implementation is used with very large strings.<br> + * This implementation of the Levenshtein distance algorithm + * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p> * * <pre> * StringUtils.getLevenshteinDistance(null, *) = IllegalArgumentException @@ -4737,57 +4742,68 @@ if (s == null || t == null) { throw new IllegalArgumentException("Strings must not be null"); } - int d[][]; // matrix - int n; // length of s - int m; // length of t - int i; // iterates through s - int j; // iterates through t - char s_i; // ith character of s - char t_j; // jth character of t - int cost; // cost - // Step 1 - n = s.length(); - m = t.length(); + /* + The difference between this impl. and the previous is that, rather + than creating and retaining a matrix of size s.length()+1 by t.length()+1, + we maintain two single-dimensional arrays of length s.length()+1. The first, d, + is the 'current working' distance array that maintains the newest distance cost + counts as we iterate through the characters of String s. Each time we increment + the index of String t we are comparing, d is copied to p, the second int[]. Doing so + allows us to retain the previous cost counts as required by the algorithm (taking + the minimum of the cost count to the left, up one, and diagonally up and to the left + of the current cost count being calculated). (Note that the arrays aren't really + copied anymore, just switched...this is clearly much better than cloning an array + or doing a System.arraycopy() each time through the outer loop.) + + Effectively, the difference between the two implementations is this one does not + cause an out of memory condition when calculating the LD over two very large strings. + */ + + int n = s.length(); // length of s + int m = t.length(); // length of t + if (n == 0) { return m; - } - if (m == 0) { + } else if (m == 0) { return n; } - d = new int[n + 1][m + 1]; - // Step 2 - for (i = 0; i <= n; i++) { - d[i][0] = i; - } + int p[] = new int[n+1]; //'previous' cost array, horizontally + int d[] = new int[n+1]; // cost array, horizontally + int _d[]; //placeholder to assist in swapping p and d - for (j = 0; j <= m; j++) { - d[0][j] = j; - } + // indexes into strings s and t + int i; // iterates through s + int j; // iterates through t - // Step 3 - for (i = 1; i <= n; i++) { - s_i = s.charAt(i - 1); + char t_j; // jth character of t - // Step 4 - for (j = 1; j <= m; j++) { - t_j = t.charAt(j - 1); + int cost; // cost - // Step 5 - if (s_i == t_j) { - cost = 0; - } else { - cost = 1; - } + for (i = 0; i<=n; i++) { + p[i] = i; + } - // Step 6 - d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost); + for (j = 1; j<=m; j++) { + t_j = t.charAt(j-1); + d[0] = j; + + for (i=1; i<=n; i++) { + cost = s.charAt(i-1)==t_j ? 0 : 1; + // minimum of cell to the left+1, to the top+1, diagonally left and up +cost + d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1), p[i-1]+cost); } + + // copy current distance counts to 'previous row' distance counts + _d = p; + p = d; + d = _d; } - // Step 7 - return d[n][m]; + // our last action in the above loop was to switch d and p, so p now + // actually has the most recent cost counts + return p[n]; } /** @@ -4798,6 +4814,7 @@ * @param c value 3 * @return the smallest of the values */ +/* private static int min(int a, int b, int c) { // Method copied from NumberUtils to avoid dependency on subpackage if (b < a) { @@ -4808,5 +4825,6 @@ } return a; } +*/ } --------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]