XenoAmess commented on a change in pull request #233: URL: https://github.com/apache/commons-text/pull/233#discussion_r639807030
########## File path: src/main/java/org/apache/commons/text/similarity/LongestCommonSubsequence.java ########## @@ -58,7 +58,48 @@ public Integer apply(final CharSequence left, final CharSequence right) { if (left == null || right == null) { throw new IllegalArgumentException("Inputs must not be null"); } - return longestCommonSubsequence(left, right).length(); + // Find lengths of two strings + final int leftSz = left.length(); + final int rightSz = right.length(); + + // Check if we can save even more space + if (leftSz < rightSz) { + return calculateLCSLength(right, rightSz, left, leftSz); + } + return calculateLCSLength(left, leftSz, right, rightSz); + } + + // Assuming the sequence "left" is of size m and the sequence "right" is of size n, + // this method calculates the length of LCS the two sequences in O(m*n) time and O(n) space. + // Since LCS(S1, S2) = LCS(S2, S1), it is preferable to pass shorter sequence as "right," + // so that we can save even more space. + // This is an implementation of Hirschberg's algorithm: + // D. S. Hirschberg, "A Linear Space Algorithm for Computing Maximal Common Subsequences," Communications of the ACM, vol. 18, no. 6, pp. 341--343, 1975 + static int calculateLCSLength(final CharSequence left, final int m, + final CharSequence right, final int n) { + // We only need to keep track of last two rows + final int[][] dp = new int[2][1 + n]; Review comment: ``` if(n == 0){ return 0 } ``` run jmh to see if this worthy to add before this line. ########## File path: src/main/java/org/apache/commons/text/similarity/LongestCommonSubsequence.java ########## @@ -58,7 +58,48 @@ public Integer apply(final CharSequence left, final CharSequence right) { if (left == null || right == null) { throw new IllegalArgumentException("Inputs must not be null"); } - return longestCommonSubsequence(left, right).length(); + // Find lengths of two strings + final int leftSz = left.length(); + final int rightSz = right.length(); + + // Check if we can save even more space + if (leftSz < rightSz) { + return calculateLCSLength(right, rightSz, left, leftSz); + } + return calculateLCSLength(left, leftSz, right, rightSz); + } + + // Assuming the sequence "left" is of size m and the sequence "right" is of size n, + // this method calculates the length of LCS the two sequences in O(m*n) time and O(n) space. + // Since LCS(S1, S2) = LCS(S2, S1), it is preferable to pass shorter sequence as "right," + // so that we can save even more space. + // This is an implementation of Hirschberg's algorithm: + // D. S. Hirschberg, "A Linear Space Algorithm for Computing Maximal Common Subsequences," Communications of the ACM, vol. 18, no. 6, pp. 341--343, 1975 + static int calculateLCSLength(final CharSequence left, final int m, + final CharSequence right, final int n) { + // We only need to keep track of last two rows + final int[][] dp = new int[2][1 + n]; + + for (int i = 0; i <= m; i++) { + // We avoid calling virtual function charAt within nested loop + // for the sake of efficiency + final int leftCh = i > 0 ? left.charAt(i - 1) : -1; + // Binary index, used to index the current row + // The previous row can be calculated by subtracting currentRow from 1 + final int currentRow = i % 2; Review comment: ``` previousRow = currentRow; currentRow = 1 - currentRow; ``` no need to use % ########## File path: src/main/java/org/apache/commons/text/similarity/LongestCommonSubsequence.java ########## @@ -58,7 +58,48 @@ public Integer apply(final CharSequence left, final CharSequence right) { if (left == null || right == null) { throw new IllegalArgumentException("Inputs must not be null"); } - return longestCommonSubsequence(left, right).length(); + // Find lengths of two strings + final int leftSz = left.length(); + final int rightSz = right.length(); + + // Check if we can save even more space + if (leftSz < rightSz) { + return calculateLCSLength(right, rightSz, left, leftSz); + } + return calculateLCSLength(left, leftSz, right, rightSz); + } + + // Assuming the sequence "left" is of size m and the sequence "right" is of size n, + // this method calculates the length of LCS the two sequences in O(m*n) time and O(n) space. + // Since LCS(S1, S2) = LCS(S2, S1), it is preferable to pass shorter sequence as "right," + // so that we can save even more space. + // This is an implementation of Hirschberg's algorithm: + // D. S. Hirschberg, "A Linear Space Algorithm for Computing Maximal Common Subsequences," Communications of the ACM, vol. 18, no. 6, pp. 341--343, 1975 + static int calculateLCSLength(final CharSequence left, final int m, + final CharSequence right, final int n) { + // We only need to keep track of last two rows + final int[][] dp = new int[2][1 + n]; + + for (int i = 0; i <= m; i++) { + // We avoid calling virtual function charAt within nested loop + // for the sake of efficiency + final int leftCh = i > 0 ? left.charAt(i - 1) : -1; + // Binary index, used to index the current row + // The previous row can be calculated by subtracting currentRow from 1 + final int currentRow = i % 2; + final int previousRow = 1 - currentRow; + for (int j = 0; j <= n; j++) { Review comment: ``` final int[] dpTemp = dpPreviousRow dpPreviousRow=dpCurrentRow; dpCurrentRow=dpTemp; ``` run a jmh to see whether it be worthy to do this. ########## File path: src/main/java/org/apache/commons/text/similarity/LongestCommonSubsequence.java ########## @@ -58,7 +58,48 @@ public Integer apply(final CharSequence left, final CharSequence right) { if (left == null || right == null) { throw new IllegalArgumentException("Inputs must not be null"); } - return longestCommonSubsequence(left, right).length(); + // Find lengths of two strings + final int leftSz = left.length(); + final int rightSz = right.length(); + + // Check if we can save even more space + if (leftSz < rightSz) { + return calculateLCSLength(right, rightSz, left, leftSz); + } + return calculateLCSLength(left, leftSz, right, rightSz); + } + + // Assuming the sequence "left" is of size m and the sequence "right" is of size n, + // this method calculates the length of LCS the two sequences in O(m*n) time and O(n) space. + // Since LCS(S1, S2) = LCS(S2, S1), it is preferable to pass shorter sequence as "right," + // so that we can save even more space. + // This is an implementation of Hirschberg's algorithm: + // D. S. Hirschberg, "A Linear Space Algorithm for Computing Maximal Common Subsequences," Communications of the ACM, vol. 18, no. 6, pp. 341--343, 1975 + static int calculateLCSLength(final CharSequence left, final int m, + final CharSequence right, final int n) { + // We only need to keep track of last two rows + final int[][] dp = new int[2][1 + n]; + + for (int i = 0; i <= m; i++) { + // We avoid calling virtual function charAt within nested loop + // for the sake of efficiency + final int leftCh = i > 0 ? left.charAt(i - 1) : -1; + // Binary index, used to index the current row + // The previous row can be calculated by subtracting currentRow from 1 + final int currentRow = i % 2; + final int previousRow = 1 - currentRow; + for (int j = 0; j <= n; j++) { + if (i == 0 || j == 0) { + dp[currentRow][j] = 0; + } else if (leftCh == right.charAt(j - 1)) { + dp[currentRow][j] = dp[previousRow][j - 1] + 1; + } else { + dp[currentRow][j] = Math.max(dp[previousRow][j], dp[currentRow][j - 1]); + } + } + } + // Length of LCS is located at the last filled element of the dp array + return dp[m % 2][n]; Review comment: also no need to use % -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org