vgritsenko 2004/03/20 06:35:42 Modified: docs jakarta-regexp.jar src/java/org/apache/regexp RE.java RETest.java Log: Applied patch from Oleg Sukhodolsky: reduce code duplication, add method for character comparison. Revision Changes Path 1.2 +53 -64 jakarta-regexp/docs/jakarta-regexp.jar <<Binary file>> 1.22 +121 -143 jakarta-regexp/src/java/org/apache/regexp/RE.java Index: RE.java =================================================================== RCS file: /home/cvs/jakarta-regexp/src/java/org/apache/regexp/RE.java,v retrieving revision 1.21 retrieving revision 1.22 diff -u -r1.21 -r1.22 --- RE.java 27 Feb 2004 02:41:20 -0000 1.21 +++ RE.java 20 Mar 2004 14:35:42 -0000 1.22 @@ -121,14 +121,14 @@ * [:cntrl:] Control characters. * [:digit:] Numeric characters. * [:graph:] Characters that are printable and are also visible. - * (A space is printable, but not visible, while an + * (A space is printable, but not visible, while an * `a' is both.) * [:lower:] Lower-case alphabetic characters. - * [:print:] Printable characters (characters that are not + * [:print:] Printable characters (characters that are not * control characters.) * [:punct:] Punctuation characters (characters that are not letter, * digits, control characters, or space characters). - * [:space:] Space characters (such as space, tab, and formfeed, + * [:space:] Space characters (such as space, tab, and formfeed, * to name a few). * [:upper:] Upper-case alphabetic characters. * [:xdigit:] Characters that are hexadecimal digits. @@ -181,7 +181,7 @@ * AB Matches A followed by B * A|B Matches either A or B * (A) Used for subexpression grouping - * (?:A) Used for subexpression clustering (just like grouping but + * (?:A) Used for subexpression clustering (just like grouping but * no backrefs) * * @@ -411,6 +411,7 @@ * Constructs a regular expression matcher from a String by compiling it * using a new instance of RECompiler. If you will be compiling many * expressions, you may prefer to use a single RECompiler object instead. + * * @param pattern The regular expression pattern to compile. * @exception RESyntaxException Thrown if the regular expression has invalid syntax. * @see RECompiler @@ -425,6 +426,7 @@ * Constructs a regular expression matcher from a String by compiling it * using a new instance of RECompiler. If you will be compiling many * expressions, you may prefer to use a single RECompiler object instead. + * * @param pattern The regular expression pattern to compile. * @param matchFlags The matching style * @exception RESyntaxException Thrown if the regular expression has invalid syntax. @@ -441,15 +443,14 @@ * Construct a matcher for a pre-compiled regular expression from program * (bytecode) data. Permits special flags to be passed in to modify matching * behaviour. + * * @param program Compiled regular expression program (see RECompiler and/or recompile) * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*): * * <pre> - * * MATCH_NORMAL // Normal (case-sensitive) matching * MATCH_CASEINDEPENDENT // Case folded comparisons * MATCH_MULTILINE // Newline matches as BOL/EOL - * * </pre> * * @see RECompiler @@ -465,6 +466,7 @@ /** * Construct a matcher for a pre-compiled regular expression from program * (bytecode) data. + * * @param program Compiled regular expression program * @see RECompiler * @see recompile @@ -485,6 +487,7 @@ /** * Converts a 'simplified' regular expression to a full regular expression + * * @param pattern The pattern to convert * @return The full regular expression */ @@ -527,13 +530,10 @@ * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*): * * <pre> - * * MATCH_NORMAL // Normal (case-sensitive) matching * MATCH_CASEINDEPENDENT // Case folded comparisons * MATCH_MULTILINE // Newline matches as BOL/EOL - * * </pre> - * */ public void setMatchFlags(int matchFlags) { @@ -545,15 +545,12 @@ * @return Current match behaviour flags (RE.MATCH_*). * * <pre> - * * MATCH_NORMAL // Normal (case-sensitive) matching * MATCH_CASEINDEPENDENT // Case folded comparisons * MATCH_MULTILINE // Newline matches as BOL/EOL - * * </pre> * * @see #setMatchFlags - * */ public int getMatchFlags() { @@ -562,6 +559,7 @@ /** * Sets the current regular expression program used by this matcher object. + * * @param program Regular expression program compiled by RECompiler. * @see RECompiler * @see REProgram @@ -579,6 +577,7 @@ /** * Returns the current regular expression program in use by this matcher object. + * * @return Regular expression program * @see #setProgram */ @@ -589,6 +588,7 @@ /** * Returns the number of parenthesized subexpressions available after a successful match. + * * @return Number of available parenthesized subexpressions */ public int getParenCount() @@ -598,6 +598,7 @@ /** * Gets the contents of a parenthesized subexpression after a successful match. + * * @param which Nesting level of subexpression * @return String */ @@ -613,8 +614,9 @@ /** * Returns the start index of a given paren level. + * * @param which Nesting level of subexpression - * @return String index + * @return String index */ public final int getParenStart(int which) { @@ -624,13 +626,13 @@ { case 0: return start0; - + case 1: return start1; - + case 2: return start2; - + default: if (startn == null) { @@ -644,8 +646,9 @@ /** * Returns the end index of a given paren level. + * * @param which Nesting level of subexpression - * @return String index + * @return String index */ public final int getParenEnd(int which) { @@ -655,13 +658,13 @@ { case 0: return end0; - + case 1: return end1; - + case 2: return end2; - + default: if (endn == null) { @@ -675,6 +678,7 @@ /** * Returns the length of a given paren level. + * * @param which Nesting level of subexpression * @return Number of characters in the parenthesized subexpression */ @@ -689,6 +693,7 @@ /** * Sets the start of a paren level + * * @param which Which paren level * @param i Index in input array */ @@ -701,15 +706,15 @@ case 0: start0 = i; break; - + case 1: start1 = i; break; - + case 2: start2 = i; break; - + default: if (startn == null) { @@ -723,6 +728,7 @@ /** * Sets the end of a paren level + * * @param which Which paren level * @param i Index in input array */ @@ -735,15 +741,15 @@ case 0: end0 = i; break; - + case 1: end1 = i; break; - + case 2: end2 = i; break; - + default: if (endn == null) { @@ -759,6 +765,7 @@ * Throws an Error representing an internal error condition probably resulting * from a bug in the regular expression compiler (or possibly data corruption). * In practice, this should be very rare. + * * @param s Error description */ protected void internalError(String s) throws Error @@ -785,10 +792,11 @@ /** * Try to match a string against a subset of nodes in the program + * * @param firstNode Node to start at in program - * @param lastNode Last valid node (used for matching a subexpression without - * matching the rest of the program as well). - * @param idxStart Starting position in character array + * @param lastNode Last valid node (used for matching a subexpression without + * matching the rest of the program as well). + * @param idxStart Starting position in character array * @return Final input array index if match succeeded. -1 if not. */ protected int matchNodes(int firstNode, int lastNode, int idxStart) @@ -925,26 +933,14 @@ } // Case fold the backref? - if ((matchFlags & MATCH_CASEINDEPENDENT) != 0) - { - // Compare backref to input, case-folding as we go - for (int i = 0; i < l; i++) - { - if (Character.toLowerCase(search.charAt(idx++)) != Character.toLowerCase(search.charAt(s + i))) - { - return -1; - } - } - } - else + final boolean caseFold = + ((matchFlags & MATCH_CASEINDEPENDENT) != 0); + // Compare backref to input + for (int i = 0; i < l; i++) { - // Compare backref to input - for (int i = 0; i < l; i++) + if (compareChars(search.charAt(idx++), search.charAt(s + i), caseFold) != 0) { - if (search.charAt(idx++) != search.charAt(s + i)) - { - return -1; - } + return -1; } } } @@ -1096,24 +1092,14 @@ } // Match atom differently depending on casefolding flag - if ((matchFlags & MATCH_CASEINDEPENDENT) != 0) - { - for (int i = 0; i < lenAtom; i++) - { - if (Character.toLowerCase(search.charAt(idx++)) != Character.toLowerCase(instruction[startAtom + i])) - { - return -1; - } - } - } - else + final boolean caseFold = + ((matchFlags & MATCH_CASEINDEPENDENT) != 0); + + for (int i = 0; i < lenAtom; i++) { - for (int i = 0; i < lenAtom; i++) + if (compareChars(search.charAt(idx++), instruction[startAtom + i], caseFold) != 0) { - if (search.charAt(idx++) != instruction[startAtom + i]) - { - return -1; - } + return -1; } } } @@ -1126,7 +1112,7 @@ { return -1; } - + switch (opdata) { case POSIX_CLASS_ALNUM: @@ -1135,42 +1121,42 @@ return -1; } break; - + case POSIX_CLASS_ALPHA: if (!Character.isLetter(search.charAt(idx))) { return -1; } break; - + case POSIX_CLASS_DIGIT: if (!Character.isDigit(search.charAt(idx))) { return -1; } break; - + case POSIX_CLASS_BLANK: // JWL - bugbug: is this right?? if (!Character.isSpaceChar(search.charAt(idx))) { return -1; } break; - + case POSIX_CLASS_SPACE: if (!Character.isWhitespace(search.charAt(idx))) { return -1; } break; - + case POSIX_CLASS_CNTRL: if (Character.getType(search.charAt(idx)) != Character.CONTROL) { return -1; } break; - + case POSIX_CLASS_GRAPH: // JWL - bugbug??? switch (Character.getType(search.charAt(idx))) { @@ -1179,33 +1165,33 @@ case Character.MODIFIER_SYMBOL: case Character.OTHER_SYMBOL: break; - + default: return -1; } break; - + case POSIX_CLASS_LOWER: if (Character.getType(search.charAt(idx)) != Character.LOWERCASE_LETTER) { return -1; } break; - + case POSIX_CLASS_UPPER: if (Character.getType(search.charAt(idx)) != Character.UPPERCASE_LETTER) { return -1; } break; - + case POSIX_CLASS_PRINT: if (Character.getType(search.charAt(idx)) == Character.CONTROL) { return -1; } break; - + case POSIX_CLASS_PUNCT: { int type = Character.getType(search.charAt(idx)); @@ -1217,7 +1203,7 @@ case Character.CONNECTOR_PUNCTUATION: case Character.OTHER_PUNCTUATION: break; - + default: return -1; } @@ -1235,14 +1221,14 @@ } } break; - + case POSIX_CLASS_JSTART: if (!Character.isJavaIdentifierStart(search.charAt(idx))) { return -1; } break; - + case POSIX_CLASS_JPART: if (!Character.isJavaIdentifierPart(search.charAt(idx))) { @@ -1254,7 +1240,7 @@ internalError("Bad posix class"); break; } - + // Matched. idx++; } @@ -1271,34 +1257,18 @@ // Get character to match against character class and maybe casefold char c = search.charAt(idx); boolean caseFold = (matchFlags & MATCH_CASEINDEPENDENT) != 0; - if (caseFold) - { - c = Character.toLowerCase(c); - } - // Loop through character class checking our match character int idxRange = node + nodeSize; int idxEnd = idxRange + (opdata * 2); boolean match = false; - for (int i = idxRange; i < idxEnd; ) + for (int i = idxRange; !match && i < idxEnd; ) { // Get start, end and match characters char s = instruction[i++]; char e = instruction[i++]; - // Fold ends of range and match character - if (caseFold) - { - s = Character.toLowerCase(s); - e = Character.toLowerCase(e); - } - - // If the match character is in range, break out - if (c >= s && c <= e) - { - match = true; - break; - } + match = ((compareChars(c, s, caseFold) >= 0) + && (compareChars(c, e, caseFold) <= 0)); } // Fail if we didn't match the character class @@ -1329,7 +1299,7 @@ { return idxNew; } - + // Go to next branch (if any) nextBranch = (short)instruction[node + offsetNext]; node += nextBranch; @@ -1371,6 +1341,7 @@ * Match the current regular expression program against the current * input string, starting at index i of the input string. This method * is only meant for internal use. + * * @param i The input string index to start matching at * @return True if the input matched the expression */ @@ -1411,11 +1382,12 @@ /** * Matches the current regular expression program against a character array, * starting at a given index. + * * @param search String to match against * @param i Index to start searching at * @return True if string matched */ - public boolean match(String search, int i) + public boolean match(String search, int i) { return match(new StringCharacterIterator(search), i); } @@ -1423,6 +1395,7 @@ /** * Matches the current regular expression program against a character array, * starting at a given index. + * * @param search String to match against * @param i Index to start searching at * @return True if string matched @@ -1459,44 +1432,25 @@ // Prefix-anchored matching is possible boolean caseIndependent = (matchFlags & MATCH_CASEINDEPENDENT) != 0; char[] prefix = program.prefix; - for ( ;! search.isEnd(i + prefix.length - 1); i++) + for ( ; !search.isEnd(i + prefix.length - 1); i++) { - // If the first character of the prefix matches - boolean match = false; - if (caseIndependent) - match = Character.toLowerCase(search.charAt(i)) == Character.toLowerCase(prefix[0]); - else - match = search.charAt(i) == prefix[0]; - if (match) - { - // Save first character position - int firstChar = i++; - int k; - for (k = 1; k < prefix.length; ) - { - // If there's a mismatch of any character in the prefix, give up - if (caseIndependent) - match = Character.toLowerCase(search.charAt(i++)) == Character.toLowerCase(prefix[k++]); - else - match = search.charAt(i++) == prefix[k++]; - if (!match) - { - break; - } - } + int j = i; + int k = 0; - // See if the whole prefix string matched - if (k == prefix.length) + boolean match; + do { + // If there's a mismatch of any character in the prefix, give up + match = (compareChars(search.charAt(j++), prefix[k++], caseIndependent) == 0); + } while (match && k < prefix.length); + + // See if the whole prefix string matched + if (k == prefix.length) + { + // We matched the full prefix at firstChar, so try it + if (matchAt(i)) { - // We matched the full prefix at firstChar, so try it - if (matchAt(firstChar)) - { - return true; - } + return true; } - - // Match failed, reset i to continue the search - i = firstChar; } } return false; @@ -1505,6 +1459,7 @@ /** * Matches the current regular expression program against a String. + * * @param search String to match against * @return True if string matched */ @@ -1520,7 +1475,7 @@ * "xyzzyababbayyzabbbab123", the result would be the array of Strings * "[xyzzy, yyz, 123]". * - * Please note that the first string in the resulting array may be an empty + * <p>Please note that the first string in the resulting array may be an empty * string. This happens when the very first character of input string is * matched by the pattern. * @@ -1620,7 +1575,7 @@ * with $0, $1, ... $9. A regular expression of "http://[\\.\\w\\-\\?/~_@&=%]+", * a String to substituteIn of "visit us: http://www.apache.org!" and the * substitution String "<a href=\"$0\">$0</a>", the resulting String - * returned by subst would be + * returned by subst would be * "visit us: <a href=\"http://www.apache.org\">http://www.apache.org</a>!". * <p> * <i>Note:</i> $0 represents the whole match. @@ -1705,7 +1660,7 @@ // Move forward, skipping past match int newpos = getParenEnd(0); - // We always want to make progress! + // We always want to make progress! if (newpos == pos) { newpos++; @@ -1727,16 +1682,17 @@ ret.append(substituteIn.substring(pos)); } - // Return string buffer as string + // Return string buffer as string return ret.toString(); - } + } /** * Returns an array of Strings, whose toString representation matches a regular * expression. This method works like the Perl function of the same name. Given * a regular expression of "a*b" and an array of String objects of [foo, aab, zzz, * aaaab], the array of Strings returned by grep would be [aab, aaaab]. - * @param search Array of Objects to search + * + * @param search Array of Objects to search * @return Array of Strings whose toString() value matches this regular expression. */ public String[] grep(Object[] search) @@ -1763,8 +1719,11 @@ return ret; } - /** @return true if at the i-th position in the 'search' a newline ends */ - private boolean isNewline(int i) { + /** + * @return true if character at i-th position in the <code>search</code> string is a newline + */ + private boolean isNewline(int i) + { char nextChar = search.charAt(i); if (nextChar == '\n' || nextChar == '\r' || nextChar == '\u0085' @@ -1774,5 +1733,24 @@ } return false; + } + + /** + * Compares two characters. + * + * @param c1 first character to compare. + * @param c2 second character to compare. + * @param caseIndependent whether comparision is case insensitive or not. + * @return negative, 0, or positive integer as the first character + * less than, equal to, or greater then the second. + */ + private int compareChars(char c1, char c2, boolean caseIndependent) + { + if (caseIndependent) + { + c1 = Character.toLowerCase(c1); + c2 = Character.toLowerCase(c2); + } + return ((int)c1 - (int)c2); } } 1.14 +24 -4 jakarta-regexp/src/java/org/apache/regexp/RETest.java Index: RETest.java =================================================================== RCS file: /home/cvs/jakarta-regexp/src/java/org/apache/regexp/RETest.java,v retrieving revision 1.13 retrieving revision 1.14 diff -u -r1.13 -r1.14 --- RETest.java 27 Feb 2004 02:41:20 -0000 1.13 +++ RETest.java 20 Mar 2004 14:35:42 -0000 1.14 @@ -358,6 +358,26 @@ showParens(r); } + r = new RE("(A*)b\\1"); + r.setMatchFlags(RE.MATCH_CASEINDEPENDENT); + if (!r.match("AaAaaaBAAAAAA")) + { + fail("Did not match 'AaAaaaBAAAAAA'."); + } else { + say("AaAaaaBAAAAAA = true"); + showParens(r); + } + + r = new RE("[A-Z]*"); + r.setMatchFlags(RE.MATCH_CASEINDEPENDENT); + if (!r.match("CaBgDe12")) + { + fail("Did not match 'CaBgDe12'."); + } else { + say("CaBgDe12 = true"); + showParens(r); + } + // Test MATCH_MULTILINE. Test for eol/bol symbols. r = new RE("^abc$", RE.MATCH_MULTILINE); if (!r.match("\nabc")) { @@ -602,7 +622,7 @@ boolean shouldMatch = false; int expectedParenCount = 0; String[] expectedParens = null; - + if (!badPattern) { shouldMatch = getExpectedResult(br.readLine().trim()); if (shouldMatch) { @@ -769,7 +789,7 @@ private boolean checkParens() { // Show subexpression registers - if (test.showSuccesses) + if (RETest.showSuccesses) { test.showParens(regexp); } @@ -850,7 +870,7 @@ */ void success(String s) { - if (test.showSuccesses) + if (RETest.showSuccesses) { test.say("" + RETest.NEW_LINE + "-----------------------" + RETest.NEW_LINE + ""); test.say("Expression #" + (number) + " \"" + pattern + "\" ");
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]