Hi!! I investigated Perl5.6's [:punct:], [:graph:] and [:print:] in more detail. (ORO's goal is compatibility with Perl5.6.) Currently, the characters ORO matches are a little different from Perl5.6's. I compared Perl5.6's matching results with Java's Unicode general categories (the values returned by Character.getType()). [:print:] - Characters in the following categories do NOT match Perl5.6's [:print:]: Character.CONTROL Character.FORMAT Character.SURROGATE Character.PRIVATE_USE All others match. - Some Character.UNASSIGNED characters also match [:print:], but we can ignore them because they are not defined characters in Unicode. (These characters are used only for special purposes.) [:graph:] [:graph:] matches the same characters as [:print:] except U+0020. [:punct:] - Characters in the following categories match Perl5.6's [:punct:]: Character.DASH_PUNCTUATION Character.START_PUNCTUATION Character.END_PUNCTUATION Character.CONNECTOR_PUNCTUATION - Character.OTHER_PUNCTUATION characters also match, except for the following characters, which do NOT: U+0374, U+0375, U+0E2F, U+0EAF, U+3006 I have attached a patch that implements the above results. However, it makes the code a little tricky, and users may not require such detailed compatibility. Is it nitpicking? ;) Regards. --------------------- Takashi Okamoto --- Perl5Matcher.java.orig Sat Mar 31 23:01:12 2001 +++ Perl5Matcher.java Sun Apr 1 11:33:55 2001 @@ -682,25 +682,58 @@ if(Character.isUpperCase(code)) return isANYOF; break; case OpCode._PRINT: - if(Character.isSpaceChar(code)) return isANYOF; - // Fall through to check if the character is alphanumeric, - // or a punctuation mark. Printable characters are either - // alphanumeric, punctuation marks, or spaces. + switch( Character.getType(code) ) { + // Following unicode blocks do NOT match [:print:]. + case Character.UNASSIGNED: + case Character.CONTROL: + case Character.FORMAT: + case Character.SURROGATE: + case Character.PRIVATE_USE: + break; + default: + // Others match. 
+ return isANYOF; + } case OpCode._GRAPH: - if(Character.isLetterOrDigit(code)) return isANYOF; - // Fall through to check if the character is a punctuation mark. - // Graph characters are either alphanumeric or punctuation. + switch ( Character.getType(code) ) { + // Following unicode blocks do NOT match [:graph:]. + case Character.UNASSIGNED: + case Character.CONTROL: + case Character.FORMAT: + case Character.SURROGATE: + case Character.PRIVATE_USE: + break; + default: + // Others match except U+0020. + if ( code != 0x0020 ) + return isANYOF; + break; + } case OpCode._PUNCT: switch ( Character.getType(code) ) { + // Following unicode blocks match [:punct:]. case Character.DASH_PUNCTUATION: case Character.START_PUNCTUATION: case Character.END_PUNCTUATION: case Character.CONNECTOR_PUNCTUATION: - case Character.OTHER_PUNCTUATION: return isANYOF; - default: - break; + case Character.OTHER_PUNCTUATION: + switch ( code ) { + // following OTHER_PUNCTUATION characters don't match + // Perl5.6's [:punct:] + case 0x0374: + case 0x0375: + case 0x0e2f: + case 0x0eaf: + case 0x3006: + break; + default: + // other OTHER_PUNCTUATION characters match. + return isANYOF; } + default: + break; + } break; case OpCode._XDIGIT: if( (code >= '0' && code <= '9') ||

Reply via email to