bayard 2002/11/18 00:46:02 Modified: codec/src/java/org/apache/commons/codec Metaphone.java Soundex.java Added: codec/src/java/org/apache/commons/codec Encoder.java EncoderComparator.java RefinedSoundex.java Removed: codec/src/java/org/apache/commons/codec SoundexComparator.java Log: A common interface added, a new algorithm added [refined soundex] and a generic comparator introduced. Submitted-by: "O'brien, Tim" <[EMAIL PROTECTED]> Revision Changes Path 1.3 +320 -293 jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/Metaphone.java Index: Metaphone.java =================================================================== RCS file: /home/cvs/jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/Metaphone.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- Metaphone.java 21 Oct 2002 19:05:19 -0000 1.2 +++ Metaphone.java 18 Nov 2002 08:46:02 -0000 1.3 @@ -1,293 +1,320 @@ -// Permission given by wbrogden for code to be used anywhere. -package org.apache.commons.codec; - - -/* ==================================================================== - * The Apache Software License, Version 1.1 - * - * Copyright (c) 2001 The Apache Software Foundation. All rights - * reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. The end-user documentation included with the redistribution, - * if any, must include the following acknowledgment: - * "This product includes software developed by the - * Apache Software Foundation (http://www.apache.org/)." 
- * Alternately, this acknowledgment may appear in the software itself, - * if and wherever such third-party acknowledgments normally appear. - * - * 4. The names "Apache" and "Apache Software Foundation" and - * "Apache Commons" must not be used to endorse or promote products - * derived from this software without prior written permission. For - * written permission, please contact [EMAIL PROTECTED] - * - * 5. Products derived from this software may not be called "Apache", - * "Apache Turbine", nor may "Apache" appear in their name, without - * prior written permission of the Apache Software Foundation. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF - * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * ==================================================================== - * - * This software consists of voluntary contributions made by many - * individuals on behalf of the Apache Software Foundation. For more - * information on the Apache Software Foundation, please see - * <http://www.apache.org/>. - */ - -/* Metaphone.java - * A class to generate phonetic code and keep lists of objects - * retrievable by a phonetic code. - * reference: Computer Language of Dec. 1990, p 39 - * "Hanging on the Metaphone" by Lawrence Philips - * - * This Java implementation, William B. Brogden. 
December, 1997 - * - * @author [EMAIL PROTECTED] - */ - -/* - * List functionality removed: 2001-06-21 [EMAIL PROTECTED] - */ - -/* - * Notes: - * The static method metaphone converts an input String into a code. - * All input is converted to upper case. - * Limitations: Input format is expected to be a single ASCII word - * with only characters in the A - Z range, no punctuation or numbers. - * - */ - -class Metaphone { - - static String vowels = "AEIOU" ; - static String frontv = "EIY" ; - static String varson = "CSPTG" ; - - static final int maxCodeLen = 4 ; - - /** - * Are the metaphones of two strings the same. - */ - static public boolean isMetaphoneEqual(String str1, String str2) { - return metaphone(str1).equals(metaphone(str2)); - } - - /** - * Find the metaphone value of a String. This is similar to the - * soundex algorithm, but better at finding similar sounding words. - */ - static public String metaphone( String txt ){ - int mtsz = 0 ; - boolean hard = false ; - if(( txt == null ) || - ( txt.length() == 0 )) return "" ; - // single character is itself - if( txt.length() == 1 ) return txt.toUpperCase() ; - - char[] inwd = txt.toUpperCase().toCharArray() ; - - String tmpS ; - StringBuffer local = new StringBuffer( 40 ); // manipulate - StringBuffer code = new StringBuffer( 10 ) ; // output - // handle initial 2 characters exceptions - switch( inwd[0] ){ - case 'K': case 'G' : case 'P' : /* looking for KN, etc*/ - if( inwd[1] == 'N')local.append(inwd, 1, inwd.length - 1 ); - else local.append( inwd ); - break; - case 'A': /* looking for AE */ - if( inwd[1] == 'E' )local.append(inwd, 1, inwd.length - 1 ); - else local.append( inwd ); - break; - case 'W' : /* looking for WR or WH */ - if( inwd[1] == 'R' ){ // WR -> R - local.append(inwd, 1, inwd.length - 1 ); break ; - } - if( inwd[1] == 'H'){ - local.append(inwd, 1, inwd.length - 1 ); - local.setCharAt( 0,'W'); // WH -> W - } - else local.append( inwd ); - break; - case 'X' : /* initial X becomes S */ - 
inwd[0] = 'S' ;local.append( inwd ); - break ; - default : - local.append( inwd ); - } // now local has working string with initials fixed - int wdsz = local.length(); - int n = 0 ; - while((mtsz < maxCodeLen ) && // max code size of 4 works well - (n < wdsz ) ){ - char symb = local.charAt(n) ; - // remove duplicate letters except C - if(( symb != 'C' ) && - (n > 0 ) && ( local.charAt(n - 1 ) == symb )) n++ ; - else{ // not dup - switch( symb ){ - case 'A' : case 'E' : case 'I' : case 'O' : case 'U' : - if( n == 0 ) { code.append(symb );mtsz++; - } - break ; // only use vowel if leading char - case 'B' : - if( (n > 0 ) && - !(n + 1 == wdsz ) && // not MB at end of word - ( local.charAt(n - 1) == 'M')) { - code.append(symb); - } - else code.append(symb); - mtsz++ ; - break ; - case 'C' : // lots of C special cases - /* discard if SCI, SCE or SCY */ - if( ( n > 0 ) && - ( local.charAt(n-1) == 'S' ) && - ( n + 1 < wdsz ) && - ( frontv.indexOf( local.charAt(n + 1)) >= 0 )){ break ;} - tmpS = local.toString(); - if( tmpS.indexOf("CIA", n ) == n ) { // "CIA" -> X - code.append('X' ); mtsz++; break ; - } - if( ( n + 1 < wdsz ) && - (frontv.indexOf( local.charAt(n+1) )>= 0 )){ - code.append('S');mtsz++; break ; // CI,CE,CY -> S - } - if(( n > 0) && - ( tmpS.indexOf("SCH",n-1 )== n-1 )){ // SCH->sk - code.append('K') ; mtsz++;break ; - } - if( tmpS.indexOf("CH", n ) == n ){ // detect CH - if((n == 0 ) && - (wdsz >= 3 ) && // CH consonant -> K consonant - (vowels.indexOf( local.charAt( 2) ) < 0 )){ - code.append('K'); - } - else { code.append('X'); // CHvowel -> X - } - mtsz++; - } - else { code.append('K' );mtsz++; - } - break ; - case 'D' : - if(( n + 2 < wdsz )&& // DGE DGI DGY -> J - ( local.charAt(n+1) == 'G' )&& - (frontv.indexOf( local.charAt(n+2) )>= 0)){ - code.append('J' ); n += 2 ; - } - else { code.append( 'T' ); - } - mtsz++; - break ; - case 'G' : // GH silent at end or before consonant - if(( n + 2 == wdsz )&& - (local.charAt(n+1) == 'H' )) break ; - if(( n + 
2 < wdsz ) && - (local.charAt(n+1) == 'H' )&& - (vowels.indexOf( local.charAt(n+2)) < 0 )) break ; - tmpS = local.toString(); - if((n > 0) && - ( tmpS.indexOf("GN", n ) == n)|| - ( tmpS.indexOf("GNED",n) == n )) break ; // silent G - if(( n > 0 ) && - (local.charAt(n-1) == 'G')) hard = true ; - else hard = false ; - if((n+1 < wdsz) && - (frontv.indexOf( local.charAt(n+1) ) >= 0 )&& - (!hard) ) code.append( 'J' ); - else code.append('K'); - mtsz++; - break ; - case 'H': - if( n + 1 == wdsz ) break ; // terminal H - if((n > 0) && - (varson.indexOf( local.charAt(n-1)) >= 0)) break ; - if( vowels.indexOf( local.charAt(n+1)) >=0 ){ - code.append('H') ; mtsz++;// Hvowel - } - break; - case 'F': case 'J' : case 'L' : - case 'M': case 'N' : case 'R' : - code.append( symb ); mtsz++; break ; - case 'K' : - if( n > 0 ){ // not initial - if( local.charAt( n -1) != 'C' ) { - code.append(symb ); - } - } - else code.append( symb ); // initial K - mtsz++ ; - break ; - case 'P' : - if((n + 1 < wdsz) && // PH -> F - (local.charAt( n+1) == 'H'))code.append('F'); - else code.append( symb ); - mtsz++; - break ; - case 'Q' : - code.append('K' );mtsz++; break ; - case 'S' : - tmpS = local.toString(); - if((tmpS.indexOf("SH", n )== n) || - (tmpS.indexOf("SIO",n )== n) || - (tmpS.indexOf("SIA",n )== n)) code.append('X'); - else code.append( 'S' ); - mtsz++ ; - break ; - case 'T' : - tmpS = local.toString(); // TIA TIO -> X - if((tmpS.indexOf("TIA",n )== n)|| - (tmpS.indexOf("TIO",n )== n) ){ - code.append('X'); mtsz++; break; - } - if( tmpS.indexOf("TCH",n )==n) break; - // substitute numeral 0 for TH (resembles theta after all) - if( tmpS.indexOf("TH", n )==n) code.append('0'); - else code.append( 'T' ); - mtsz++ ; - break ; - case 'V' : - code.append('F'); mtsz++;break ; - case 'W' : case 'Y' : // silent if not followed by vowel - if((n+1 < wdsz) && - (vowels.indexOf( local.charAt(n+1))>=0)){ - code.append( symb );mtsz++; - } - break ; - case 'X' : - code.append('K'); 
code.append('S');mtsz += 2; - break ; - case 'Z' : - code.append('S'); mtsz++; break ; - } // end switch - n++ ; - } // end else from symb != 'C' - if( mtsz > 4 )code.setLength( 4); - } - return code.toString(); - } // end static method metaPhone() - -} +// Permission given by wbrogden for code to be used anywhere. +package org.apache.commons.codec; + + +/* ==================================================================== + * The Apache Software License, Version 1.1 + * + * Copyright (c) 2001 The Apache Software Foundation. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. The end-user documentation included with the redistribution, + * if any, must include the following acknowledgment: + * "This product includes software developed by the + * Apache Software Foundation (http://www.apache.org/)." + * Alternately, this acknowledgment may appear in the software itself, + * if and wherever such third-party acknowledgments normally appear. + * + * 4. The names "Apache" and "Apache Software Foundation" and + * "Apache Commons" must not be used to endorse or promote products + * derived from this software without prior written permission. For + * written permission, please contact [EMAIL PROTECTED] + * + * 5. Products derived from this software may not be called "Apache", + * "Apache Turbine", nor may "Apache" appear in their name, without + * prior written permission of the Apache Software Foundation. 
+ * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * <http://www.apache.org/>. + */ + +/* Metaphone.java + * A class to generate phonetic code and keep lists of objects + * retrievable by a phonetic code. + * reference: Computer Language of Dec. 1990, p 39 + * "Hanging on the Metaphone" by Lawrence Philips + * + * This Java implementation, William B. Brogden. December, 1997 + * + * @author [EMAIL PROTECTED] + * @author [EMAIL PROTECTED] + */ + +/* + * List functionality removed: 2001-06-21 [EMAIL PROTECTED] + */ + +/* + * Notes: + * The static method metaphone converts an input String into a code. + * All input is converted to upper case. + * Limitations: Input format is expected to be a single ASCII word + * with only characters in the A - Z range, no punctuation or numbers. 
+ * + */ + +public class Metaphone implements Encoder { + + private String vowels = "AEIOU" ; + private String frontv = "EIY" ; + private String varson = "CSPTG" ; + + private int maxCodeLen = 4 ; + + public Metaphone() { + super(); + } + + + /** + * Find the metaphone value of a String. This is similar to the + * soundex algorithm, but better at finding similar sounding words. + */ + public String metaphone( String txt ){ + int mtsz = 0 ; + boolean hard = false ; + if(( txt == null ) || + ( txt.length() == 0 )) return "" ; + // single character is itself + if( txt.length() == 1 ) return txt.toUpperCase() ; + + char[] inwd = txt.toUpperCase().toCharArray() ; + + String tmpS ; + StringBuffer local = new StringBuffer( 40 ); // manipulate + StringBuffer code = new StringBuffer( 10 ) ; // output + // handle initial 2 characters exceptions + switch( inwd[0] ){ + case 'K': case 'G' : case 'P' : /* looking for KN, etc*/ + if( inwd[1] == 'N')local.append(inwd, 1, inwd.length - 1 ); + else local.append( inwd ); + break; + case 'A': /* looking for AE */ + if( inwd[1] == 'E' )local.append(inwd, 1, inwd.length - 1 ); + else local.append( inwd ); + break; + case 'W' : /* looking for WR or WH */ + if( inwd[1] == 'R' ){ // WR -> R + local.append(inwd, 1, inwd.length - 1 ); break ; + } + if( inwd[1] == 'H'){ + local.append(inwd, 1, inwd.length - 1 ); + local.setCharAt( 0,'W'); // WH -> W + } + else local.append( inwd ); + break; + case 'X' : /* initial X becomes S */ + inwd[0] = 'S' ;local.append( inwd ); + break ; + default : + local.append( inwd ); + } // now local has working string with initials fixed + int wdsz = local.length(); + int n = 0 ; + while((mtsz < maxCodeLen ) && // max code size of 4 works well + (n < wdsz ) ){ + char symb = local.charAt(n) ; + // remove duplicate letters except C + if(( symb != 'C' ) && + (n > 0 ) && ( local.charAt(n - 1 ) == symb )) n++ ; + else{ // not dup + switch( symb ){ + case 'A' : case 'E' : case 'I' : case 'O' : case 'U' : + if( n == 0 ) 
{ code.append(symb );mtsz++; + } + break ; // only use vowel if leading char + case 'B' : + if( (n > 0 ) && + !(n + 1 == wdsz ) && // not MB at end of word + ( local.charAt(n - 1) == 'M')) { + code.append(symb); + } + else code.append(symb); + mtsz++ ; + break ; + case 'C' : // lots of C special cases + /* discard if SCI, SCE or SCY */ + if( ( n > 0 ) && + ( local.charAt(n-1) == 'S' ) && + ( n + 1 < wdsz ) && + ( frontv.indexOf( local.charAt(n + 1)) >= 0 )){ break ;} + tmpS = local.toString(); + if( tmpS.indexOf("CIA", n ) == n ) { // "CIA" -> X + code.append('X' ); mtsz++; break ; + } + if( ( n + 1 < wdsz ) && + (frontv.indexOf( local.charAt(n+1) )>= 0 )){ + code.append('S');mtsz++; break ; // CI,CE,CY -> S + } + if(( n > 0) && + ( tmpS.indexOf("SCH",n-1 )== n-1 )){ // SCH->sk + code.append('K') ; mtsz++;break ; + } + if( tmpS.indexOf("CH", n ) == n ){ // detect CH + if((n == 0 ) && + (wdsz >= 3 ) && // CH consonant -> K consonant + (vowels.indexOf( local.charAt( 2) ) < 0 )){ + code.append('K'); + } + else { code.append('X'); // CHvowel -> X + } + mtsz++; + } + else { code.append('K' );mtsz++; + } + break ; + case 'D' : + if(( n + 2 < wdsz )&& // DGE DGI DGY -> J + ( local.charAt(n+1) == 'G' )&& + (frontv.indexOf( local.charAt(n+2) )>= 0)){ + code.append('J' ); n += 2 ; + } + else { code.append( 'T' ); + } + mtsz++; + break ; + case 'G' : // GH silent at end or before consonant + if(( n + 2 == wdsz )&& + (local.charAt(n+1) == 'H' )) break ; + if(( n + 2 < wdsz ) && + (local.charAt(n+1) == 'H' )&& + (vowels.indexOf( local.charAt(n+2)) < 0 )) break ; + tmpS = local.toString(); + if((n > 0) && + ( tmpS.indexOf("GN", n ) == n)|| + ( tmpS.indexOf("GNED",n) == n )) break ; // silent G + if(( n > 0 ) && + (local.charAt(n-1) == 'G')) hard = true ; + else hard = false ; + if((n+1 < wdsz) && + (frontv.indexOf( local.charAt(n+1) ) >= 0 )&& + (!hard) ) code.append( 'J' ); + else code.append('K'); + mtsz++; + break ; + case 'H': + if( n + 1 == wdsz ) break ; // terminal H + 
if((n > 0) && + (varson.indexOf( local.charAt(n-1)) >= 0)) break ; + if( vowels.indexOf( local.charAt(n+1)) >=0 ){ + code.append('H') ; mtsz++;// Hvowel + } + break; + case 'F': case 'J' : case 'L' : + case 'M': case 'N' : case 'R' : + code.append( symb ); mtsz++; break ; + case 'K' : + if( n > 0 ){ // not initial + if( local.charAt( n -1) != 'C' ) { + code.append(symb ); + } + } + else code.append( symb ); // initial K + mtsz++ ; + break ; + case 'P' : + if((n + 1 < wdsz) && // PH -> F + (local.charAt( n+1) == 'H'))code.append('F'); + else code.append( symb ); + mtsz++; + break ; + case 'Q' : + code.append('K' );mtsz++; break ; + case 'S' : + tmpS = local.toString(); + if((tmpS.indexOf("SH", n )== n) || + (tmpS.indexOf("SIO",n )== n) || + (tmpS.indexOf("SIA",n )== n)) code.append('X'); + else code.append( 'S' ); + mtsz++ ; + break ; + case 'T' : + tmpS = local.toString(); // TIA TIO -> X + if((tmpS.indexOf("TIA",n )== n)|| + (tmpS.indexOf("TIO",n )== n) ){ + code.append('X'); mtsz++; break; + } + if( tmpS.indexOf("TCH",n )==n) break; + // substitute numeral 0 for TH (resembles theta after all) + if( tmpS.indexOf("TH", n )==n) code.append('0'); + else code.append( 'T' ); + mtsz++ ; + break ; + case 'V' : + code.append('F'); mtsz++;break ; + case 'W' : case 'Y' : // silent if not followed by vowel + if((n+1 < wdsz) && + (vowels.indexOf( local.charAt(n+1))>=0)){ + code.append( symb );mtsz++; + } + break ; + case 'X' : + code.append('K'); code.append('S');mtsz += 2; + break ; + case 'Z' : + code.append('S'); mtsz++; break ; + } // end switch + n++ ; + } // end else from symb != 'C' + if( mtsz > 4 )code.setLength( 4); + } + return code.toString(); + } // end static method metaPhone() + + public String encode(String pString) { + return( metaphone( pString ) ); + } + + /** + * Are the metaphones of two strings the same. 
+ */ + public boolean isMetaphoneEqual(String str1, String str2) { + return metaphone(str1).equals(metaphone(str2)); + } + + /** + * Returns the maxCodeLen. + * @return int + */ + public int getMaxCodeLen() { + return maxCodeLen; + } + + /** + * Sets the maxCodeLen. + * @param maxCodeLen The maxCodeLen to set + */ + public void setMaxCodeLen(int maxCodeLen) { + this.maxCodeLen = maxCodeLen; + } + +} + 1.2 +143 -115 jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/Soundex.java Index: Soundex.java =================================================================== RCS file: /home/cvs/jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/Soundex.java,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- Soundex.java 22 Feb 2002 05:20:00 -0000 1.1 +++ Soundex.java 18 Nov 2002 08:46:02 -0000 1.2 @@ -1,115 +1,143 @@ -package org.apache.commons.codec; - -/* ==================================================================== - * The Apache Software License, Version 1.1 - * - * Copyright (c) 2001 The Apache Software Foundation. All rights - * reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. The end-user documentation included with the redistribution, - * if any, must include the following acknowledgment: - * "This product includes software developed by the - * Apache Software Foundation (http://www.apache.org/)." 
- * Alternately, this acknowledgment may appear in the software itself, - * if and wherever such third-party acknowledgments normally appear. - * - * 4. The names "Apache" and "Apache Software Foundation" and - * "Apache Commons" must not be used to endorse or promote products - * derived from this software without prior written permission. For - * written permission, please contact [EMAIL PROTECTED] - * - * 5. Products derived from this software may not be called "Apache", - * "Apache Turbine", nor may "Apache" appear in their name, without - * prior written permission of the Apache Software Foundation. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF - * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * ==================================================================== - * - * This software consists of voluntary contributions made by many - * individuals on behalf of the Apache Software Foundation. For more - * information on the Apache Software Foundation, please see - * <http://www.apache.org/>. - */ - -/** - * Find the Soundex of a string. Needs internationalisation in a - * future release. 
- * - * @author [EMAIL PROTECTED] - * @version $Id$ - */ -public class Soundex { - - static public final char[] US_ENGLISH_MAPPING = - "01230120022455012623010202".toCharArray(); - - static public final Soundex US_ENGLISH = new Soundex(); - - private char[] soundexMapping; - - public Soundex() { - this(US_ENGLISH_MAPPING); - } - - public Soundex(char[] mapping) { - this.soundexMapping = mapping; - } - - /** - * Get the SoundEx value of a string. - * This implementation is taken from the code-snippers on - * http://www.sourceforge.net/ - */ - public String soundex(String str) { - char out[] = { '0', '0', '0', '0' }; - char last, mapped; - int incount = 1, count = 1; - out[0] = Character.toUpperCase( str.charAt(0) ); - last = getMappingCode( str.charAt(0) ); - while( (incount < str.length() ) && - (mapped = getMappingCode(str.charAt(incount++))) != 0 && - (count < 4) ) - { - if( (mapped != '0') && (mapped != last) ) { - out[count++] = mapped; - } - last = mapped; - } - return new String(out); - } - - /** - * Used internally by the SoundEx algorithm. - */ - private char getMappingCode(char c) { - if( !Character.isLetter(c) ) { - return 0; - } else { - return soundexMapping[Character.toUpperCase(c) - 'A']; - } - } - -} +package org.apache.commons.codec; + +/* ==================================================================== + * The Apache Software License, Version 1.1 + * + * Copyright (c) 2001 The Apache Software Foundation. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * + * 3. The end-user documentation included with the redistribution, + * if any, must include the following acknowledgment: + * "This product includes software developed by the + * Apache Software Foundation (http://www.apache.org/)." + * Alternately, this acknowledgment may appear in the software itself, + * if and wherever such third-party acknowledgments normally appear. + * + * 4. The names "Apache" and "Apache Software Foundation" and + * "Apache Commons" must not be used to endorse or promote products + * derived from this software without prior written permission. For + * written permission, please contact [EMAIL PROTECTED] + * + * 5. Products derived from this software may not be called "Apache", + * "Apache Turbine", nor may "Apache" appear in their name, without + * prior written permission of the Apache Software Foundation. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * <http://www.apache.org/>. 
+ */ + +/** + * Encodes a string into a refined soundex value. + * A refined soundex code is optimized for spell checking word. + * "Soundex" method originally developed by Margaret Odell and + * Robert Russell + * + * http://www.bluepoof.com/Soundex/info2.html + * + * @todo Needs internationalisation in a future release. + * + * @author [EMAIL PROTECTED] + * @author [EMAIL PROTECTED] + * @version $Id$ + */ +public class Soundex implements Encoder { + + static public final char[] US_ENGLISH_MAPPING = + "01230120022455012623010202".toCharArray(); + + static public final Soundex US_ENGLISH = new Soundex(); + + private char[] soundexMapping; + private int maxLength = 4; + + public Soundex() { + this(US_ENGLISH_MAPPING); + } + + public Soundex(char[] mapping) { + this.soundexMapping = mapping; + } + + /** + * Get the SoundEx value of a string. + * This implementation is taken from the code-snippers on + * http://www.sourceforge.net/ + */ + public String soundex(String str) { + char out[] = { '0', '0', '0', '0' }; + char last, mapped; + int incount = 1, count = 1; + out[0] = Character.toUpperCase( str.charAt(0) ); + last = getMappingCode( str.charAt(0) ); + while( (incount < str.length() ) && + (mapped = getMappingCode(str.charAt(incount++))) != 0 && + (count < maxLength) ) + { + if( (mapped != '0') && (mapped != last) ) { + out[count++] = mapped; + } + last = mapped; + } + return new String(out); + } + + public String encode(String pString) { + return( soundex( pString ) ); + } + + /** + * Used internally by the SoundEx algorithm. + */ + private char getMappingCode(char c) { + if( !Character.isLetter(c) ) { + return 0; + } else { + return soundexMapping[Character.toUpperCase(c) - 'A']; + } + } + + /** + * Returns the maxLength. Standard Soundex + * @return int + */ + public int getMaxLength() { + return maxLength; + } + + /** + * Sets the maxLength. 
+ * @param maxLength The maxLength to set + */ + public void setMaxLength(int maxLength) { + this.maxLength = maxLength; + } + +} 1.1 jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/Encoder.java Index: Encoder.java =================================================================== package org.apache.commons.codec; /* ==================================================================== * The Apache Software License, Version 1.1 * * Copyright (c) 2001 The Apache Software Foundation. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * 3. The end-user documentation included with the redistribution, * if any, must include the following acknowledgment: * "This product includes software developed by the * Apache Software Foundation (http://www.apache.org/)." * Alternately, this acknowledgment may appear in the software itself, * if and wherever such third-party acknowledgments normally appear. * * 4. The names "Apache" and "Apache Software Foundation" and * "Apache Commons" must not be used to endorse or promote products * derived from this software without prior written permission. For * written permission, please contact [EMAIL PROTECTED] * * 5. Products derived from this software may not be called "Apache", * "Apache Turbine", nor may "Apache" appear in their name, without * prior written permission of the Apache Software Foundation. 
* * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * ==================================================================== * * This software consists of voluntary contributions made by many * individuals on behalf of the Apache Software Foundation. For more * information on the Apache Software Foundation, please see * <http://www.apache.org/>. */ /** * Encoder is an interface, which is implemented by Soundex, * Metaphone, Soundex2, etc. * * @author [EMAIL PROTECTED] * @version $Id: Encoder.java,v 1.1 2002/11/18 08:46:02 bayard Exp $ */ public interface Encoder { public String encode(String str); } 1.1 jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/EncoderComparator.java Index: EncoderComparator.java =================================================================== package org.apache.commons.codec; /* ==================================================================== * The Apache Software License, Version 1.1 * * Copyright (c) 2001 The Apache Software Foundation. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * 3. The end-user documentation included with the redistribution, * if any, must include the following acknowledgment: * "This product includes software developed by the * Apache Software Foundation (http://www.apache.org/)." * Alternately, this acknowledgment may appear in the software itself, * if and wherever such third-party acknowledgments normally appear. * * 4. The names "Apache" and "Apache Software Foundation" and * "Apache Commons" must not be used to endorse or promote products * derived from this software without prior written permission. For * written permission, please contact [EMAIL PROTECTED] * * 5. Products derived from this software may not be called "Apache", * "Apache Turbine", nor may "Apache" appear in their name, without * prior written permission of the Apache Software Foundation. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* ==================================================================== * * This software consists of voluntary contributions made by many * individuals on behalf of the Apache Software Foundation. For more * information on the Apache Software Foundation, please see * <http://www.apache.org/>. */ import java.util.Comparator; /** * Compare using an Encoder. * * @author [EMAIL PROTECTED] * @version $Id: EncoderComparator.java,v 1.1 2002/11/18 08:46:02 bayard Exp $ */ public class EncoderComparator implements Comparator { private Encoder encoder; /** * Use the default soundex algorithm, US_ENGLISH. */ public EncoderComparator() { this(RefinedSoundex.US_ENGLISH); } /** * Use the provided soundex algorithm. */ public EncoderComparator(Encoder en) { this.encoder = en; } public int compare(Object o1, Object o2) { String s1 = encoder.encode(o1.toString()); String s2 = encoder.encode(o2.toString()); return s1.compareTo(s2); } } 1.1 jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/RefinedSoundex.java Index: RefinedSoundex.java =================================================================== package org.apache.commons.codec; /* ==================================================================== * The Apache Software License, Version 1.1 * * Copyright (c) 2001 The Apache Software Foundation. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * 3. 
The end-user documentation included with the redistribution, * if any, must include the following acknowledgment: * "This product includes software developed by the * Apache Software Foundation (http://www.apache.org/)." * Alternately, this acknowledgment may appear in the software itself, * if and wherever such third-party acknowledgments normally appear. * * 4. The names "Apache" and "Apache Software Foundation" and * "Apache Commons" must not be used to endorse or promote products * derived from this software without prior written permission. For * written permission, please contact [EMAIL PROTECTED] * * 5. Products derived from this software may not be called "Apache", * "Apache Turbine", nor may "Apache" appear in their name, without * prior written permission of the Apache Software Foundation. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * ==================================================================== * * This software consists of voluntary contributions made by many * individuals on behalf of the Apache Software Foundation. For more * information on the Apache Software Foundation, please see * <http://www.apache.org/>. */ /** * Encodes a string into a soundex value. 
Soundex is an encoding used to
 * relate similar names, but can also be used as a general purpose
 * scheme to find words with similar phonemes.
 * More information may be found at: http://www.bluepoof.com/Soundex/info2.html
 *
 * @todo Needs internationalisation in a future release.
 *
 * @author [EMAIL PROTECTED]
 * @version $Id: RefinedSoundex.java,v 1.1 2002/11/18 08:46:02 bayard Exp $
 */
public class RefinedSoundex implements Encoder {

    /**
     * Code character for each of the letters A through Z, indexed by the
     * letter's offset from 'A'.
     * NOTE(review): this array is public and mutable, and the default
     * instance holds a reference to it rather than a copy, so modifying
     * it changes the behaviour of {@link #US_ENGLISH}.
     */
    static public final char[] US_ENGLISH_MAPPING =
        "01360240043788015936020505".toCharArray();

    /** Shared instance that uses {@link #US_ENGLISH_MAPPING}. */
    static public final RefinedSoundex US_ENGLISH = new RefinedSoundex();

    /** Mapping table consulted by {@link #getMappingCode(char)}. */
    private char[] soundexMapping;

    /**
     * Creates an encoder that uses {@link #US_ENGLISH_MAPPING}.
     */
    public RefinedSoundex() {
        this(US_ENGLISH_MAPPING);
    }

    /**
     * Creates an encoder that uses the given mapping table.
     *
     * @param mapping code character for each letter, indexed by the
     *                letter's offset from 'A'; the array is stored as-is,
     *                not copied
     */
    public RefinedSoundex(char[] mapping) {
        this.soundexMapping = mapping;
    }

    /**
     * Get the SoundEx value of a string.
     * This implementation is taken from the code-snippets on
     * http://www.sourceforge.net/
     *
     * The result is the first character of the upper-cased input followed
     * by the mapping code of every letter (the first one included), with
     * consecutive identical codes collapsed into one. Characters that have
     * no mapping contribute nothing to the output but do end a run of
     * identical codes.
     *
     * @param str the string to encode; may be <code>null</code> or empty
     * @return the encoded value; <code>null</code> if <code>str</code> is
     *         <code>null</code>, and the empty string if it is empty
     */
    public String soundex(String str) {
        if (str == null) {
            return null;
        }
        if (str.length() == 0) {
            // Nothing to encode; charAt(0) below would throw.
            return str;
        }

        // Fixed locale so the result does not depend on the platform default
        // (e.g. Turkish would upper-case 'i' to a letter outside A-Z).
        str = str.toUpperCase(java.util.Locale.ENGLISH);

        StringBuffer sBuf = new StringBuffer();
        sBuf.append(str.charAt(0));

        // '*' is never produced by getMappingCode for the default table,
        // so the first code is always emitted.
        char last = '*';

        for (int i = 0; i < str.length(); i++) {
            char current = getMappingCode(str.charAt(i));
            if (current == last) {
                continue;
            } else if (current != 0) {
                sBuf.append(current);
            }
            last = current;
        }

        return sBuf.toString();
    }

    /**
     * Encodes a string by delegating to {@link #soundex(String)}.
     *
     * @param pString the string to encode
     * @return the value returned by <code>soundex(pString)</code>
     */
    public String encode(String pString) {
        return soundex(pString);
    }

    /**
     * Used internally by the SoundEx algorithm.
     *
     * @param c the character to look up
     * @return the mapping code for <code>c</code>, or the NUL character
     *         (0) when <code>c</code> is not a letter or falls outside
     *         the mapping table
     */
    private char getMappingCode(char c) {
        if (!Character.isLetter(c)) {
            return 0;
        }
        int index = Character.toUpperCase(c) - 'A';
        // Letters outside A-Z (accented, non-Latin, ...) have no entry in
        // the table; treat them as unmapped instead of indexing past it.
        if (index < 0 || index >= soundexMapping.length) {
            return 0;
        }
        return soundexMapping[index];
    }

}
-- To unsubscribe, e-mail: <mailto:[EMAIL PROTECTED]> For additional commands, e-mail: <mailto:[EMAIL PROTECTED]>