tobrien 2003/02/03 20:43:59 Modified: codec/src/java/org/apache/commons/codec/language DoubleMetaphone.java Nysiis.java Added: codec/src/java/org/apache/commons/codec/language Metaphone.java RefinedSoundex.java Soundex.java Log: Moved language codecs to language package Revision Changes Path 1.3 +2 -2 jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/language/DoubleMetaphone.java Index: DoubleMetaphone.java =================================================================== RCS file: /home/cvs/jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/language/DoubleMetaphone.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- DoubleMetaphone.java 3 Feb 2003 16:03:54 -0000 1.2 +++ DoubleMetaphone.java 4 Feb 2003 04:43:58 -0000 1.3 @@ -62,7 +62,7 @@ * <b>PLEASE NOTE:</b> This implementation is not thread-safe. Please * see TODO list for [codec] - Tim O'Brien * - * @see http://www.cse.ucsc.edu/~eparrish/toolbox/search.html + * @see <a href="http://www.cse.ucsc.edu/~eparrish/toolbox/search.html">Ed Parrish, Developing a Search Engine</a> * * @version $Revision$ * @author Ed Parish 1.3 +4 -8 jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/language/Nysiis.java Index: Nysiis.java =================================================================== RCS file: /home/cvs/jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/language/Nysiis.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- Nysiis.java 3 Feb 2003 16:02:16 -0000 1.2 +++ Nysiis.java 4 Feb 2003 04:43:58 -0000 1.3 @@ -61,13 +61,9 @@ * the code from the Perl module available from CPAN, which derives from an * implementation by Ben Kennedy. * - * @see http://www.nist.gov/dads/HTML/nysiis.html - * @see http://search.cpan.org/search?query=nysiis&mode=all - * - * @see Atack, J., and F. Bateman. 1992 . 
- * <i>"Matchmaker, matchmaker, make me a match"</i> : a general - * computer-based matching program for historical researc. - * Historical Methods 25: 53-65. + * @see <a href="http://www.nist.gov/dads/HTML/nysiis.html">NIST Nysiis definition</a> + * @see <a href="http://search.cpan.org/search?query=nysiis&mode=all">Nysiis implementation at CPAN</a> + * @see <a href="http://www.ssha.org/mainsite/news/methodsad.html">Atack, J., and F. Bateman. 1992 . <i>"Matchmaker, matchmaker, make me a match"</i> : a general computer-based matching program for historical researc. Historical Methods 25: 53-65.</a> * * @version $Revision$ * @author <a href="[EMAIL PROTECTED]">Kyle R. Burton</a> 1.1 jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/language/Metaphone.java Index: Metaphone.java =================================================================== /* ==================================================================== * The Apache Software License, Version 1.1 * * Copyright (c) 2001-2002 The Apache Software Foundation. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * 3. The end-user documentation included with the redistribution, * if any, must include the following acknowledgment: * "This product includes software developed by the * Apache Software Foundation (http://www.apache.org/)." * Alternately, this acknowledgment may appear in the software itself, * if and wherever such third-party acknowledgments normally appear. * * 4. 
The names "Apache" and "Apache Software Foundation" and * "Apache Commons" must not be used to endorse or promote products * derived from this software without prior written permission. For * written permission, please contact [EMAIL PROTECTED] * * 5. Products derived from this software may not be called "Apache", * "Apache Commons", nor may "Apache" appear in their name, without * prior written permission of the Apache Software Foundation. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * ==================================================================== * * This software consists of voluntary contributions made by many * individuals on behalf of the Apache Software Foundation. For more * information on the Apache Software Foundation, please see * <http://www.apache.org/>. */ package org.apache.commons.codec.language; import org.apache.commons.codec.Encoder; /** * A class to generate phonetic code. * The initial Java implementation, William B. Brogden. December, 1997 * Permission given by wbrogden for code to be used anywhere. * * "Hanging on the Metaphone" by Lawrence Philips * <i>Computer Language</i> of Dec. 
1990, p 39 * * @version $Revision: 1.1 $ $Date: 2003/02/04 04:43:58 $ * @author [EMAIL PROTECTED] * @author [EMAIL PROTECTED] * @author [EMAIL PROTECTED] */ public class Metaphone implements Encoder { private String vowels = "AEIOU" ; private String frontv = "EIY" ; private String varson = "CSPTG" ; private int maxCodeLen = 4 ; public Metaphone() { super(); } /** * Find the metaphone value of a String. This is similar to the * soundex algorithm, but better at finding similar sounding words. * All input is converted to upper case. * Limitations: Input format is expected to be a single ASCII word * with only characters in the A - Z range, no punctuation or numbers. */ public String metaphone( String txt ){ int mtsz = 0 ; boolean hard = false ; if(( txt == null ) || ( txt.length() == 0 )) return "" ; // single character is itself if( txt.length() == 1 ) return txt.toUpperCase() ; char[] inwd = txt.toUpperCase().toCharArray() ; String tmpS ; StringBuffer local = new StringBuffer( 40 ); // manipulate StringBuffer code = new StringBuffer( 10 ) ; // output // handle initial 2 characters exceptions switch( inwd[0] ){ case 'K': case 'G' : case 'P' : /* looking for KN, etc*/ if( inwd[1] == 'N')local.append(inwd, 1, inwd.length - 1 ); else local.append( inwd ); break; case 'A': /* looking for AE */ if( inwd[1] == 'E' )local.append(inwd, 1, inwd.length - 1 ); else local.append( inwd ); break; case 'W' : /* looking for WR or WH */ if( inwd[1] == 'R' ){ // WR -> R local.append(inwd, 1, inwd.length - 1 ); break ; } if( inwd[1] == 'H'){ local.append(inwd, 1, inwd.length - 1 ); local.setCharAt( 0,'W'); // WH -> W } else local.append( inwd ); break; case 'X' : /* initial X becomes S */ inwd[0] = 'S' ;local.append( inwd ); break ; default : local.append( inwd ); } // now local has working string with initials fixed int wdsz = local.length(); int n = 0 ; while((mtsz < maxCodeLen ) && // max code size of 4 works well (n < wdsz ) ){ char symb = local.charAt(n) ; // remove duplicate 
letters except C if(( symb != 'C' ) && (n > 0 ) && ( local.charAt(n - 1 ) == symb )) n++ ; else{ // not dup switch( symb ){ case 'A' : case 'E' : case 'I' : case 'O' : case 'U' : if( n == 0 ) { code.append(symb );mtsz++; } break ; // only use vowel if leading char case 'B' : if( (n > 0 ) && !(n + 1 == wdsz ) && // not MB at end of word ( local.charAt(n - 1) == 'M')) { code.append(symb); } else code.append(symb); mtsz++ ; break ; case 'C' : // lots of C special cases /* discard if SCI, SCE or SCY */ if( ( n > 0 ) && ( local.charAt(n-1) == 'S' ) && ( n + 1 < wdsz ) && ( frontv.indexOf( local.charAt(n + 1)) >= 0 )){ break ;} tmpS = local.toString(); if( tmpS.indexOf("CIA", n ) == n ) { // "CIA" -> X code.append('X' ); mtsz++; break ; } if( ( n + 1 < wdsz ) && (frontv.indexOf( local.charAt(n+1) )>= 0 )){ code.append('S');mtsz++; break ; // CI,CE,CY -> S } if(( n > 0) && ( tmpS.indexOf("SCH",n-1 )== n-1 )){ // SCH->sk code.append('K') ; mtsz++;break ; } if( tmpS.indexOf("CH", n ) == n ){ // detect CH if((n == 0 ) && (wdsz >= 3 ) && // CH consonant -> K consonant (vowels.indexOf( local.charAt( 2) ) < 0 )){ code.append('K'); } else { code.append('X'); // CHvowel -> X } mtsz++; } else { code.append('K' );mtsz++; } break ; case 'D' : if(( n + 2 < wdsz )&& // DGE DGI DGY -> J ( local.charAt(n+1) == 'G' )&& (frontv.indexOf( local.charAt(n+2) )>= 0)){ code.append('J' ); n += 2 ; } else { code.append( 'T' ); } mtsz++; break ; case 'G' : // GH silent at end or before consonant if(( n + 2 == wdsz )&& (local.charAt(n+1) == 'H' )) break ; if(( n + 2 < wdsz ) && (local.charAt(n+1) == 'H' )&& (vowels.indexOf( local.charAt(n+2)) < 0 )) break ; tmpS = local.toString(); if((n > 0) && ( tmpS.indexOf("GN", n ) == n)|| ( tmpS.indexOf("GNED",n) == n )) break ; // silent G if(( n > 0 ) && (local.charAt(n-1) == 'G')) hard = true ; else hard = false ; if((n+1 < wdsz) && (frontv.indexOf( local.charAt(n+1) ) >= 0 )&& (!hard) ) code.append( 'J' ); else code.append('K'); mtsz++; break ; case 'H': 
if( n + 1 == wdsz ) break ; // terminal H if((n > 0) && (varson.indexOf( local.charAt(n-1)) >= 0)) break ; if( vowels.indexOf( local.charAt(n+1)) >=0 ){ code.append('H') ; mtsz++;// Hvowel } break; case 'F': case 'J' : case 'L' : case 'M': case 'N' : case 'R' : code.append( symb ); mtsz++; break ; case 'K' : if( n > 0 ){ // not initial if( local.charAt( n -1) != 'C' ) { code.append(symb ); } } else code.append( symb ); // initial K mtsz++ ; break ; case 'P' : if((n + 1 < wdsz) && // PH -> F (local.charAt( n+1) == 'H'))code.append('F'); else code.append( symb ); mtsz++; break ; case 'Q' : code.append('K' );mtsz++; break ; case 'S' : tmpS = local.toString(); if((tmpS.indexOf("SH", n )== n) || (tmpS.indexOf("SIO",n )== n) || (tmpS.indexOf("SIA",n )== n)) code.append('X'); else code.append( 'S' ); mtsz++ ; break ; case 'T' : tmpS = local.toString(); // TIA TIO -> X if((tmpS.indexOf("TIA",n )== n)|| (tmpS.indexOf("TIO",n )== n) ){ code.append('X'); mtsz++; break; } if( tmpS.indexOf("TCH",n )==n) break; // substitute numeral 0 for TH (resembles theta after all) if( tmpS.indexOf("TH", n )==n) code.append('0'); else code.append( 'T' ); mtsz++ ; break ; case 'V' : code.append('F'); mtsz++;break ; case 'W' : case 'Y' : // silent if not followed by vowel if((n+1 < wdsz) && (vowels.indexOf( local.charAt(n+1))>=0)){ code.append( symb );mtsz++; } break ; case 'X' : code.append('K'); code.append('S');mtsz += 2; break ; case 'Z' : code.append('S'); mtsz++; break ; } // end switch n++ ; } // end else from symb != 'C' if( mtsz > 4 )code.setLength( 4); } return code.toString(); } // end static method metaPhone() public String encode(String pString) { return( metaphone( pString ) ); } /** * Are the metaphones of two strings the same. */ public boolean isMetaphoneEqual(String str1, String str2) { return metaphone(str1).equals(metaphone(str2)); } /** * Returns the maxCodeLen. * @return int */ public int getMaxCodeLen() { return maxCodeLen; } /** * Sets the maxCodeLen. 
* @param maxCodeLen The maxCodeLen to set */ public void setMaxCodeLen(int maxCodeLen) { this.maxCodeLen = maxCodeLen; } } 1.1 jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/language/RefinedSoundex.java Index: RefinedSoundex.java =================================================================== /* ==================================================================== * The Apache Software License, Version 1.1 * * Copyright (c) 2002 The Apache Software Foundation. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * 3. The end-user documentation included with the redistribution, * if any, must include the following acknowledgment: * "This product includes software developed by the * Apache Software Foundation (http://www.apache.org/)." * Alternately, this acknowledgment may appear in the software itself, * if and wherever such third-party acknowledgments normally appear. * * 4. The names "Apache" and "Apache Software Foundation" and * "Apache Commons" must not be used to endorse or promote products * derived from this software without prior written permission. For * written permission, please contact [EMAIL PROTECTED] * * 5. Products derived from this software may not be called "Apache", * "Apache Commons", nor may "Apache" appear in their name, without * prior written permission of the Apache Software Foundation. 
* * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * ==================================================================== * * This software consists of voluntary contributions made by many * individuals on behalf of the Apache Software Foundation. For more * information on the Apache Software Foundation, please see * <http://www.apache.org/>. */ package org.apache.commons.codec.language; import org.apache.commons.codec.Encoder; /** * Encodes a string into a soundex value. Sounde is an encoding used to * relate similar names, but can also be used as a general purpose * scheme to find word with similar phonemes. * More information may be found at: http://www.bluepoof.com/Soundex/info2.html * * @author [EMAIL PROTECTED] * @version $Revision: 1.1 $ $Date: 2003/02/04 04:43:59 $ */ public class RefinedSoundex implements Encoder { static public final char[] US_ENGLISH_MAPPING = "01360240043788015936020505".toCharArray(); static public final RefinedSoundex US_ENGLISH = new RefinedSoundex(); private char[] soundexMapping; public RefinedSoundex() { this(US_ENGLISH_MAPPING); } public RefinedSoundex(char[] mapping) { this.soundexMapping = mapping; } /** * Get the SoundEx value of a string. 
* This implementation is taken from the code-snippers on * http://www.sourceforge.net/ */ public String soundex(String str) { if(null == str || str.length() == 0) { return str; } StringBuffer sBuf = new StringBuffer(); str = str.toUpperCase(); sBuf.append( str.charAt(0) ); char last, mapped, current; last = '*'; for( int i = 0; i < str.length(); i++ ) { current = getMappingCode( str.charAt(i) ); if( current == last ) { continue; } else if( current != 0 ) { sBuf.append( current ); } last = current; } return sBuf.toString(); } public String encode(String pString) { return( soundex( pString ) ); } /** * Used internally by the SoundEx algorithm. */ private char getMappingCode(char c) { if( !Character.isLetter(c) ) { return 0; } else { return soundexMapping[Character.toUpperCase(c) - 'A']; } } } 1.1 jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/language/Soundex.java Index: Soundex.java =================================================================== /* ==================================================================== * The Apache Software License, Version 1.1 * * Copyright (c) 2001-2002 The Apache Software Foundation. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * 3. The end-user documentation included with the redistribution, * if any, must include the following acknowledgment: * "This product includes software developed by the * Apache Software Foundation (http://www.apache.org/)." 
* Alternately, this acknowledgment may appear in the software itself, * if and wherever such third-party acknowledgments normally appear. * * 4. The names "Apache" and "Apache Software Foundation" and * "Apache Commons" must not be used to endorse or promote products * derived from this software without prior written permission. For * written permission, please contact [EMAIL PROTECTED] * * 5. Products derived from this software may not be called "Apache", * "Apache Commons", nor may "Apache" appear in their name, without * prior written permission of the Apache Software Foundation. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * ==================================================================== * * This software consists of voluntary contributions made by many * individuals on behalf of the Apache Software Foundation. For more * information on the Apache Software Foundation, please see * <http://www.apache.org/>. */ package org.apache.commons.codec.language; import org.apache.commons.codec.Encoder; /** * Encodes a string into a Soundex value. Soundex is an encoding used to * relate similar names, but can also be used as a general purpose * scheme to find words with similar phonemes.
* "Soundex" method originally developed by Margaret Odell and
 * Robert Russell
 *
 * http://www.bluepoof.com/Soundex/info2.html
 *
 * @author [EMAIL PROTECTED]
 * @author [EMAIL PROTECTED]
 * @version $Revision: 1.1 $ $Date: 2003/02/04 04:43:59 $
 */
public class Soundex implements Encoder {

    /**
     * Soundex digits for the letters A - Z of US English, indexed by
     * <code>letter - 'A'</code>; '0' marks letters that are not coded.
     */
    static public final char[] US_ENGLISH_MAPPING =
        "01230120022455012623010202".toCharArray();

    /** Shared encoder that uses {@link #US_ENGLISH_MAPPING}. */
    static public final Soundex US_ENGLISH = new Soundex();

    /** Smallest width of a code: one letter plus three digits. */
    private static final int MIN_CODE_WIDTH = 4;

    /** Digit table in use, indexed by <code>letter - 'A'</code>. */
    private char[] soundexMapping;

    /** Maximum number of significant characters (first letter plus digits) in a code. */
    private int maxLength = 4;

    /**
     * Creates an encoder that uses {@link #US_ENGLISH_MAPPING}.
     */
    public Soundex() {
        this(US_ENGLISH_MAPPING);
    }

    /**
     * Creates an encoder with a custom digit table.
     *
     * @param mapping digit for each letter, indexed by
     *                <code>letter - 'A'</code>; the array is used as is,
     *                not copied
     */
    public Soundex(char[] mapping) {
        this.soundexMapping = mapping;
    }

    /**
     * Get the SoundEx value of a string.
     * The code is the upper-cased first character followed by the digits
     * of the letters after it; letters mapped to '0' and letters with the
     * same digit as their predecessor are left out. Encoding stops at the
     * first character that has no entry in the mapping (a non-letter or a
     * letter outside A - Z) or once {@link #getMaxLength()} characters
     * have been produced. The result is padded with '0' to
     * {@link #getMaxLength()} characters and is never shorter than four
     * characters.
     * This implementation is taken from the code snippets on
     * http://www.sourceforge.net/
     *
     * @param str the string to encode
     * @return the soundex code; <code>str</code> itself when it is
     *         <code>null</code> or empty
     */
    public String soundex(String str) {
        if (null == str || str.length() == 0) {
            return str;
        }

        // Size the buffer from maxLength instead of a fixed four characters,
        // so a larger maxLength no longer overruns it. Four stays the
        // minimum width, which keeps the output for maxLength <= 4 unchanged.
        char[] out = new char[Math.max(maxLength, MIN_CODE_WIDTH)];
        java.util.Arrays.fill(out, '0');

        out[0] = Character.toUpperCase(str.charAt(0));
        char last = getMappingCode(str.charAt(0));

        int incount = 1;
        int count = 1;
        while (incount < str.length() && count < maxLength) {
            char mapped = getMappingCode(str.charAt(incount++));
            if (mapped == 0) {
                break; // unmapped character ends the encoding
            }
            if (mapped != '0' && mapped != last) {
                out[count++] = mapped;
            }
            last = mapped;
        }

        return new String(out);
    }

    /**
     * Encodes a String using the soundex algorithm.
     *
     * @param pString the string to encode
     * @return the same value as {@link #soundex(String)}
     */
    public String encode(String pString) {
        return soundex(pString);
    }

    /**
     * Used internally by the SoundEx algorithm.
     *
     * @param c the character to look up
     * @return the mapping digit of <code>c</code>, or 0 when <code>c</code>
     *         is not a letter or has no entry in the mapping table
     */
    private char getMappingCode(char c) {
        if (!Character.isLetter(c)) {
            return 0;
        }
        // Character.isLetter also accepts letters outside A - Z; those
        // would index past the table, so treat them as unmapped.
        int index = Character.toUpperCase(c) - 'A';
        if (index < 0 || index >= soundexMapping.length) {
            return 0;
        }
        return soundexMapping[index];
    }

    /**
     * Returns the maxLength. Standard Soundex
     * @return int
     */
    public int getMaxLength() {
        return maxLength;
    }

    /**
     * Sets the maxLength.
     * @param maxLength The maxLength to set
     */
    public void setMaxLength(int maxLength) {
        this.maxLength = maxLength;
    }
}
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]