tobrien 2003/02/03 07:00:12 Modified: codec/src/test/org/apache/commons/codec TestAll.java Added: codec TODO codec/src/java/org/apache/commons/codec/language DoubleMetaphone.java codec/src/test/org/apache/commons/codec/language TestDoubleMetaphone.java Log: Added DoubleMetaphone and associated JUnit test Revision Changes Path 1.1 jakarta-commons-sandbox/codec/TODO Index: TODO =================================================================== This is a list of action items to be finished in the [codec] project. This TODO list reflects the current direction of development, and should be updated by all committers when a known issue or task is identified. This TODO list should be periodically sync'd with the content on http://nagoya.apache.org/wiki/apachewiki.cgi?CodecProjectPages - this WIKI page is provided as a tool for volunteers to comment on the current TODO list and to suggest tasks. When a task in the TODO list is done, move the entry to the DONE list below, and note who made the change and when. ** TODO List * Add a Hex implementation * Add a Rot13 implementation * Move phonetic encoders into dedicated package. * Add a Decoder interface * Refactor Base64 to implement both Encoder and Decoder * Documentation! Create Forrest documentation for Codec AFTER documentation has evolved in Wiki * Integrate Patches: ** Patch submitted by Iulian Musat for Base64 ** Add DoubleMetaphone and Nysiis implementation from KyleBurton * DoubleMetaphone ** Modify DoubleMetaphone implementation - make it thread safe(r). ** Figure out why algorithm fails to properly code "bryce" and "maurice". 
** DONE 2/3/03 - TOB - Integrated DoubleMetaphone and Test from Kyle Burton 2/2/03 - TOB - "language" package created to hold language and phonetic encodings 2/2/03 - TOB - All CRLF issues resolved in codec 1/31/03 - TOB - Patch submitted fixing CRLF problems in Soundex.java 1/31/03 - TOB - Patch submitted fixing CRLF problems in RefinedSoundex.java 1.1 jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/language/DoubleMetaphone.java Index: DoubleMetaphone.java =================================================================== /* ==================================================================== * The Apache Software License, Version 1.1 * * Copyright (c) 2001-2002 The Apache Software Foundation. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * 3. The end-user documentation included with the redistribution, * if any, must include the following acknowledgment: * "This product includes software developed by the * Apache Software Foundation (http://www.apache.org/)." * Alternately, this acknowledgment may appear in the software itself, * if and wherever such third-party acknowledgments normally appear. * * 4. The names "Apache" and "Apache Software Foundation" and * "Apache Commons" must not be used to endorse or promote products * derived from this software without prior written permission. For * written permission, please contact [EMAIL PROTECTED] * * 5. 
Products derived from this software may not be called "Apache", * "Apache Turbine", nor may "Apache" appear in their name, without * prior written permission of the Apache Software Foundation. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * ==================================================================== * * This software consists of voluntary contributions made by many * individuals on behalf of the Apache Software Foundation. For more * information on the Apache Software Foundation, please see * <http://www.apache.org/>. */ package org.apache.commons.codec.language; import org.apache.commons.codec.Encoder; /** * A class to generate phonetic codings based on the double metaphone * algorithm. This module is based on example code by Ed Parrish. * * <b>PLEASE NOTE:</b> This implementation is not thread-safe. Please * see TODO list for [codec] - Tim O'Brien * * @see http://www.cse.ucsc.edu/~eparrish/toolbox/search.html * * @version $Revision: 1.1 $ * @author Ed Parish * @author <a href="[EMAIL PROTECTED]">Kyle R. Burton</a> */ public class DoubleMetaphone implements Encoder { /** The current character position in the string being encoded. */ private int current; /** The maximum size of the phonetic encoding to compute. 
*/ private int encodeLimit = 4; /** Buffer for the primary encoding */ private StringBuffer primary = new StringBuffer(); /** Buffer for the alternate encoding */ private StringBuffer alternate = new StringBuffer(); /** Holder for the input being parsed. */ private String input = null; /** * These structures are used to make the code easier to understand, modify, * debug, and otherwise maintain. */ private final static char[] vowels = {'A', 'E', 'I', 'O', 'U', 'Y'}; private final static char[] AEOU = {'A', 'E', 'O', 'U'}; private final static char[] AO = "AO".toCharArray(); private final static char[] BDH = {'B', 'D', 'H'}; private final static char[] BFHLMNRVW_ = "BFHLMNRVW ".toCharArray(); private final static char[] BH = {'B', 'H'}; private final static char[] BKLMNSTZ = "LTKSNMBZ".toCharArray(); private final static char[] BP = "BP".toCharArray(); private final static char[] CGQ = {'C', 'G', 'Q'}; private final static char[] CGLRT = {'C', 'G', 'L', 'R', 'T'}; private final static char[] CKQ = {'C', 'K', 'Q'}; private final static char[] CX = "CX".toCharArray(); private final static char[] DT = "DT".toCharArray(); private final static char[] EI = {'E', 'I'}; private final static char[] EIY = {'E', 'I', 'Y'}; private final static char[] EHI = {'I', 'E', 'H'}; private final static char[] KLS = "KLS".toCharArray(); private final static char[] LMNW = "LMNW".toCharArray(); private final static char[] ST = {'S', 'T'}; private final static char[] SZ = "SZ".toCharArray(); private final static String[] AggiOggi = {"AGGI", "OGGI"}; private final static String[] AiOi = {"AI", "OI"}; private final static String[] AlleIllaIllo = {"ILLO", "ILLA", "ALLE"}; private final static String[] AmOm = {"OM", "AM"}; private final static String[] AsOs = {"AS", "OS"}; private final static String[] ArchitOrchesOrchid = {"ARCHIT", "ORCHES", "ORCHID"}; private final static String[] AuOu = {"AU", "OU"}; private final static String[] BacherMacher = {"BACHER", "MACHER"}; private final static 
String[] CeCiCy = {"CI", "CE", "CY"}; private final static String[] CeCi = {"CE", "CI"}; private final static String[] CiaCieCio = {"CIO", "CIE", "CIA"}; private final static String[] CkCgCq = {"CK", "CG", "CQ"}; private final static String[] DangerMangerRanger = {"DANGER", "RANGER", "MANGER"}; private final static String[] DdDt = {"DD", "DT"}; private final static String[] EauIau = {"IAU", "EAU"}; private final static String[] EbEiElEpErEsEyIbIlInIe = {"ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER"}; private final static String[] EdEmEnErOoUy = {"OO", "ER", "EN", "UY", "ED", "EM"}; private final static String[] EnEr = {"ER", "EN"}; private final static String[] EwskiEwskyOwskiOwsky = {"EWSKI", "EWSKY", "OWSKI", "OWSKY"}; private final static String[] GnKnPnPsWr = {"GN", "KN", "PN", "WR", "PS"}; private final static String[] HaracHaris = {"HARAC", "HARIS"}; private final static String[] HeimHoekHolmHolz = {"HEIM", "HOEK", "HOLM", "HOLZ"}; private final static String[] HemHiaHorHym = {"HOR", "HYM", "HIA", "HEM"}; private final static String[] IslYsl = {"ISL", "YSL"}; private final static String[] MaMe = {"ME", "MA"}; private final static String[] OgyRgy = {"RGY", "OGY"}; private final static String[] SiaSio = {"SIO", "SIA"}; private final static String[] TiaTch = {"TIA", "TCH"}; private final static String[] UcceeUcces = {"UCCEE", "UCCES"}; private final static String[] Van_Von_ = {"VAN ", "VON "}; private final static String[] WiczWitz = {"WICZ", "WITZ"}; private final static String[] ZaZiZo = {"ZO", "ZI", "ZA"}; /** * Default constructor. */ public DoubleMetaphone() { } /** * Parameterized constructor. */ public DoubleMetaphone( String in ) { setInput( in ); } /** * Accessor for the primary encoding. The primary encoding will not be set * until after encode is invoked with a non-null string. * @return the primary encoding. */ public String getPrimary() { return primary.toString(); } /** * Accessor for the primary encoding as a StringBuffer. 
* @return the string buffer for the primary encoding */ public StringBuffer getPrimaryBuffer() { return primary; } /** * Accessor for the alternate encoding. The alternate encoding will not be * set untill after encode is invoked with a non-null string. * @return the alternate encoding. */ public String getAlternate() { return alternate.toString(); } /** * Accessor for the alternate encoding as a StringBuffer. * @return the string buffer for the alternate encoding */ public StringBuffer getAlternateBuffer() { return alternate; } /** * Accessor for the maximum encoding length for both the primary and * alternate encodings. Once either encoding reaches this limit, the * encoding loop will return. * @return the encoding limit */ public int getEncodeLimit() { return encodeLimit; } /** * Accessor for the maximum encoding length for both the primary and * alternate encodings. Once either encoding reaches this limit, the * encoding loop will return. * @return the encoding limit */ public boolean setEncodeLimit(int newLimit) { if (newLimit < 1) { throw new IllegalArgumentException("Error, limit [" + newLimit + "] must be a positive integer."); } encodeLimit = newLimit; return true; } /** * Accessor for storing the input to be encoded. * @param in the input to be encoded. */ private void setInput(String in) { if (in != null) { input = in.toUpperCase() + " "; } else { input = ""; } } /** * Append a phonetic encoded character to both the primary and alternate * encodings. * @param ch the character to append. */ private void add(char ch) { add(ch, ch); } /** * Append a phonetic encoded character to both the primary and alternate * encodings. * @param primaryChar the character to append to the primary encoding. * @param alternateChar the character to append to the alternate encoding. 
*/ private void add(char primaryChar, char alternateChar) { primary.append(primaryChar); alternate.append(alternateChar); } /** * Is the character in the input string at the given index in the list of * characters? * @param index * @param list * @return true/false */ private boolean charAt(int index, char[] list) { if (index < 0 || index >= input.length()) return false; char value = input.charAt(index); for (int i = 0; i < list.length; i++) { if (value == list[i]) return true; } return false; } /** * Is the string at the given starting index matches the given pattern. * @param start the index where to begin the comparison * @param length the number of characters to compare * @param str the pattern string to be located * @return true/false */ private boolean stringAt(int start, int length, String str) { String[] list = new String[1]; list[0] = str; return stringAt(start, length, list); } /** * Is the string at the given starting index matches any of the given pattern * strings. * @param start the index where to begin the comparison * @param length the number of characters to compare * @param list the strings to search for. * @return true/false */ private boolean stringAt(int start, int length, String[] list) { if (length <= 0) return false; for (int i = 0; i < list.length; i++) { if (input.regionMatches(start, list[i], 0, length)) return true; } return false; } /** * Test the character in the input string at index to see if it is a vowel. * @param index the location of the character to test * @return true/false */ private boolean isVowel(int index) { return charAt(index, vowels); } /** * Test the input string to see if it is likely to be categorizeable * as Slavo-Germanic in nature. This effects some of the encoding * descisions as far as the phonetic pronounciations of portions of * the name. 
* @return true/false */ private boolean isSlavoGermanic() { if((input.indexOf('W') > -1) || (input.indexOf('K') > -1) || (input.indexOf("CZ") > -1) || (input.indexOf("WITZ") > -1)) { return true; } return false; } /** * Append the given coding to both the primary and alternate encodings. * @param ch * @param code */ private void addCode(char ch, char code) { add(code); current++; if(input.charAt(current) == ch) current++; } /** * Static version of encode that first constructs a new DoubleMetaphone * object, and then invokes encode on it. Note that by using this method you * are sacrificing the abilty to access the alternate encoding. Also, since * this method merely creates a new DoubleMetaphone to handle the encoding, * it is effectivly thread-safe. * * This method was originaly created to allow this encoder to be used as a * Java Stored Procedure in Oracle. * * @param in the string to encode * @return the encoded string */ public static String sencode( String in ) { DoubleMetaphone dm = new DoubleMetaphone(); return dm.encode(in); } /** * Encode the given string using the Double Metaphone algorithm. Double * Metaphone produces two encodings, a primary and a secondary. The encode * method returns the primary encoding. To access the secondary encoding, * call getAlternate. * @param in the input string to encode * @return the primary encoding. */ public String encode( String in ) { setInput(in); return encode(); } /** * Encode the already set input string using the Double Metaphone algorithm. * Double Metaphone produces two encodings, a primary and a secondary. The * encode method returns the primary encoding. To access the secondary * encoding, call getAlternate. * @return the primary encoding. 
*/ public String encode() { if (input == null) return ""; primary.delete(0, primary.length()); alternate.delete(0, alternate.length()); int length = input.length(); if (length < 1) return ""; int last = length - 1; //zero based index current = 0; //skip these when at start of word if (stringAt(0, 2, GnKnPnPsWr)) current++; //Initial 'X' is pronounced 'Z' e.g. 'Xavier' if(input.startsWith("X")) { add('S'); //'Z' maps to 'S' current++; } while (primary.length() < encodeLimit || alternate.length() < encodeLimit) { if(current >= length) break; // this is coded as a huge switch statement for performance switch(input.charAt(current)) { case 'A': case 'E': case 'I': case 'O': case 'U': case 'Y': if (current == 0) add('A'); // all init vowels map to 'A' current++; break; case 'B': // "-mb", e.g "dumb", already skipped over... addCode('B', 'P'); break; case 'Ç': add('S'); current++; // Note: no doublecheck break; case 'C': // various germanic if((current > 1) && !isVowel(current - 2) && input.regionMatches(current - 1, "ACH", 0, 3) && (input.charAt(current + 2) != 'I' && input.charAt(current + 2) != 'E' || stringAt(current - 2, 6, BacherMacher) )) { add('K'); current +=2; break; } // special case 'caesar' if (current == 0 && input.regionMatches(current, "CAESAR", 0, 6)) { add('S'); current +=2; break; } //italian 'chianti' if (input.regionMatches(current, "CHIA", 0, 4)) { add('K'); current +=2; break; } if (input.regionMatches(current, "CH", 0, 2)) { //find 'michael' if(current > 0 && input.regionMatches(current, "CHAE", 0, 4)) { add('K', 'X'); current +=2; break; } // greek roots e.g. 
'chemistry', 'chorus' if (current == 0 && (stringAt(current + 1, 5, HaracHaris) || stringAt((current + 1), 3, HemHiaHorHym)) && !input.regionMatches(0, "CHORE", 0, 5)) { add('K'); current +=2; break; } // germanic, greek, or otherwise 'ch' for 'kh' sound if ((stringAt(0, 4, Van_Von_) || input.regionMatches(0, "SCH ", 0, 3)) // 'architect' but not 'arch', 'orchestra', 'orchid' || stringAt(0, 6, ArchitOrchesOrchid) || charAt(current + 2, ST) || ((charAt(current - 1, AEOU) || current == 0) // e.g. 'wachtler', 'wechsler', but not 'tichner' && charAt(current + 2, BFHLMNRVW_))) { add('K'); } else { if (current > 0) { if (input.regionMatches(0, "MC", 0, 2)) { // e.g. "McHugh" add('K'); } else { add('X', 'K'); } } else { add('X'); } } current +=2; break; } // e.g. 'czerny' if (input.regionMatches(current, "CZ", 0, 2) && !input.regionMatches(current - 2, "WICZ", 0, 4)) { add('S', 'X'); current += 2; break; } // e.g. 'focaccia' if (input.regionMatches(current + 1, "CIA", 0, 3)) { add('X'); current += 3; break; } // double 'C', but not if e.g. 'McClellan' if (input.regionMatches(current, "CC", 0, 2) && !((current == 1) && (input.charAt(0) == 'M'))) { // 'bellocchio' but not 'bacchus' if (charAt(current + 2, EHI) && !input.regionMatches(current + 2, "HU", 0, 2)) { // 'accident', 'accede' 'succeed' if(((current == 1) && (input.charAt(current - 1) == 'A')) || stringAt(current - 1, 5, UcceeUcces)) { add('K'); add('S'); } else { // 'bacci', 'bertucci', other italian add('X'); } current += 3; break; } else { // Pierce's rule add('K'); current += 2; break; } } if (stringAt(0, 2, CkCgCq)) { add('K'); current += 2; break; } if (stringAt(0, 2, CeCiCy)) { // italian vs. 
english if (stringAt(0, 3, CiaCieCio)) { add('S', 'X'); } else { add('S'); } current += 2; break; } // else add('K'); // name sent in 'mac caffrey', 'mac gregor' if (charAt(current + 1, CGQ)) { current += 3; } else { if (charAt(current + 1, CKQ) && !stringAt(current + 1, 2, CeCi)) { current += 2; } else { current++; } } break; case 'D': if(input.regionMatches(current, "DG", 0, 2)) { if (charAt(current + 2, EIY)) { //e.g. 'edge' add('J'); current += 3; break; } else { //e.g. 'edgar' add('T'); add('K'); current += 2; break; } } if (stringAt(current, 2, DdDt)) { add('T'); current += 2; break; } //else add('T'); current++; break; case 'F': // NTR: this is typical default behavior addCode('F', 'F'); break; case 'G': if (input.charAt(current + 1) == 'H') { if (current > 0 && !isVowel(current - 1)) { add('K'); current += 2; break; } if (current < 3) { // 'ghislane', 'ghiradelli' if (current == 0) { if (input.charAt(current + 2) == 'I') { add('J'); } else { add('K'); } current += 2; break; } } //Parker's rule (with some further refinements) - e.g., 'hugh' if((current > 1 && charAt(current - 2, BDH)) //e.g., 'bough' || (current > 2 && charAt(current - 3, BDH )) //e.g., 'broughton' || (current > 3 && charAt(current - 4, BH)) ) { current += 2; break; } else { //e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough' if (current > 2 && input.charAt(current - 1) == 'U' && charAt(current - 3, CGLRT) ) { add('F'); } else { if (current > 0 && input.charAt(current - 1) != 'I') { add('K'); } } current += 2; break; } } boolean slavoGermanic = isSlavoGermanic(); if (input.charAt(current + 1) == 'N') { if (current == 1 && isVowel(0) && !slavoGermanic) { primary.append('K'); add('N'); } else { //not e.g. 
'cagney' if (!input.regionMatches(current + 2, "EY", 0, 2) && (input.charAt(current + 1) != 'Y') && !slavoGermanic) { alternate.append('K'); add('N'); } else { add('K'); add('N'); } current += 2; break; } } //'tagliaro' if (input.regionMatches(current + 1, "LI", 0, 2) && !slavoGermanic) { primary.append('K'); add('L'); current += 2; break; } //-ges-,-gep-,-gel-, -gie- at beginning if((current == 0) && (input.charAt(current + 1) == 'Y' || stringAt(current + 1, 2, EbEiElEpErEsEyIbIlInIe)) ) { add('K', 'J'); current += 2; break; } // -ger-, -gy- if ((input.regionMatches(current + 1, "ER", 0, 2) || input.charAt(current + 1) == 'Y') && !stringAt(0, 6, DangerMangerRanger) && !charAt(current - 1, EI) && !stringAt(current - 1, 3, OgyRgy) ) { add('K', 'J'); current += 2; break; } // italian e.g, 'biaggi' if (charAt(current + 1, EIY) || stringAt(current - 1, 4, AggiOggi)) { //obvious germanic if ((stringAt(0, 4, Van_Von_) || input.regionMatches(0, "SCH", 0, 3)) || input.regionMatches(current + 1, "ET", 0, 2)) { add('K'); } else { //always soft if french ending if (input.regionMatches(current + 1, "IER ", 0, 4)) { add('J'); } else { add('J', 'K'); } current += 2; break; } } if (input.charAt(current + 1) == 'G') { current += 2; } else { current++; } add('K'); break; case 'H': // only keep if first & before vowel or btw. 2 vowels if ((current == 0 || isVowel(current - 1)) && isVowel(current + 1)) { add('H'); current += 2; } else { // also takes care of 'HH' current++; } break; case 'J': //obvious spanish, 'jose', 'san jacinto' if (stringAt(current, 4, "JOSE") || stringAt(0, 4, "SAN ")) { if ((current == 0 && (input.charAt(current + 4) == ' ')) || stringAt(0, 4, "SAN ")) { add('H'); } else { add('J', 'H'); } current +=1; break; } if (current == 0 && !stringAt(current, 4, "JOSE")) { add('J', 'A'); // Yankelovich/Jankelowicz } else { // spanish pron. of e.g. 
'bajador' if (isVowel(current - 1) && !isSlavoGermanic() && ((input.charAt(current + 1) == 'A') || (input.charAt(current + 1) == 'O'))) { add('J', 'H'); } else { if (current == last) { add('J', ' '); } else { if (!charAt(current + 1, BKLMNSTZ) && !charAt(current - 1, KLS)) { add('J'); } } } } current++; if(input.charAt(current) == 'J') current++; // doublecheck break; case 'K': // NTR: this is typical default behavior addCode('K', 'K'); break; case 'L': if (input.charAt(current + 1) == 'L') { //spanish e.g. 'cabrillo', 'gallegos' if (((current == (length - 3)) && stringAt(current - 1, 4, AlleIllaIllo)) || ((stringAt((last - 1), 2, AsOs) || charAt(last, AO)) && stringAt(current - 1, 4, "ALLE")) ) { primary.append('L'); current += 2; break; } current += 2; } else { current++; } add('L'); break; case 'M': if ((stringAt(current - 1, 3, "UMB") && (((current + 1) == last) || stringAt(current + 2, 2, "ER"))) //'dumb','thumb' || (input.charAt(current + 1) == 'M') ) { current += 2; } else { current++; } add('M'); break; case 'N': // NTR: this is typical default behavior addCode('N', 'N'); break; case 'Ñ': current++; add('N'); break; case 'P': if (input.charAt(current + 1) == 'H') { add('F'); current += 2; break; } //also account for 'campbell', 'raspberry' if (charAt(current + 1, BP)) current += 2; else current++; add('P'); break; case 'Q': // NTR: this is typical default behavior addCode('Q', 'K'); break; case 'R': //french e.g. 
'rogier', but exclude 'hochmeier' if ((current == last) && !isSlavoGermanic() && stringAt(current - 2, 2, "IE") && !stringAt(current - 4, 2, MaMe)) { alternate.append('R'); } else { add('R'); } current++; if(input.charAt(current) == 'R') current++; // doublecheck break; case 'S': //special cases 'island', 'isle', 'carlisle', 'carlysle' if (stringAt(current - 1, 3, IslYsl)) { current++; break; } //special case 'sugar-' if ((current == 0) && stringAt(current, 5, "SUGAR")) { add('X', 'S'); current++; break; } if (stringAt(current, 2, "SH")) { //germanic if (stringAt(current + 1, 4, HeimHoekHolmHolz)) { add('S'); } else { add('X'); } current += 2; break; } //italian & armenian if (stringAt(current, 3, SiaSio) || stringAt(current, 4, "SIAN")) { if (!isSlavoGermanic()) { add('S', 'X'); } else { add('S'); } current += 3; break; } //german & anglicisations, e.g. 'smith' match 'schmidt', 'snider' match 'schneider' //also, -sz- in slavic language altho in hungarian it is pronounced 's' if ((current == 0 && charAt(current + 1, LMNW)) || input.charAt(current + 1) == 'Z') { add('S', 'X'); if (input.charAt(current + 1) == 'Z') { current += 2; } else { current++; } break; } if (stringAt(current, 2, "SC")) { //Schlesinger's rule if (input.charAt(current + 2) == 'H') { //dutch origin, e.g. 'school', 'schooner' if (stringAt(current + 3, 2, EdEmEnErOoUy)) { //'schermerhorn', 'schenker' if (stringAt((current + 3), 2, EnEr)) { add('X', 'S'); alternate.append('K'); } else { add('S'); add('K'); } current += 3; break; } else { if (current == 0 && !isVowel(3) && input.charAt(3) != 'W') { add('X', 'S'); } else { add('X'); } current += 3; break; } } if (charAt(current + 2, EIY)) { add('S'); current += 3; break; } //else add('S'); add('K'); current += 3; break; } //french e.g. 
'resnais', 'artois' if (current == last && stringAt(current - 2, 2, AiOi)) { alternate.append('S'); } else { add('S'); } if (charAt(current + 1, SZ)) { current += 2; } else { current++; } break; case 'T': if (stringAt(current, 4, "TION")) { add('X'); current += 3; break; } if (stringAt(current, 3, TiaTch)) { add('X'); current += 3; break; } if (stringAt(current, 2, "TH") || stringAt(current, 3, "TTH")) { //special case 'thomas', 'thames' or germanic if (stringAt(current + 2, 2, AmOm) || stringAt(0, 4, Van_Von_) || stringAt(0, 3, "SCH")) { add('T'); } else { add('0', 'T'); } current += 2; break; } if (charAt(current + 1, DT)) current += 2; else current++; add('T'); break; case 'V': // NTR: this is typical default behavior addCode('V', 'F'); break; case 'W': //can also be in middle of word if (stringAt(current, 2, "WR")) { add('R'); current += 2; break; } if (current == 0 && (isVowel(current + 1) || stringAt(current, 2, "WH"))) { //Wasserman should match Vasserman if (isVowel(current + 1)) { add('A', 'F'); } else { //need 'Uomo' to match 'Womo' add('A'); } } //'Arnow' should match 'Arnoff' if ((current == last && isVowel(current - 1)) || stringAt(current - 1, 5, EwskiEwskyOwskiOwsky) || stringAt(0, 3, "SCH")) { alternate.append('F'); current +=1; break; } //polish e.g. 'filipowicz' if (stringAt(current, 4, WiczWitz)) { add('T', 'F'); add('S', 'X'); current +=4; break; } //else skip it current +=1; break; case 'X': //french e.g. breaux if (!(current == last && (stringAt((current - 3), 3, EauIau) || stringAt((current - 2), 2, AuOu))) ) { add('K'); add('S'); } if (charAt(current + 1, CX)) { current += 2; } else { current++; } break; case 'Z': //chinese pinyin e.g. 
'zhao' if (input.charAt(current + 1) == 'H') { add('J'); current += 2; break; } else { if (stringAt(current + 1, 2, ZaZiZo) || (isSlavoGermanic() && (current > 0 && input.charAt(current - 1) != 'T'))) { alternate.append('T'); add('S'); } else { add('S'); } } if (input.charAt(current + 1) == 'Z') { current += 2; } else { current++; } break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': add(input.charAt(current)); current++; break; default: current++; } // switch } // while // Only give back the specified length if (primary.length() > encodeLimit) { primary.delete(encodeLimit, primary.length()); } if (alternate.length() > encodeLimit) { alternate.delete(encodeLimit, alternate.length()); } return primary.toString(); } /** * Check if the two strings encode to the same primary or alternate encodings * using the Double Metaphone algorithm. * @param s1 * @param s2 * @return true/false */ public static boolean isEncodeEqual( String s1, String s2 ) { DoubleMetaphone dm1 = new DoubleMetaphone( s1 ); DoubleMetaphone dm2 = new DoubleMetaphone( s2 ); dm1.encode(); dm2.encode(); return dm1.getPrimary().equals( dm2.getPrimary() ) || dm1.getPrimary().equals( dm2.getAlternate() ) || dm1.getAlternate().equals( dm2.getPrimary() ); } } 1.3 +7 -4 jakarta-commons-sandbox/codec/src/test/org/apache/commons/codec/TestAll.java Index: TestAll.java =================================================================== RCS file: /home/cvs/jakarta-commons-sandbox/codec/src/test/org/apache/commons/codec/TestAll.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- TestAll.java 18 Nov 2002 13:00:26 -0000 1.2 +++ TestAll.java 3 Feb 2003 15:00:12 -0000 1.3 @@ -61,6 +61,8 @@ package org.apache.commons.codec; +import org.apache.commons.codec.language.TestDoubleMetaphone; + import junit.framework.Test; import junit.framework.TestCase; import junit.framework.TestSuite; @@ -81,6 +83,7 @@ suite.addTest(TestMetaphone.suite()); 
suite.addTest(TestSoundex.suite()); suite.addTest(TestRefinedSoundex.suite()); + suite.addTest(TestDoubleMetaphone.suite()); return suite; } 1.1 jakarta-commons-sandbox/codec/src/test/org/apache/commons/codec/language/TestDoubleMetaphone.java Index: TestDoubleMetaphone.java =================================================================== /* * $Header: /home/cvs/jakarta-commons-sandbox/codec/src/test/org/apache/commons/codec/language/TestDoubleMetaphone.java,v 1.1 2003/02/03 15:00:12 tobrien Exp $ * $Revision: 1.1 $ * $Date: 2003/02/03 15:00:12 $ * * ==================================================================== * * The Apache Software License, Version 1.1 * * Copyright (c) 2002 The Apache Software Foundation. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * 3. The end-user documentation included with the redistribution, if * any, must include the following acknowlegement: * "This product includes software developed by the * Apache Software Foundation (http://www.apache.org/)." * Alternately, this acknowlegement may appear in the software itself, * if and wherever such third-party acknowlegements normally appear. * * 4. The names "The Jakarta Project", "Commons", and "Apache Software * Foundation" must not be used to endorse or promote products derived * from this software without prior written permission. For written * permission, please contact [EMAIL PROTECTED] * * 5. 
Products derived from this software may not be called "Apache"
 *    nor may "Apache" appear in their names without prior written
 *    permission of the Apache Group.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation. For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 *
 */
package org.apache.commons.codec.language;

import org.apache.commons.codec.Encoder;
import org.apache.commons.codec.TestEncoder;

import junit.framework.Test;
import junit.framework.TestCase;
import junit.framework.TestSuite;

/**
 * JUnit test case for {@link DoubleMetaphone}: checks the primary encoding
 * of a fixed list of sample words.
 *
 * @version $Revision: 1.1 $ $Date: 2003/02/03 15:00:12 $
 * @author <a href="[EMAIL PROTECTED]">Kyle R. Burton</a>
 */
public class TestDoubleMetaphone extends TestEncoder {

    /** Encoder under test; created in setUp and released in tearDown. */
    private DoubleMetaphone _encoder = null;

    // These tests were taken from the Text::DoubleMetaphone
    // Perl module available from CPAN
    // Entry i of 'words' must encode to entry i of 'encodings'; a disabled
    // word has its expected code disabled at the same position.
    private String [] words = {
        //"maurice",
        "aubrey",
        "cambrillo",
        "heidi",
        "katherine",
        "catherine",
        "richard",
        "bob",
        "eric",
        "geoff",
        "dave",
        "ray",
        "steven",
        //"bryce",
        "randy",
        "bryan",
        "brian",
        "otto",
        "auto",
    };

    private String [] encodings = {
        // "MRS",
        "APR",
        "KMPR",
        "HT",
        "K0RN",
        "K0RN",
        "RXRT",
        "PP",
        "ARK",
        "JF",
        "TF",
        "R",
        "STFN",
        //"PRS",
        "RNT",
        "PRN",
        "PRN",
        "AT",
        "AT",
    };

    /**
     * Creates the named test case.
     * @param name the name handed to the superclass constructor
     */
    public TestDoubleMetaphone(String name) {
        super(name);
    }

    /**
     * Builds a suite from this class.
     * @return a TestSuite constructed from TestDoubleMetaphone.class
     */
    public static Test suite() {
        return (new TestSuite(TestDoubleMetaphone.class));
    }

    /** Runs the superclass set-up, then creates a fresh encoder. */
    public void setUp() throws Exception {
        super.setUp();
        _encoder = new DoubleMetaphone();
    }

    /** Runs the superclass tear-down, then drops the encoder. */
    public void tearDown() throws Exception {
        super.tearDown();
        _encoder = null;
    }

    /**
     * Supplies a new encoder instance.
     * @return a new DoubleMetaphone
     */
    protected Encoder makeEncoder() {
        return new DoubleMetaphone();
    }

    // ------------------------------------------------------------------------

    /** Each sample word must produce its expected primary encoding. */
    public void testDoubleMetaphone() {
        int idx = 0;
        while (idx < words.length) {
            final String word = words[idx];
            final String label = "encoding: " + word;
            final String expected = encodings[idx];
            assertEquals( label, expected, _encoder.encode(word) );
            ++idx;
        }
    }

    /** Placeholder: currently contains no assertions. */
    public void testIsDoubleMetaphoneEqual() {
        // need good examples of when two strings should encode to
        // the same values...
    }
}
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]