language TestDoubleMetaphone.java

tobrien Mon, 03 Feb 2003 06:59:46 -0800

tobrien     2003/02/03 07:00:12

  Modified:    codec/src/test/org/apache/commons/codec TestAll.java
  Added:       codec    TODO
               codec/src/java/org/apache/commons/codec/language
                        DoubleMetaphone.java
               codec/src/test/org/apache/commons/codec/language
                        TestDoubleMetaphone.java
  Log:
  Added DoubleMetaphone and associated JUnit test
  
  Revision  Changes    Path
  1.1                  jakarta-commons-sandbox/codec/TODO
  
  Index: TODO
  ===================================================================
  This is a list of action items to be finished in the [codec] project.  
  This TODO list reflects the current direction of development, and
  should be updated by all committers when a known issue or task
  is identified.  
  
  This TODO list should be periodically sync'd with the content on 
  http://nagoya.apache.org/wiki/apachewiki.cgi?CodecProjectPages - this
  WIKI page is provided as a tool for volunteers to comment on the 
  current TODO list and to suggest tasks.
  
  When a task in the TODO list is done, move the entry to the DONE list
  below, and note who made the change and when.  
  
  ** TODO List
  
  * Add a Hex implementation 
  
  * Add a Rot13 implementation 
  
  * Move phonetic encoders into dedicated package. 
  
  * Add a Decoder interface 
  
  * Refactor Base64 to implement both Encoder and Decoder 
  
  * Documentation! Create Forrest documentation for Codec AFTER documentation has 
evolved in Wiki 
  
  * Integrate Patches: 
  ** Patch submitted by Iulian Musat for Base64 
  ** Add DoubleMetaphone and Nysiis implementation from KyleBurton 
  
  * DoubleMetaphone
  ** Modify DoubleMetaphone implementation - make it thread safe(r).
  ** Figure out why algorithm fails to properly code "bryce" and "maurice".
  
  ** DONE 
  
  2/3/03 - TOB - Integrated DoubleMetaphone and Test from Kyle Burton
  2/2/03 - TOB - "language" package created to hold language and phonetic encodings 
  2/2/03 - TOB - All CRLF issues resolved in codec 
  1/31/03 - TOB - Patch submitted fixing CRLF problems in Soundex.java 
  1/31/03 - TOB - Patch submitted fixing CRLF problems in RefinedSoundex.java 
  
  
  1.1                  
jakarta-commons-sandbox/codec/src/java/org/apache/commons/codec/language/DoubleMetaphone.java
  
  Index: DoubleMetaphone.java
  ===================================================================
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001-2002 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Commons" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact [EMAIL PROTECTED]
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Turbine", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  package org.apache.commons.codec.language;
  
  import org.apache.commons.codec.Encoder;
  
  /**
   * A class to generate phonetic codings based on the double metaphone
   * algorithm.  This module is based on example code by Ed Parrish.
   *
   * <b>PLEASE NOTE:</b> This implementation is not thread-safe.  Please 
   * see TODO list for [codec] - Tim O'Brien
   *
   * @see http://www.cse.ucsc.edu/~eparrish/toolbox/search.html
   *
   * @version $Revision: 1.1 $
   * @author Ed Parrish
   * @author <a href="[EMAIL PROTECTED]">Kyle R. Burton</a> 
   */
  public class DoubleMetaphone implements Encoder {
  
    /**
     * The current character position in the string being encoded.  This is
     * per-instance mutable state advanced by the encoding methods, so one
     * instance must not be used by several threads at the same time.
     */
    private int current;
  
    /**
     * The maximum size of the phonetic encoding to compute.  Defaults to 4;
     * changed through setEncodeLimit, which rejects values below 1.
     */
    private int encodeLimit = 4;
  
    /** Buffer for the primary encoding; emptied at the start of encode(). */
    private StringBuffer primary = new StringBuffer();
  
    /** Buffer for the alternate encoding; emptied at the start of encode(). */
    private StringBuffer alternate = new StringBuffer();
  
    /**
     * Holder for the input being parsed.  Stays null until setInput is
     * called; setInput stores the upper-cased text followed by five spaces,
     * or the empty string when it is given null.
     */
    private String input = null;
  
    /** 
     * These structures are used to make the code easier to understand, modify,
     * debug, and otherwise maintain.
     *
     * Each table is named after the entries it holds.  The char[] tables are
     * matched with charAt(int, char[]) and the String[] tables with
     * stringAt(int, int, String[]).  Both helpers scan the whole table, so the
     * order of the entries does not matter (the name lists them
     * alphabetically even where the initializer does not, e.g. BKLMNSTZ).
     * A trailing underscore in a name stands for a space character in the
     * data (BFHLMNRVW_, Van_Von_).
     */
    // Single-character sets.  Note that 'Y' is treated as a vowel.
    private final static char[] vowels = {'A', 'E', 'I', 'O', 'U', 'Y'};
    private final static char[] AEOU = {'A', 'E', 'O', 'U'};
    private final static char[] AO = "AO".toCharArray();
    private final static char[] BDH = {'B', 'D', 'H'};
    private final static char[] BFHLMNRVW_ = "BFHLMNRVW ".toCharArray();
    private final static char[] BH = {'B', 'H'};
    private final static char[] BKLMNSTZ = "LTKSNMBZ".toCharArray();
    private final static char[] BP = "BP".toCharArray();
    private final static char[] CGQ = {'C', 'G', 'Q'};
    private final static char[] CGLRT = {'C', 'G', 'L', 'R', 'T'};
    private final static char[] CKQ = {'C', 'K', 'Q'};
    private final static char[] CX = "CX".toCharArray();
    private final static char[] DT = "DT".toCharArray();
    private final static char[] EI = {'E', 'I'};
    private final static char[] EIY = {'E', 'I', 'Y'};
    private final static char[] EHI = {'I', 'E', 'H'};
    private final static char[] KLS = "KLS".toCharArray();
    private final static char[] LMNW = "LMNW".toCharArray();
    private final static char[] ST = {'S', 'T'};
    private final static char[] SZ = "SZ".toCharArray();
    // Multi-character patterns.  All entries are upper case, matching the
    // upper-cased input stored by setInput.
    private final static String[] AggiOggi = {"AGGI", "OGGI"};
    private final static String[] AiOi = {"AI", "OI"};
    private final static String[] AlleIllaIllo = {"ILLO", "ILLA", "ALLE"};
    private final static String[] AmOm = {"OM", "AM"};
    private final static String[] AsOs = {"AS", "OS"};
    private final static String[] ArchitOrchesOrchid = {"ARCHIT", "ORCHES", "ORCHID"};
    private final static String[] AuOu = {"AU", "OU"};
    private final static String[] BacherMacher = {"BACHER", "MACHER"};
    private final static String[] CeCiCy = {"CI", "CE", "CY"};
    private final static String[] CeCi = {"CE", "CI"};
    private final static String[] CiaCieCio = {"CIO", "CIE", "CIA"};
    private final static String[] CkCgCq = {"CK", "CG", "CQ"};
    private final static String[] DangerMangerRanger = {"DANGER", "RANGER", "MANGER"};
    private final static String[] DdDt = {"DD", "DT"};
    private final static String[] EauIau = {"IAU", "EAU"};
    private final static String[] EbEiElEpErEsEyIbIlInIe = {"ES", "EP", "EB", "EL", 
"EY", "IB", "IL", "IN", "IE", "EI", "ER"};
    private final static String[] EdEmEnErOoUy = {"OO", "ER", "EN", "UY", "ED", "EM"};
    private final static String[] EnEr = {"ER", "EN"};
    private final static String[] EwskiEwskyOwskiOwsky = {"EWSKI", "EWSKY", "OWSKI", 
"OWSKY"};
    // Two-letter word beginnings; encode() skips the first letter of these.
    private final static String[] GnKnPnPsWr = {"GN", "KN", "PN", "WR", "PS"};
    private final static String[] HaracHaris = {"HARAC", "HARIS"};
    private final static String[] HeimHoekHolmHolz = {"HEIM", "HOEK", "HOLM", "HOLZ"};
    private final static String[] HemHiaHorHym = {"HOR", "HYM", "HIA", "HEM"};
    private final static String[] IslYsl = {"ISL", "YSL"};
    private final static String[] MaMe = {"ME", "MA"};
    private final static String[] OgyRgy = {"RGY", "OGY"};
    private final static String[] SiaSio = {"SIO", "SIA"};
    private final static String[] TiaTch = {"TIA", "TCH"};
    private final static String[] UcceeUcces = {"UCCEE", "UCCES"};
    private final static String[] Van_Von_ = {"VAN ", "VON "};
    private final static String[] WiczWitz = {"WICZ", "WITZ"};
    private final static String[] ZaZiZo = {"ZO", "ZI", "ZA"};
  
    /**
     * Creates an encoder with no input set.  Until an input is supplied,
     * encode() returns the empty string.
     */
    public DoubleMetaphone() {
      super();
    }
  
    /**
     * Creates an encoder and stores the given string as the input for a
     * later call to encode().
     * @param in the string to encode; null is stored as the empty string
     */
    public DoubleMetaphone( String in ) {
      super();
      this.setInput( in );
    }
  
    /**
     * Returns the primary encoding produced by the most recent call to
     * encode.  The value is empty until encode has processed a non-empty
     * input.
     * @return a snapshot of the primary encoding as a String.
     */
    public String getPrimary() {
      final String snapshot = this.primary.toString();
      return snapshot;
    }
  
    /**
     * Returns the buffer that holds the primary encoding.  This is the
     * encoder's own buffer, not a copy: encode() empties and refills it, and
     * changes made by the caller are seen by the encoder.
     * @return the internal string buffer for the primary encoding
     */
    public StringBuffer getPrimaryBuffer() {
      return this.primary;
    }
  
    /**
     * Returns the alternate encoding produced by the most recent call to
     * encode.  The value is empty until encode has processed a non-empty
     * input.
     * @return a snapshot of the alternate encoding as a String.
     */
    public String getAlternate() {
      final String snapshot = this.alternate.toString();
      return snapshot;
    }
  
    /**
     * Returns the buffer that holds the alternate encoding.  This is the
     * encoder's own buffer, not a copy: encode() empties and refills it, and
     * changes made by the caller are seen by the encoder.
     * @return the internal string buffer for the alternate encoding
     */
    public StringBuffer getAlternateBuffer() {
      return this.alternate;
    }
  
    /**
     * Returns the maximum encoding length that applies to both the primary
     * and the alternate encoding.  The encoding loop keeps running while
     * either encoding is still shorter than this limit.
     * @return the current encoding limit
     */
    public int getEncodeLimit() {
      return this.encodeLimit;
    }
  
    /**
     * Accessor for the maximum encoding length for both the primary and
     * alternate encodings.  Once either encoding reaches this limit, the
     * encoding loop will return.
     * @return the encoding limit
     */
    public boolean setEncodeLimit(int newLimit) {
      if (newLimit < 1) {
        throw new IllegalArgumentException("Error, limit [" + newLimit + "] must be a 
positive integer.");
      }
  
      encodeLimit = newLimit;
      return true;
    }
  
    /**
     * Stores the input to be encoded.  A non-null string is upper-cased and
     * followed by five spaces; presumably the padding keeps the encoder's
     * fixed look-aheads (such as charAt(current + 2)) inside the string.
     * A null argument is stored as the empty string.
     * @param in the input to be encoded; may be null.
     */
    private void setInput(String in) {
      if (in != null) {
        // Use a fixed locale: with the default locale a Turkish environment
        // maps 'i' to a dotted capital I, which the ASCII-based rules and
        // lookup tables would never match.
        input = in.toUpperCase(java.util.Locale.ENGLISH) + "     ";
      } else {
        input = "";
      }
    }
  
    /**
     * Appends the same phonetic code character to the primary and to the
     * alternate encoding.
     * @param ch the code character to append to both encodings.
     */
    private void add(char ch) {
      primary.append(ch);
      alternate.append(ch);
    }
  
    /**
     * Appends one phonetic code character to each encoding, allowing the
     * primary and the alternate encoding to receive different characters.
     * @param primaryChar the character appended to the primary encoding.
     * @param alternateChar the character appended to the alternate encoding.
     */
    private void add(char primaryChar, char alternateChar) {
      this.primary.append(primaryChar);
      this.alternate.append(alternateChar);
    }
  
    /**
     * Tells whether the input character at the given index is one of the
     * characters in the list.  An index outside the input never matches.
     * @param index position in the input string to inspect
     * @param list the characters to compare against
     * @return true if the index is inside the input and the character found
     *         there occurs in the list; false otherwise
     */
    private boolean charAt(int index, char[] list) {
      final boolean inside = index >= 0 && index < input.length();
      if (inside) {
        final char candidate = input.charAt(index);
        int pos = 0;
        while (pos < list.length) {
          if (list[pos] == candidate) {
            return true;
          }
          pos++;
        }
      }
      return false;
    }
  
    /**
     * Tells whether the input, starting at the given index, matches the
     * given pattern.  Delegates to the list variant with a one-element list.
     * @param start the index where the comparison begins
     * @param length the number of characters to compare
     * @param str the pattern string to look for
     * @return true if the pattern matches at that position; false otherwise
     */
    private boolean stringAt(int start, int length, String str) {
      return stringAt(start, length, new String[] {str});
    }
  
    /**
     * Tells whether the input, starting at the given index, matches any of
     * the given pattern strings over the given number of characters.  A
     * non-positive length never matches.
     * @param start the index where the comparison begins
     * @param length the number of characters to compare
     * @param list the pattern strings to look for
     * @return true as soon as one pattern matches; false if none does
     */
    private boolean stringAt(int start, int length, String[] list) {
      boolean matched = false;
      if (length > 0) {
        int idx = 0;
        // Stop at the first hit, like an early return would.
        while (!matched && idx < list.length) {
          matched = input.regionMatches(start, list[idx], 0, length);
          idx++;
        }
      }
      return matched;
    }
  
    /**
     * Tells whether the input character at the given index is a vowel.  The
     * vowel set is A, E, I, O, U and Y; an index outside the input is not a
     * vowel.
     * @param index the position of the character to test
     * @return true if the character at index is a vowel; false otherwise
     */
    private boolean isVowel(int index) {
      final boolean vowelFound = charAt(index, vowels);
      return vowelFound;
    }
  
    /**
     * Tests whether the input is likely to be Slavo-Germanic in nature.
     * This affects some of the encoding decisions about the phonetic
     * pronunciation of portions of the name.  The input qualifies when it
     * contains 'W', 'K', "CZ" or "WITZ" anywhere.
     * @return true if any of those markers occurs in the input
     */
    private boolean isSlavoGermanic() {
      return input.indexOf('W') >= 0
          || input.indexOf('K') >= 0
          || input.indexOf("CZ") >= 0
          || input.indexOf("WITZ") >= 0;
    }
  
    /**
     * Appends the given code to both encodings and advances past the current
     * input character, plus one more position when the next input character
     * is the same letter (a doubled consonant yields a single code).
     * @param ch the input letter being handled, used to detect a doubled
     *           letter
     * @param code the phonetic code appended to both encodings
     */
    private void addCode(char ch, char code) {
      add(code, code);
      current += 1;
      final boolean doubled = input.charAt(current) == ch;
      if (doubled) {
        current += 1;
      }
    }
  
    /**
     * Static version of encode that constructs a new DoubleMetaphone object
     * and invokes encode on it.  By using this method you give up the
     * ability to access the alternate encoding.  Because every call works
     * on its own freshly created encoder, no encoder state is shared between
     * calls.
     *
     * This method was originally created to allow this encoder to be used as
     * a Java Stored Procedure in Oracle.
     *
     * @param in the string to encode
     * @return the primary encoding of the string
     */ 
    public static String sencode( String in ) {
      return new DoubleMetaphone().encode(in);
    }
  
    /**
     * Stores the given string as the input and encodes it with the Double
     * Metaphone algorithm.  Double Metaphone produces two encodings, a
     * primary and a secondary one; this method returns the primary encoding.
     * Call getAlternate afterwards to read the secondary encoding.
     * @param in the input string to encode
     * @return the primary encoding.
     */
    public String encode( String in ) {
      this.setInput(in);
      final String primaryCode = this.encode();
      return primaryCode;
    }
  
    /**
     * Encode the already set input string using the Double Metaphone algorithm.
     * Double Metaphone produces two encodings, a primary and a secondary.  The
     * encode method returns the primary encoding.  To access the secondary
     * encoding, call getAlternate. 
     * @return the primary encoding.
     */
    public String encode() {
      if (input == null) return "";
      primary.delete(0, primary.length());
      alternate.delete(0, alternate.length());
      int length = input.length();
      if (length < 1) return "";
      int last = length - 1; //zero based index
      current = 0;
  
      //skip these when at start of word
      if (stringAt(0, 2, GnKnPnPsWr)) current++;
  
      //Initial 'X' is pronounced 'Z' e.g. 'Xavier'
      if(input.startsWith("X")) {
        add('S');  //'Z' maps to 'S'
        current++;
      }
  
      while (primary.length() < encodeLimit || alternate.length() < encodeLimit) {
          if(current >= length) break;
  
          // this is coded as a huge switch statement for performance
          switch(input.charAt(current)) {
              case 'A':
              case 'E':
              case 'I':
              case 'O':
              case 'U':
              case 'Y':
                  if (current == 0) add('A'); // all init vowels map to 'A'
                  current++;
                  break;
  
              case 'B':
                  // "-mb", e.g "dumb", already skipped over...
                  addCode('B', 'P');
                  break;
  
              case '�':
                  add('S');
                  current++;
                  // Note: no doublecheck
                  break;
  
              case 'C':
                  // various germanic
                  if((current > 1) && !isVowel(current - 2)
                          && input.regionMatches(current - 1, "ACH", 0, 3)
                          && (input.charAt(current + 2) != 'I'
                          && input.charAt(current + 2) != 'E'
                          || stringAt(current - 2, 6, BacherMacher) )) {
                      add('K');
                      current +=2;
                      break;
                  }
  
                  // special case 'caesar'
                  if (current == 0
                          && input.regionMatches(current, "CAESAR", 0, 6)) {
                      add('S');
                      current +=2;
                      break;
                  }
  
                  //italian 'chianti'
                  if (input.regionMatches(current, "CHIA", 0, 4)) {
                      add('K');
                      current +=2;
                      break;
                  }
  
                  if (input.regionMatches(current, "CH", 0, 2)) {
                      //find 'michael'
                      if(current > 0
                              && input.regionMatches(current, "CHAE", 0, 4)) {
                          add('K', 'X');
                          current +=2;
                          break;
                      }
  
                      // greek roots e.g. 'chemistry', 'chorus'
                      if (current == 0
                              && (stringAt(current + 1, 5, HaracHaris)
                              || stringAt((current + 1), 3, HemHiaHorHym))
                              && !input.regionMatches(0, "CHORE", 0, 5)) {
                          add('K');
                          current +=2;
                          break;
                      }
  
                      // germanic, greek, or otherwise 'ch' for 'kh' sound
                      if ((stringAt(0, 4, Van_Von_)
                              || input.regionMatches(0, "SCH ", 0, 3))
                              // 'architect' but not 'arch', 'orchestra', 'orchid'
                              || stringAt(0, 6, ArchitOrchesOrchid)
                              || charAt(current + 2, ST)
                              || ((charAt(current - 1, AEOU)
                              || current == 0)
                              // e.g. 'wachtler', 'wechsler', but not 'tichner'
                              && charAt(current + 2, BFHLMNRVW_))) {
                           add('K');
                      } else {
                          if (current > 0) {
                              if (input.regionMatches(0, "MC", 0, 2)) {
                                  // e.g. "McHugh"
                                  add('K');
                              } else {
                                  add('X', 'K');
                              }
                          } else {
                              add('X');
                          }
                       }
                       current +=2;
                       break;
                  }
  
                  // e.g. 'czerny'
                  if (input.regionMatches(current, "CZ", 0, 2)
                          && !input.regionMatches(current - 2, "WICZ", 0, 4)) {
                      add('S', 'X');
                      current += 2;
                      break;
                  }
  
                  // e.g. 'focaccia'
                  if (input.regionMatches(current + 1, "CIA", 0, 3)) {
                      add('X');
                      current += 3;
                      break;
                  }
  
                  // double 'C', but not if e.g. 'McClellan'
                  if (input.regionMatches(current, "CC", 0, 2)
                          && !((current == 1) && (input.charAt(0) == 'M'))) {
                      // 'bellocchio' but not 'bacchus'
                      if (charAt(current + 2, EHI)
                              && !input.regionMatches(current + 2, "HU", 0, 2)) {
                          // 'accident', 'accede' 'succeed'
                          if(((current == 1) && (input.charAt(current - 1) == 'A'))
                                  || stringAt(current - 1, 5, UcceeUcces)) {
                              add('K');
                              add('S');
                          } else { // 'bacci', 'bertucci', other italian
                              add('X');
                          }
                          current += 3;
                          break;
                      } else { // Pierce's rule
                          add('K');
                          current += 2;
                          break;
                      }
                  }
  
                  if (stringAt(0, 2, CkCgCq)) {
                      add('K');
                      current += 2;
                      break;
                  }
  
                  if (stringAt(0, 2, CeCiCy)) {
                      // italian vs. english
                      if (stringAt(0, 3, CiaCieCio)) {
                          add('S', 'X');
                      } else {
                          add('S');
                      }
                      current += 2;
                      break;
                  }
  
                  // else
                  add('K');
  
                  // name sent in 'mac caffrey', 'mac gregor'
                  if (charAt(current + 1, CGQ)) {
                      current += 3;
                  } else {
                      if (charAt(current + 1, CKQ)
                              && !stringAt(current + 1, 2, CeCi)) {
                          current += 2;
                      } else {
                          current++;
                      }
                  }
                  break;
  
              case 'D':
                  if(input.regionMatches(current, "DG", 0, 2)) {
                      if (charAt(current + 2, EIY)) {
                          //e.g. 'edge'
                          add('J');
                          current += 3;
                          break;
                      } else {
                          //e.g. 'edgar'
                          add('T');
                          add('K');
                          current += 2;
                          break;
                      }
                  }
  
                  if (stringAt(current, 2, DdDt)) {
                      add('T');
                      current += 2;
                      break;
                  }
  
                  //else
                  add('T');
                  current++;
                  break;
  
              case 'F':  // NTR: this is typical default behavior
                  addCode('F', 'F');
                  break;
  
              case 'G':
                  if (input.charAt(current + 1) == 'H') {
                      if (current > 0 && !isVowel(current - 1)) {
                          add('K');
                          current += 2;
                          break;
                      }
  
                      if (current < 3) {
                          // 'ghislane', 'ghiradelli'
                          if (current == 0) {
                              if (input.charAt(current + 2) == 'I') {
                                  add('J');
                              } else {
                                  add('K');
                              }
                              current += 2;
                              break;
                          }
                      }
                      //Parker's rule (with some further refinements) - e.g., 'hugh'
                      if((current > 1 && charAt(current - 2, BDH))
                              //e.g., 'bough'
                              || (current > 2 && charAt(current - 3, BDH ))
                              //e.g., 'broughton'
                              || (current > 3 && charAt(current - 4, BH)) ) {
                          current += 2;
                          break;
                      } else {
                          //e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough',
                          //'tough'
                          if (current > 2 && input.charAt(current - 1) == 'U'
                                  && charAt(current - 3, CGLRT) ) {
                              add('F');
                          } else {
                              if (current > 0 && input.charAt(current - 1) != 'I') {
                                  add('K');
                              }
                          }
                          current += 2;
                          break;
                      }
                  }
  
                  boolean slavoGermanic = isSlavoGermanic();
                  if (input.charAt(current + 1) == 'N') {
                      if (current == 1 && isVowel(0) && !slavoGermanic) {
                          primary.append('K');
                          add('N');
                      } else {
                          //not e.g. 'cagney'
                          if (!input.regionMatches(current + 2, "EY", 0, 2)
                                  && (input.charAt(current + 1) != 'Y')
                                  && !slavoGermanic) {
                              alternate.append('K');
                              add('N');
                          } else {
                              add('K');
                              add('N');
                          }
                          current += 2;
                          break;
                      }
                  }
  
                  //'tagliaro'
                  if (input.regionMatches(current + 1, "LI", 0, 2)
                          && !slavoGermanic) {
                      primary.append('K');
                      add('L');
                      current += 2;
                      break;
                  }
  
                  //-ges-,-gep-,-gel-, -gie- at beginning
                  if((current == 0)
                          && (input.charAt(current + 1) == 'Y'
                          || stringAt(current + 1, 2, EbEiElEpErEsEyIbIlInIe)) ) {
                      add('K', 'J');
                      current += 2;
                      break;
                  }
  
                  // -ger-,  -gy-
                  if ((input.regionMatches(current + 1, "ER", 0, 2)
                          || input.charAt(current + 1) == 'Y')
                          && !stringAt(0, 6, DangerMangerRanger)
                          && !charAt(current - 1, EI)
                          && !stringAt(current - 1, 3, OgyRgy) ) {
                      add('K', 'J');
                      current += 2;
                      break;
                  }
  
                  // italian e.g, 'biaggi'
                  if (charAt(current + 1, EIY)
                          || stringAt(current - 1, 4, AggiOggi)) {
                      //obvious germanic
                      if ((stringAt(0, 4, Van_Von_)
                              || input.regionMatches(0, "SCH", 0, 3))
                              || input.regionMatches(current + 1, "ET", 0, 2)) {
                           add('K');
                      } else {
                          //always soft if french ending
                          if (input.regionMatches(current + 1, "IER ", 0, 4)) {
                              add('J');
                          } else {
                              add('J', 'K');
                          }
                      current += 2;
                      break;
                      }
                  }
  
                  if (input.charAt(current + 1) == 'G') {
                      current += 2;
                  } else {
                      current++;
                  }
                  add('K');
                  break;
  
              case 'H':
                  // only keep if first & before vowel or btw. 2 vowels
                  if ((current == 0 || isVowel(current - 1))
                          && isVowel(current + 1)) {
                      add('H');
                      current += 2;
                  } else { // also takes care of 'HH'
                      current++;
                  }
                  break;
  
              case 'J':
                  //obvious spanish, 'jose', 'san jacinto'
                  if (stringAt(current, 4, "JOSE") || stringAt(0, 4, "SAN ")) {
                      if ((current == 0 && (input.charAt(current + 4) == ' '))
                              || stringAt(0, 4, "SAN ")) {
                          add('H');
                      } else {
                          add('J', 'H');
                      }
                      current +=1;
                      break;
                  }
  
                  if (current == 0 && !stringAt(current, 4, "JOSE")) {
                      add('J', 'A'); // Yankelovich/Jankelowicz
                  } else {
                      // spanish pron. of e.g. 'bajador'
                      if (isVowel(current - 1) && !isSlavoGermanic()
                              && ((input.charAt(current + 1) == 'A')
                              || (input.charAt(current + 1) == 'O'))) {
                          add('J', 'H');
                      } else {
                          if (current == last) {
                              add('J', ' ');
                          } else {
                              if (!charAt(current + 1, BKLMNSTZ)
                                      && !charAt(current - 1, KLS)) {
                                  add('J');
                              }
                           }
                       }
                  }
  
                  current++;
                  if(input.charAt(current) == 'J') current++; // doublecheck
                  break;
  
              case 'K':  // NTR: this is typical default behavior
                  addCode('K', 'K');
                  break;
  
              case 'L':
                  if (input.charAt(current + 1) == 'L') {
                      //spanish e.g. 'cabrillo', 'gallegos'
                      if (((current == (length - 3))
                             && stringAt(current - 1, 4, AlleIllaIllo))
                             || ((stringAt((last - 1), 2, AsOs)
                             || charAt(last, AO))
                             && stringAt(current - 1, 4, "ALLE")) ) {
                          primary.append('L');
                          current += 2;
                          break;
                     }
                     current += 2;
                  } else {
                     current++;
                  }
                  add('L');
                  break;
  
              case 'M':
                  if ((stringAt(current - 1, 3, "UMB")
                          && (((current + 1) == last)
                          || stringAt(current + 2, 2, "ER")))
                          //'dumb','thumb'
                          || (input.charAt(current + 1) == 'M') ) {
                      current += 2;
                  } else {
                      current++;
                  }
                  add('M');
                  break;
  
              case 'N':  // NTR: this is typical default behavior
                  addCode('N', 'N');
                  break;
  
              case '�':
                  current++;
                  add('N');
                  break;
  
              case 'P':
                  if (input.charAt(current + 1) == 'H') {
                      add('F');
                      current += 2;
                      break;
                  }
  
                  //also account for 'campbell', 'raspberry'
                  if (charAt(current + 1, BP))
                      current += 2;
                  else
                      current++;
                  add('P');
                  break;
  
              case 'Q':  // NTR: this is typical default behavior
                  addCode('Q', 'K');
                  break;
  
              case 'R':
                  //french e.g. 'rogier', but exclude 'hochmeier'
                  if ((current == last)
                          && !isSlavoGermanic()
                          && stringAt(current - 2, 2, "IE")
                          && !stringAt(current - 4, 2, MaMe)) {
                      alternate.append('R');
                  } else {
                      add('R');
                  }
  
                  current++;
                  if(input.charAt(current) == 'R') current++; // doublecheck
                  break;
  
              case 'S':
                  //special cases 'island', 'isle', 'carlisle', 'carlysle'
                  if (stringAt(current - 1, 3, IslYsl)) {
                      current++;
                      break;
                  }
  
                  //special case 'sugar-'
                  if ((current == 0) && stringAt(current, 5, "SUGAR")) {
                      add('X', 'S');
                      current++;
                      break;
                  }
  
                  if (stringAt(current, 2, "SH")) {
                      //germanic
                      if (stringAt(current + 1, 4, HeimHoekHolmHolz)) {
                          add('S');
                      } else {
                          add('X');
                      }
                      current += 2;
                      break;
                  }
  
                  //italian & armenian
                  if (stringAt(current, 3, SiaSio)
                          || stringAt(current, 4, "SIAN")) {
                      if (!isSlavoGermanic()) {
                          add('S', 'X');
                      } else {
                          add('S');
                      }
                      current += 3;
                      break;
                  }
  
                  //german & anglicisations, e.g. 'smith' match 'schmidt', 'snider' match 'schneider'
                  //also, -sz- in slavic language altho in hungarian it is pronounced 's'
                  if ((current == 0 && charAt(current + 1, LMNW))
                          || input.charAt(current + 1) == 'Z') {
                      add('S', 'X');
                      if (input.charAt(current + 1) == 'Z') {
                          current += 2;
                      } else {
                          current++;
                      }
                      break;
                  }
  
                  if (stringAt(current, 2, "SC")) {
                      //Schlesinger's rule
                      if (input.charAt(current + 2) == 'H') {
                          //dutch origin, e.g. 'school', 'schooner'
                          if (stringAt(current + 3, 2, EdEmEnErOoUy)) {
                              //'schermerhorn', 'schenker'
                              if (stringAt((current + 3), 2, EnEr)) {
                                  add('X', 'S');
                                  alternate.append('K');
                              } else {
                                  add('S');
                                  add('K');
                              }
                              current += 3;
                              break;
                          } else {
                              if (current == 0 && !isVowel(3)
                                      && input.charAt(3) != 'W') {
                                  add('X', 'S');
                              } else {
                                  add('X');
                              }
                              current += 3;
                              break;
                          }
                      }
  
                      if (charAt(current + 2, EIY)) {
                          add('S');
                          current += 3;
                          break;
                      }
  
                      //else
                      add('S');
                      add('K');
                      current += 3;
                      break;
                  }
  
                  //french e.g. 'resnais', 'artois'
                  if (current == last && stringAt(current - 2, 2, AiOi)) {
                      alternate.append('S');
                  } else {
                      add('S');
                  }
  
                  if (charAt(current + 1, SZ)) {
                      current += 2;
                  } else {
                      current++;
                  }
                  break;
  
              case 'T':
                  if (stringAt(current, 4, "TION")) {
                      add('X');
                      current += 3;
                      break;
                  }
  
                  if (stringAt(current, 3, TiaTch)) {
                      add('X');
                      current += 3;
                      break;
                  }
  
                  if (stringAt(current, 2, "TH") || stringAt(current, 3, "TTH")) {
                      //special case 'thomas', 'thames' or germanic
                      if (stringAt(current + 2, 2, AmOm)
                              || stringAt(0, 4, Van_Von_)
                              || stringAt(0, 3, "SCH")) {
                          add('T');
                      } else {
                          add('0', 'T');
                      }
                      current += 2;
                      break;
                  }
  
                  if (charAt(current + 1, DT))
                      current += 2;
                  else
                      current++;
                  add('T');
                  break;
  
              case 'V':  // NTR: this is typical default behavior
                  addCode('V', 'F');
                  break;
  
              case 'W':
                  //can also be in middle of word
                  if (stringAt(current, 2, "WR")) {
                      add('R');
                      current += 2;
                      break;
                  }
  
                  if (current == 0 && (isVowel(current + 1)
                          || stringAt(current, 2, "WH"))) {
                      //Wasserman should match Vasserman
                      if (isVowel(current + 1)) {
                          add('A', 'F');
                      } else {
                          //need 'Uomo' to match 'Womo'
                          add('A');
                      }
                  }
  
                  //'Arnow' should match 'Arnoff'
                  if ((current == last && isVowel(current - 1))
                  || stringAt(current - 1, 5, EwskiEwskyOwskiOwsky)
                  || stringAt(0, 3, "SCH")) {
                      alternate.append('F');
                      current +=1;
                      break;
                  }
  
                  //polish e.g. 'filipowicz'
                  if (stringAt(current, 4, WiczWitz)) {
                      add('T', 'F');
                      add('S', 'X');
                      current +=4;
                      break;
                  }
  
                  //else skip it
                  current +=1;
                  break;
  
              case 'X':
                  //french e.g. breaux
                  if (!(current == last && (stringAt((current - 3), 3, EauIau)
                  || stringAt((current - 2), 2, AuOu))) ) {
                      add('K');
                      add('S');
                  }
  
                  if (charAt(current + 1, CX)) {
                      current += 2;
                  } else {
                      current++;
                  }
                  break;
  
              case 'Z':
                  //chinese pinyin e.g. 'zhao'
                  if (input.charAt(current + 1) == 'H') {
                      add('J');
                      current += 2;
                      break;
                  } else {
                      if (stringAt(current + 1, 2, ZaZiZo)
                      || (isSlavoGermanic() && (current > 0
                      && input.charAt(current - 1) != 'T'))) {
                          alternate.append('T');
                          add('S');
                      } else {
                          add('S');
                      }
                  }
  
                  if (input.charAt(current + 1) == 'Z') {
                      current += 2;
                  } else {
                      current++;
                  }
                  break;
  
              case '0':
              case '1':
              case '2':
              case '3':
              case '4':
              case '5':
              case '6':
              case '7':
              case '8':
              case '9':
                  add(input.charAt(current));
                  current++;
                  break;
  
              default:
                  current++;
          } // switch
      } // while
  
      // Only give back the specified length
      if (primary.length() > encodeLimit) {
          primary.delete(encodeLimit, primary.length());
      }
      if (alternate.length() > encodeLimit) {
          alternate.delete(encodeLimit, alternate.length());
      }
  
      return primary.toString();
    }
  
    /**
     * Checks whether two strings share a Double Metaphone code.
     * <p>
     * A separate DoubleMetaphone instance is created and encoded for each
     * argument. The strings are considered equal when their primary codes
     * match, or when the primary code of one matches the alternate code of
     * the other. The two alternate codes are never compared with each other.
     *
     * @param s1 the first string to encode and compare
     * @param s2 the second string to encode and compare
     * @return true if the primary codes are equal, or if the primary code of
     *         either string equals the alternate code of the other;
     *         false otherwise
     */
    public static boolean isEncodeEqual( String s1, String s2 ) {
      DoubleMetaphone first = new DoubleMetaphone( s1 );
      DoubleMetaphone second = new DoubleMetaphone( s2 );
      first.encode();
      second.encode();

      String firstPrimary = first.getPrimary();
      String secondPrimary = second.getPrimary();

      // primary vs. primary
      if (firstPrimary.equals( secondPrimary )) {
        return true;
      }
      // primary of the first vs. alternate of the second
      if (firstPrimary.equals( second.getAlternate() )) {
        return true;
      }
      // alternate of the first vs. primary of the second
      return first.getAlternate().equals( secondPrimary );
    }
  }
  
  
  
  
  1.3       +7 -4      
jakarta-commons-sandbox/codec/src/test/org/apache/commons/codec/TestAll.java
  
  Index: TestAll.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-commons-sandbox/codec/src/test/org/apache/commons/codec/TestAll.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- TestAll.java      18 Nov 2002 13:00:26 -0000      1.2
  +++ TestAll.java      3 Feb 2003 15:00:12 -0000       1.3
  @@ -61,6 +61,8 @@
   
   package org.apache.commons.codec;
   
  +import org.apache.commons.codec.language.TestDoubleMetaphone;
  +
   import junit.framework.Test;
   import junit.framework.TestCase;
   import junit.framework.TestSuite;
  @@ -81,6 +83,7 @@
           suite.addTest(TestMetaphone.suite());
           suite.addTest(TestSoundex.suite());
           suite.addTest(TestRefinedSoundex.suite());
  +     suite.addTest(TestDoubleMetaphone.suite());
           return suite;
       }
           
  
  
  
  1.1                  
jakarta-commons-sandbox/codec/src/test/org/apache/commons/codec/language/TestDoubleMetaphone.java
  
  Index: TestDoubleMetaphone.java
  ===================================================================
  /*
   * $Header: 
/home/cvs/jakarta-commons-sandbox/codec/src/test/org/apache/commons/codec/language/TestDoubleMetaphone.java,v
 1.1 2003/02/03 15:00:12 tobrien Exp $
   * $Revision: 1.1 $
   * $Date: 2003/02/03 15:00:12 $
   *
   * ====================================================================
   *
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2002 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution, if
   *    any, must include the following acknowlegement:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowlegement may appear in the software itself,
   *    if and wherever such third-party acknowlegements normally appear.
   *
   * 4. The names "The Jakarta Project", "Commons", and "Apache Software
   *    Foundation" must not be used to endorse or promote products derived
   *    from this software without prior written permission. For written
   *    permission, please contact [EMAIL PROTECTED]
   *
   * 5. Products derived from this software may not be called "Apache"
   *    nor may "Apache" appear in their names without prior written
   *    permission of the Apache Group.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   *
   */
  package org.apache.commons.codec.language;
  
  import org.apache.commons.codec.Encoder;
  import org.apache.commons.codec.TestEncoder;
  
  import junit.framework.Test;
  import junit.framework.TestCase;
  import junit.framework.TestSuite;
  
  /**
   * JUnit test case for the {@link DoubleMetaphone} encoder: every sample word
   * is encoded and the result is compared with its expected code.
   *
   * @version $Revision: 1.1 $ $Date: 2003/02/03 15:00:12 $
   * @author <a href="[EMAIL PROTECTED]">Kyle R. Burton</a>
   */
  public class TestDoubleMetaphone extends TestEncoder {

    /**
     * Sample words paired with the code expected from encoding them
     * ({ word, expected code }).
     * These tests were taken from the Text::DoubleMetaphone
     * Perl module available from CPAN.
     * The "maurice" and "bryce" samples are disabled.
     */
    private static final String[][] SAMPLES = {
      // { "maurice", "MRS" },
      { "aubrey", "APR" },
      { "cambrillo", "KMPR" },
      { "heidi", "HT" },
      { "katherine", "K0RN" },
      { "catherine", "K0RN" },
      { "richard", "RXRT" },
      { "bob", "PP" },
      { "eric", "ARK" },
      { "geoff", "JF" },
      { "dave", "TF" },
      { "ray", "R" },
      { "steven", "STFN" },
      // { "bryce", "PRS" },
      { "randy", "RNT" },
      { "bryan", "PRN" },
      { "brian", "PRN" },
      { "otto", "AT" },
      { "auto", "AT" },
    };

    /** Encoder under test; created in setUp() and released in tearDown(). */
    private DoubleMetaphone doubleMetaphone;

    /**
     * Creates the test case.
     * @param name name passed on to the TestEncoder constructor
     */
    public TestDoubleMetaphone(String name) {
      super(name);
    }

    /**
     * @return a suite built from this test class
     */
    public static Test suite() {
      return new TestSuite(TestDoubleMetaphone.class);
    }

    /**
     * Runs the superclass set-up, then creates a fresh encoder for the test.
     */
    public void setUp() throws Exception {
      super.setUp();
      doubleMetaphone = new DoubleMetaphone();
    }

    /**
     * Runs the superclass tear-down, then drops the encoder reference.
     */
    public void tearDown() throws Exception {
      super.tearDown();
      doubleMetaphone = null;
    }

    /**
     * @return a new DoubleMetaphone encoder
     */
    protected Encoder makeEncoder() {
      return new DoubleMetaphone();
    }

    // ------------------------------------------------------------------------

    /**
     * Encodes each sample word and checks the result against its expected code.
     */
    public void testDoubleMetaphone() {
      for (int index = 0; index < SAMPLES.length; index++) {
        String word = SAMPLES[index][0];
        String expected = SAMPLES[index][1];
        assertEquals("encoding: " + word, expected, doubleMetaphone.encode(word));
      }
    }

    /**
     * Placeholder: intentionally empty until suitable examples exist.
     */
    public void testIsDoubleMetaphoneEqual() {
      // need good examples of when two strings should encode to
      // the same values...
    }
  }


---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

cvs commit: jakarta-commons-sandbox/codec/src/test/org/apache/commons/codec/language TestDoubleMetaphone.java

Reply via email to