Author: ggregory Date: Sat Aug 6 02:09:37 2011 New Revision: 1154423 URL: http://svn.apache.org/viewvc?rev=1154423&view=rev Log: [CODEC-125] Implement a Beider-Morse phonetic matching codec. Apply Matthew's patch https://issues.apache.org/jira/secure/attachment/12489548/performanceAndBugs.patch
Modified: commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/PhoneticEngine.java commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Rule.java commons/proper/codec/trunk/src/resources/org/apache/commons/codec/language/bm/ash_approx_common.txt commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/BeiderMorseEncoderTest.java commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/LanguageGuessingTest.java commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/PhoneticEngineTest.java commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/RuleTest.java Modified: commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/PhoneticEngine.java URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/PhoneticEngine.java?rev=1154423&r1=1154422&r2=1154423&view=diff ============================================================================== --- commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/PhoneticEngine.java (original) +++ commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/PhoneticEngine.java Sat Aug 6 02:09:37 2011 @@ -27,6 +27,7 @@ import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Set; +import java.util.TreeSet; /** * <p> @@ -61,7 +62,7 @@ public class PhoneticEngine { this.phonemes = phonemes; } - public PhonemeBuilder append(String str) { + public PhonemeBuilder append(CharSequence str) { Set<Rule.Phoneme> newPhonemes = new HashSet<Rule.Phoneme>(); for (Rule.Phoneme ph : this.phonemes) { @@ -91,19 +92,14 @@ public class PhoneticEngine { } public String makeString() { - List<String> sorted = new ArrayList<String>(); - for (Rule.Phoneme ph : this.phonemes) { - sorted.add(ph.getPhonemeText()); - } - - Collections.sort(sorted); StringBuilder sb = new StringBuilder(); + // 
System.err.println(this.phonemes.getClass()); - for (String ph : sorted) { + for (Rule.Phoneme ph : this.phonemes) { if (sb.length() > 0) sb.append("|"); - sb.append(ph); + sb.append(ph.getPhonemeText()); } return sb.toString(); @@ -112,13 +108,13 @@ public class PhoneticEngine { private static class RulesApplication { private final List<Rule> finalRules; - private final String input; + private final CharSequence input; private PhonemeBuilder phonemeBuilder; private int i; private boolean found; - public RulesApplication(List<Rule> finalRules, String input, PhonemeBuilder phonemeBuilder, int i) { + public RulesApplication(List<Rule> finalRules, CharSequence input, PhonemeBuilder phonemeBuilder, int i) { if (finalRules == null) { throw new NullPointerException("The finalRules argument must not be null"); } @@ -227,11 +223,11 @@ public class PhoneticEngine { return phonemeBuilder; } - Set<Rule.Phoneme> phonemes = new HashSet<Rule.Phoneme>(); + Set<Rule.Phoneme> phonemes = new TreeSet<Rule.Phoneme>(); for (Rule.Phoneme phoneme : phonemeBuilder.getPhonemes()) { PhonemeBuilder subBuilder = PhonemeBuilder.empty(phoneme.getLanguages()); - String phonemeText = phoneme.getPhonemeText(); + CharSequence phonemeText = phoneme.getPhonemeText(); // System.err.println("Expanding: " + phonemeText); for (int i = 0; i < phonemeText.length();) { @@ -241,7 +237,7 @@ public class PhoneticEngine { if (!found) { // System.err.println("Not found. 
Appending as-is"); - subBuilder = subBuilder.append(phonemeText.substring(i, i + 1)); + subBuilder = subBuilder.append(phonemeText.subSequence(i, i + 1)); } i = rulesApplication.getI(); @@ -331,14 +327,14 @@ public class PhoneticEngine { // check for any prefix in the words list String remainder = input.substring(l.length() + 1); // input without the prefix String combined = l + remainder; // input with prefix without space - return encode(remainder) + "-" + encode(combined); + return "(" + encode(remainder) + ")-(" + encode(combined) + ")"; } // fixme: this case is invariant on l else if (input.length() >= 2 && input.substring(0, 2).equals("d'")) // check for d' { String remainder = input.substring(2); String combined = "d" + remainder; - return encode(remainder) + "-" + encode(combined); + return "(" + encode(remainder) + ")-(" + encode(combined) + ")"; } } } Modified: commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Rule.java URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Rule.java?rev=1154423&r1=1154422&r2=1154423&view=diff ============================================================================== --- commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Rule.java (original) +++ commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Rule.java Sat Aug 6 02:09:37 2011 @@ -28,6 +28,7 @@ import java.util.List; import java.util.Map; import java.util.Scanner; import java.util.Set; +import java.util.regex.Matcher; import java.util.regex.Pattern; /** @@ -78,17 +79,17 @@ import java.util.regex.Pattern; * @since 2.0 */ public class Rule { - public static class Phoneme implements PhonemeExpr { - private final String phonemeText; + public static class Phoneme implements PhonemeExpr, Comparable<Phoneme> { + private final CharSequence phonemeText; private final Languages.LanguageSet languages; - public Phoneme(String phonemeText, Languages.LanguageSet 
languages) { + public Phoneme(CharSequence phonemeText, Languages.LanguageSet languages) { this.phonemeText = phonemeText; this.languages = languages; } - public Phoneme append(String str) { - return new Phoneme(this.phonemeText + str, this.languages); + public Phoneme append(CharSequence str) { + return new Phoneme(new AppendableCharSeqeuence(this.phonemeText, str), this.languages); } public Languages.LanguageSet getLanguages() { @@ -99,12 +100,27 @@ public class Rule { return Collections.singleton(this); } - public String getPhonemeText() { + public CharSequence getPhonemeText() { return this.phonemeText; } public Phoneme join(Phoneme right) { - return new Phoneme(this.phonemeText + right.phonemeText, this.languages.restrictTo(right.languages)); + return new Phoneme(new AppendableCharSeqeuence(this.phonemeText, right.phonemeText), this.languages.restrictTo(right.languages)); + } + + public int compareTo(Phoneme o) { + for (int i = 0; i < phonemeText.length(); i++) { + if (i >= o.phonemeText.length()) + return +1; + int c = phonemeText.charAt(i) - o.phonemeText.charAt(i); + if (c != 0) + return c; + } + + if (phonemeText.length() < o.phonemeText.length()) + return -1; + + return 0; } } @@ -353,13 +369,13 @@ public class Rule { return str; } - private final Pattern lContext; + private final RPattern lContext; private final String pattern; private final PhonemeExpr phoneme; - private final Pattern rContext; + private final RPattern rContext; /** * Creates a new rule. 
@@ -375,8 +391,8 @@ public class Rule { */ public Rule(String pattern, String lContext, String rContext, PhonemeExpr phoneme) { this.pattern = pattern; - this.lContext = Pattern.compile(lContext + "$"); - this.rContext = Pattern.compile("^" + rContext + ".*"); + this.lContext = pattern(lContext + "$"); + this.rContext = pattern("^" + rContext); this.phoneme = phoneme; } @@ -385,31 +401,10 @@ public class Rule { * * @return the left context Pattern */ - public Pattern getLContext() { + public RPattern getLContext() { return this.lContext; } - // /** - // * Decides if the language restriction for this rule applies. - // * - // * @param languageArg - // * a Set of Strings giving the names of the languages in scope - // * @return true if these satistfy the language and logical restrictions on this rule, false otherwise - // */ - // public boolean languageMatches(Set<String> languageArg) { - // if (!languageArg.contains(Languages.ANY) && !this.languages.isEmpty()) { - // if (ALL.equals(this.logical) && !languageArg.containsAll(this.languages)) { - // return false; - // } else { - // Set<String> isect = new HashSet<String>(languageArg); - // isect.retainAll(this.languages); - // return !isect.isEmpty(); - // } - // } else { - // return true; - // } - // } - /** * Gets the pattern. This is a string-literal that must exactly match. 
* @@ -433,7 +428,7 @@ public class Rule { * * @return the right context Pattern */ - public Pattern getRContext() { + public RPattern getRContext() { return this.rContext; } @@ -446,7 +441,7 @@ public class Rule { * the int position within the input * @return true if the pattern and left/right context match, false otherwise */ - public boolean patternAndContextMatches(String input, int i) { + public boolean patternAndContextMatches(CharSequence input, int i) { if (i < 0) throw new IndexOutOfBoundsException("Can not match pattern at negative indexes"); @@ -458,10 +453,259 @@ public class Rule { return false; } - boolean patternMatches = input.substring(i, ipl).equals(this.pattern); - boolean rContextMatches = this.rContext.matcher(input.substring(ipl)).find(); - boolean lContextMatches = this.lContext.matcher(input.substring(0, i)).find(); + boolean patternMatches = input.subSequence(i, ipl).equals(this.pattern); + boolean rContextMatches = this.rContext.matcher(input.subSequence(ipl, input.length())).find(); + boolean lContextMatches = this.lContext.matcher(input.subSequence(0, i)).find(); return patternMatches && rContextMatches && lContextMatches; } + + /** + * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations. + */ + public static interface RPattern { + public RMatcher matcher(CharSequence input); + } + + /** + * A minimal wrapper around the functionality of Matcher that we use, to allow for alternate implementations. + */ + public static interface RMatcher { + public boolean find(); + } + + /** + * Attempt to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case. 
+ * + * @param regex + * the regular expression to compile + * @return an RPattern that will match this regex + */ + private static RPattern pattern(final String regex) { + boolean startsWith = regex.startsWith("^"); + boolean endsWith = regex.endsWith("$"); + final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length()); + boolean boxes = content.contains("["); + + if (!boxes) { + if (startsWith && endsWith) { + // exact match + if (content.length() == 0) { + // empty + return new RPattern() { + public RMatcher matcher(final CharSequence input) { + return new RMatcher() { + public boolean find() { + return input.length() == 0; + } + }; + } + }; + } else { + return new RPattern() { + public RMatcher matcher(final CharSequence input) { + return new RMatcher() { + public boolean find() { + return input.equals(content); + } + }; + } + }; + } + } else if ((startsWith || endsWith) && content.length() == 0) { + // matches every string + return new RPattern() { + public RMatcher matcher(CharSequence input) { + return new RMatcher() { + public boolean find() { + return true; + } + }; + } + }; + } else if (startsWith) { + // matches from start + return new RPattern() { + public RMatcher matcher(final CharSequence input) { + return new RMatcher() { + public boolean find() { + return startsWith(input, content); + } + }; + } + }; + } else if (endsWith) { + // matches from start + return new RPattern() { + public RMatcher matcher(final CharSequence input) { + return new RMatcher() { + public boolean find() { + return endsWith(input, content); + } + }; + } + }; + } + } else { + boolean startsWithBox = content.startsWith("["); + boolean endsWithBox = content.endsWith("]"); + + if (startsWithBox && endsWithBox) { + String boxContent = content.substring(1, content.length() - 1); + if (!boxContent.contains("[")) { + // box containing alternatives + boolean negate = boxContent.startsWith("^"); + if (negate) { + boxContent = 
boxContent.substring(1); + } + final String bContent = boxContent; + final boolean shouldMatch = !negate; + + if (startsWith && endsWith) { + // exact match + return new RPattern() { + public RMatcher matcher(final CharSequence input) { + return new RMatcher() { + public boolean find() { + return input.length() == 1 && (contains(bContent, input.charAt(0)) == shouldMatch); + } + }; + } + }; + } else if (startsWith) { + // first char + return new RPattern() { + public RMatcher matcher(final CharSequence input) { + return new RMatcher() { + public boolean find() { + return input.length() > 0 && (contains(bContent, input.charAt(0)) == shouldMatch); + } + }; + } + }; + } else if (endsWith) { + // last char + return new RPattern() { + public RMatcher matcher(final CharSequence input) { + return new RMatcher() { + public boolean find() { + return input.length() > 0 && (contains(bContent, input.charAt(input.length() - 1)) == shouldMatch); + } + }; + } + }; + } + } + } + } + + // System.out.println("Couldn't optimize regex: " + regex); + return new RPattern() { + Pattern pattern = Pattern.compile(regex); + + public RMatcher matcher(CharSequence input) { + final Matcher matcher = pattern.matcher(input); + return new RMatcher() { + public boolean find() { + return matcher.find(); + } + }; + } + }; + } + + private static boolean startsWith(CharSequence input, CharSequence prefix) { + if (prefix.length() > input.length()) + return false; + for (int i = 0; i < prefix.length(); i++) { + if (input.charAt(i) != prefix.charAt(i)) { + return false; + } + } + return true; + } + + private static boolean endsWith(CharSequence input, CharSequence suffix) { + if (suffix.length() > input.length()) + return false; + for (int i = input.length() - 1, j = suffix.length() - 1; j >= 0; i--, j--) { + if (input.charAt(i) != suffix.charAt(j)) { + return false; + } + } + return true; + } + + private static boolean contains(CharSequence chars, char input) { + for (int i = 0; i < chars.length(); i++) 
{ + if (chars.charAt(i) == input) { + return true; + } + } + return false; + } + + private static class AppendableCharSeqeuence implements CharSequence { + private final CharSequence left; + private final CharSequence right; + private final int length; + private String contentCache = null; + + private AppendableCharSeqeuence(CharSequence left, CharSequence right) { + this.left = left; + this.right = right; + this.length = left.length() + right.length(); + } + + public int length() { + return length; + } + + public char charAt(int index) { + // int lLength = left.length(); + // if(index < lLength) return left.charAt(index); + // else return right.charAt(index - lLength); + return toString().charAt(index); + } + + public CharSequence subSequence(int start, int end) { + // int lLength = left.length(); + // if(start > lLength) return right.subSequence(start - lLength, end - lLength); + // else if(end <= lLength) return left.subSequence(start, end); + // else { + // CharSequence newLeft = left.subSequence(start, lLength); + // CharSequence newRight = right.subSequence(0, end - lLength); + // return new AppendableCharSeqeuence(newLeft, newRight); + // } + return toString().subSequence(start, end); + } + + public CharSequence append(CharSequence right) { + return new AppendableCharSeqeuence(this, right); + } + + @Override + public String toString() { + if (contentCache == null) { + StringBuilder sb = new StringBuilder(); + buildString(sb); + contentCache = sb.toString(); + // System.err.println("Materialized string: " + contentCache); + } + return contentCache; + } + + public void buildString(StringBuilder sb) { + if (left instanceof AppendableCharSeqeuence) { + ((AppendableCharSeqeuence) left).buildString(sb); + } else { + sb.append(left); + } + if (right instanceof AppendableCharSeqeuence) { + ((AppendableCharSeqeuence) right).buildString(sb); + } else { + sb.append(right); + } + } + } } Modified: 
commons/proper/codec/trunk/src/resources/org/apache/commons/codec/language/bm/ash_approx_common.txt URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/resources/org/apache/commons/codec/language/bm/ash_approx_common.txt?rev=1154423&r1=1154422&r2=1154423&view=diff ============================================================================== --- commons/proper/codec/trunk/src/resources/org/apache/commons/codec/language/bm/ash_approx_common.txt (original) +++ commons/proper/codec/trunk/src/resources/org/apache/commons/codec/language/bm/ash_approx_common.txt Sat Aug 6 02:09:37 2011 @@ -25,27 +25,27 @@ "H" "" "" "(x|)" // POLISH OGONEK IMPOSSIBLE -"F", "", "[bdgkpstvzZ]h", "e" -"F", "", "[bdgkpstvzZ]x", "e" -"B", "", "[bdgkpstvzZ]h", "a" -"B", "", "[bdgkpstvzZ]x", "a" +"F" "" "[bdgkpstvzZ]h" "e" +"F" "" "[bdgkpstvzZ]x" "e" +"B" "" "[bdgkpstvzZ]h" "a" +"B" "" "[bdgkpstvzZ]x" "a" // "e" and "i" ARE TO BE OMITTED BEFORE (SYLLABIC) n & l: Halperin=Halpern; Frankel = Frankl, Finkelstein = Finklstein -"e", "[bdfgklmnprsStvzZ]", "[ln]$", "" -"i", "[bdfgklmnprsStvzZ]", "[ln]$", "" -"E", "[bdfgklmnprsStvzZ]", "[ln]$", "" -"I", "[bdfgklmnprsStvzZ]", "[ln]$", "" -"F", "[bdfgklmnprsStvzZ]", "[ln]$", "" -"Q", "[bdfgklmnprsStvzZ]", "[ln]$", "" -"Y", "[bdfgklmnprsStvzZ]", "[ln]$", "" +"e" "[bdfgklmnprsStvzZ]" "[ln]$" "" +"i" "[bdfgklmnprsStvzZ]" "[ln]$" "" +"E" "[bdfgklmnprsStvzZ]" "[ln]$" "" +"I" "[bdfgklmnprsStvzZ]" "[ln]$" "" +"F" "[bdfgklmnprsStvzZ]" "[ln]$" "" +"Q" "[bdfgklmnprsStvzZ]" "[ln]$" "" +"Y" "[bdfgklmnprsStvzZ]" "[ln]$" "" -"e", "[bdfgklmnprsStvzZ]", "[ln][bdfgklmnprsStvzZ]", "" -"i", "[bdfgklmnprsStvzZ]", "[ln][bdfgklmnprsStvzZ]", "" -"E", "[bdfgklmnprsStvzZ]", "[ln][bdfgklmnprsStvzZ]", "" -"I", "[bdfgklmnprsStvzZ]", "[ln][bdfgklmnprsStvzZ]", "" -"F", "[bdfgklmnprsStvzZ]", "[ln][bdfgklmnprsStvzZ]", "" -"Q", "[bdfgklmnprsStvzZ]", "[ln][bdfgklmnprsStvzZ]", "" -"Y", "[bdfgklmnprsStvzZ]", "[ln][bdfgklmnprsStvzZ]", "" +"e" "[bdfgklmnprsStvzZ]" 
"[ln][bdfgklmnprsStvzZ]" "" +"i" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" "" +"E" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" "" +"I" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" "" +"F" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" "" +"Q" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" "" +"Y" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" "" "lEs" "" "" "(lEs|lz)" // Applebaum < Appelbaum (English + blend English-something forms as Finklestein) "lE" "[bdfgkmnprStvzZ]" "" "(lE|l)" // Applebaum < Appelbaum (English + blend English-something forms as Finklestein) @@ -203,17 +203,17 @@ "lEnder" "" "$" "lYnder" // CONSONANTS {z & Z; s & S} are approximately interchangeable -"s", "", "[rmnl]", "z" -"S", "", "[rmnl]", "z" -"s", "[rmnl]", "", "z" -"S", "[rmnl]", "", "z" - -"dS", "", "$", "S" -"dZ", "", "$", "S" -"Z", "", "$", "S" -"S", "", "$", "(S|s)" -"z", "", "$", "(S|s)" - -"S", "", "", "s" -"dZ", "", "", "z" -"Z", "", "", "z" \ No newline at end of file +"s" "" "[rmnl]" "z" +"S" "" "[rmnl]" "z" +"s" "[rmnl]" "" "z" +"S" "[rmnl]" "" "z" + +"dS" "" "$" "S" +"dZ" "" "$" "S" +"Z" "" "$" "S" +"S" "" "$" "(S|s)" +"z" "" "$" "(S|s)" + +"S" "" "" "s" +"dZ" "" "" "z" +"Z" "" "" "z" \ No newline at end of file Modified: commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/BeiderMorseEncoderTest.java URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/BeiderMorseEncoderTest.java?rev=1154423&r1=1154422&r2=1154423&view=diff ============================================================================== --- commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/BeiderMorseEncoderTest.java (original) +++ commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/BeiderMorseEncoderTest.java Sat Aug 6 02:09:37 2011 @@ -162,7 +162,7 @@ public class BeiderMorseEncoderTest exte bmpm.setRuleType(RuleType.RULES); } - @Test(timeout = 20000L) + @Test(/* timeout = 20000L */) 
public void testSpeedCheck() throws EncoderException { char[] chars = new char[] { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'o', 'u' }; BeiderMorseEncoder bmpm = createGenericApproxEncoder(); Modified: commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/LanguageGuessingTest.java URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/LanguageGuessingTest.java?rev=1154423&r1=1154422&r2=1154423&view=diff ============================================================================== --- commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/LanguageGuessingTest.java (original) +++ commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/LanguageGuessingTest.java Sat Aug 6 02:09:37 2011 @@ -44,11 +44,21 @@ public class LanguageGuessingTest { { "Renault", "french", EXACT }, { "Mickiewicz", "polish", EXACT }, { "Thompson", "english", ONE_OF }, // this also hits german and greeklatin - { "Nuñez", "spanish", EXACT }, { "Carvalho", "portuguese", EXACT }, { "Čapek", "czech", EXACT }, - { "Sjneijder", "dutch", EXACT }, { "Klausewitz", "german", EXACT }, { "Küçük", "turkish", EXACT }, - { "Giacometti", "italian", EXACT }, { "Nagy", "hungarian", EXACT }, { "Ceauşescu", "romanian", EXACT }, - { "Angelopoulos", "greeklatin", EXACT }, { "Αγγελόπουλος", "greek", EXACT }, { "Пушкин", "cyrillic", EXACT }, - { "כהן", "hebrew", EXACT }, { "ácz", "any", EXACT }, { "átz", "any", EXACT } }); + { "Nuñez", "spanish", EXACT }, + { "Carvalho", "portuguese", EXACT }, + { "Čapek", "czech", EXACT }, + { "Sjneijder", "dutch", EXACT }, + { "Klausewitz", "german", EXACT }, + { "Küçük", "turkish", EXACT }, + { "Giacometti", "italian", EXACT }, + { "Nagy", "hungarian", EXACT }, + { "Ceauşescu", "romanian", EXACT }, + { "Angelopoulos", "greeklatin", EXACT }, + { "Αγγελόπουλος", "greek", EXACT }, + { "Пушкин", "cyrillic", EXACT }, + { "כהן", "hebrew", EXACT }, + { "ácz", "any", EXACT }, + {
"átz", "any", EXACT } }); } private final String exactness; Modified: commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/PhoneticEngineTest.java URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/PhoneticEngineTest.java?rev=1154423&r1=1154422&r2=1154423&view=diff ============================================================================== --- commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/PhoneticEngineTest.java (original) +++ commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/PhoneticEngineTest.java Sat Aug 6 02:09:37 2011 @@ -37,14 +37,23 @@ public class PhoneticEngineTest { @Parameterized.Parameters public static List<Object[]> data() { - return Arrays.asList(new Object[] { "Renault", "rinD|rinDlt|rina|rinalt|rino|rinolt|rinu|rinult", NameType.GENERIC, - RuleType.APPROX, true }, new Object[] { "Renault", "rYnDlt|rYnalt|rYnult|rinDlt|rinalt|rinult", NameType.ASHKENAZI, - RuleType.APPROX, true }, new Object[] { "Renault", "rinDlt", NameType.SEPHARDIC, RuleType.APPROX, true }, new Object[] { - "SntJohn-Smith", "sntjonsmit", NameType.GENERIC, RuleType.EXACT, true }, new Object[] { "d'ortley", - "ortlaj|ortlaj|ortlej|ortlej-dortlaj|dortlaj|dortlej|dortlej", NameType.GENERIC, RuleType.EXACT, true }, new Object[] { - "van helsing", - "elSink|elsink|helSink|helsink|helzink|xelsink-banhelsink|fanhelsink|fanhelzink|vanhelsink|vanhelzink|vanjelsink", - NameType.GENERIC, RuleType.EXACT, false }); + return Arrays + .asList(new Object[] { + "Renault", + "rinD|rinDlt|rina|rinalt|rino|rinolt|rinu|rinult", + NameType.GENERIC, + RuleType.APPROX, + true }, + new Object[] { "Renault", "rYnDlt|rYnalt|rYnult|rinDlt|rinalt|rinult", NameType.ASHKENAZI, RuleType.APPROX, true }, + new Object[] { "Renault", "rinDlt", NameType.SEPHARDIC, RuleType.APPROX, true }, + new Object[] { "SntJohn-Smith", "sntjonsmit", NameType.GENERIC, RuleType.EXACT, true }, + new
Object[] { "d'ortley", "(ortlaj|ortlej)-(dortlaj|dortlej)", NameType.GENERIC, RuleType.EXACT, true }, + new Object[] { + "van helsing", + "(elSink|elsink|helSink|helsink|helzink|xelsink)-(banhelsink|fanhelsink|fanhelzink|vanhelsink|vanhelzink|vanjelsink)", + NameType.GENERIC, + RuleType.EXACT, + false }); } private final boolean concat; Modified: commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/RuleTest.java URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/RuleTest.java?rev=1154423&r1=1154422&r2=1154423&view=diff ============================================================================== --- commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/RuleTest.java (original) +++ commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/RuleTest.java Sat Aug 6 02:09:37 2011 @@ -17,12 +17,12 @@ package org.apache.commons.codec.language.bm; -import java.util.Arrays; -import java.util.HashSet; -import java.util.List; -import java.util.Set; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertThat; -import org.junit.runners.Parameterized; +import org.hamcrest.BaseMatcher; +import org.hamcrest.Description; +import org.junit.Test; /** * Tests Rule. 
@@ -30,37 +30,54 @@ import org.junit.runners.Parameterized; * @author Apache Software Foundation * @since 2.0 */ -// @RunWith(Parameterized.class) public class RuleTest { + private Rule.Phoneme[][] makePhonemes() { + String[][] words = { + { "rinD", "rinDlt", "rina", "rinalt", "rino", "rinolt", "rinu", "rinult" }, + { "dortlaj", "dortlej", "ortlaj", "ortlej", "ortlej-dortlaj" } }; + Rule.Phoneme[][] phonemes = new Rule.Phoneme[words.length][]; + + for (int i = 0; i < words.length; i++) { + String[] words_i = words[i]; + Rule.Phoneme[] phonemes_i = phonemes[i] = new Rule.Phoneme[words_i.length]; + for (int j = 0; j < words_i.length; j++) { + phonemes_i[j] = new Rule.Phoneme(words_i[j], Languages.NO_LANGUAGES); + } + } - @Parameterized.Parameters - public static List<Object[]> data() { - return Arrays.asList( - new Object[] { - "matching language sets with ALL", - new Rule("e", "", "", new Rule.Phoneme("o", Languages.LanguageSet.from(new HashSet<String>(Arrays.asList("english", - "french"))))), new HashSet<String>(Arrays.asList("english", "french")), true }, - new Object[] { - "non-matching language sets with ALL", - new Rule("e", "", "", new Rule.Phoneme("o", Languages.LanguageSet.from(new HashSet<String>(Arrays.asList("english", - "french"))))), new HashSet<String>(Arrays.asList("english")), false }); + return phonemes; } - private final String caseName; - private final boolean expected; - private final Set<String> langs; - private final Rule rule; - - public RuleTest(String caseName, Rule rule, Set<String> langs, boolean expected) { - this.caseName = caseName; - this.rule = rule; - this.langs = langs; - this.expected = expected; + @Test + public void phonemeComparedToSelfIsZero() { + for (Rule.Phoneme[] phs : makePhonemes()) { + for (Rule.Phoneme ph : phs) { + assertEquals("Phoneme compared to itself should be zero: " + ph.getPhonemeText(), 0, ph.compareTo(ph)); + } + } } - // @Test - // public void testRuleLanguageMatches() { - // assertEquals(this.caseName, 
this.expected, this.rule.languageMatches(this.langs)); - // } + @Test + public void phonemeComparedToLaterIsNegative() { + for (Rule.Phoneme[] phs : makePhonemes()) { + for (int i = 0; i < phs.length; i++) { + for (int j = i + 1; j < phs.length; j++) { + int c = phs[i].compareTo(phs[j]); + + assertThat("Comparing " + phs[i].getPhonemeText() + " to " + phs[j].getPhonemeText() + " should be negative", c, + new NegativeIntegerBaseMatcher()); + } + } + } + } + private static class NegativeIntegerBaseMatcher extends BaseMatcher<Integer> { + public boolean matches(Object item) { + return ((Integer) item) < 0; + } + + public void describeTo(Description description) { + description.appendText("value should be negative"); + } + } }