Author: ggregory Date: Mon Aug 8 22:47:24 2011 New Revision: 1155135 URL: http://svn.apache.org/viewvc?rev=1155135&view=rev Log: [CODEC-125] Implement a Beider-Morse phonetic matching codec. Applied patch https://issues.apache.org/jira/secure/attachment/12489755/fightingMemoryChurn.patch
Modified: commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Lang.java commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Languages.java commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/NameType.java commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/PhoneticEngine.java commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Rule.java commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/BeiderMorseEncoderTest.java commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/RuleTest.java Modified: commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Lang.java URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Lang.java?rev=1155135&r1=1155134&r2=1155135&view=diff ============================================================================== --- commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Lang.java (original) +++ commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Lang.java Mon Aug 8 22:47:24 2011 @@ -24,6 +24,7 @@ import java.util.Collections; import java.util.EnumMap; import java.util.HashSet; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Scanner; import java.util.Set; @@ -71,7 +72,7 @@ import java.util.regex.Pattern; */ public class Lang { - private static class LangRule { + private static final class LangRule { private final boolean acceptOnMatch; private final Set<String> languages; private final Pattern pattern; @@ -199,7 +200,7 @@ public class Lang { */ public String guessLanguage(String text) { Languages.LanguageSet ls = guessLanguages(text); - return ls.isSingleton() ? ls.getAny() : Languages.ANY; + return ls.isSingleton() ? ls.getAny() : Languages.ANY; } /** @@ -210,7 +211,7 @@ public class Lang { * @return a Set of Strings of language names that are potential matches for the input word */ public Languages.LanguageSet guessLanguages(String input) { - String text = input.toLowerCase(); // todo: locale? + String text = input.toLowerCase(Locale.ENGLISH); // System.out.println("Testing text: '" + text + "'"); Set<String> langs = new HashSet<String>(this.languages.getLanguages()); Modified: commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Languages.java URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Languages.java?rev=1155135&r1=1155134&r2=1155135&view=diff ============================================================================== --- commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Languages.java (original) +++ commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Languages.java Mon Aug 8 22:47:24 2011 @@ -58,9 +58,9 @@ public class Languages { * A set of languages. */ public static abstract class LanguageSet { - + public static LanguageSet from(Set<String> langs) { - return langs.isEmpty() ? NO_LANGUAGES : new SomeLanguages(langs); + return langs.isEmpty() ? NO_LANGUAGES : new SomeLanguages(langs); } public abstract boolean contains(String language); @@ -77,7 +77,7 @@ public class Languages { /** * Some languages, explicitly enumerated. */ - public static class SomeLanguages extends LanguageSet { + public static final class SomeLanguages extends LanguageSet { private final Set<String> languages; private SomeLanguages(Set<String> languages) { @@ -116,9 +116,13 @@ public class Languages { return this; } else { SomeLanguages sl = (SomeLanguages) other; - Set<String> ls = new HashSet<String>(this.languages); - ls.retainAll(sl.languages); - return from(ls); + if (sl.languages.containsAll(languages)) { + return this; + } else { + Set<String> ls = new HashSet<String>(this.languages); + ls.retainAll(sl.languages); + return from(ls); + } } } Modified: commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/NameType.java URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/NameType.java?rev=1155135&r1=1155134&r2=1155135&view=diff ============================================================================== --- commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/NameType.java (original) +++ commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/NameType.java Mon Aug 8 22:47:24 2011 @@ -24,13 +24,13 @@ package org.apache.commons.codec.languag * @since 2.0 */ public enum NameType { - + /** Ashkenazi family names */ ASHKENAZI("ash"), - + /** Generic names and words */ GENERIC("gen"), - + /** Sephardic family names */ SEPHARDIC("sep"); Modified: commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/PhoneticEngine.java URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/PhoneticEngine.java?rev=1155135&r1=1155134&r2=1155135&view=diff ============================================================================== --- commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/PhoneticEngine.java (original) +++ commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/PhoneticEngine.java Mon Aug 8 22:47:24 2011 @@ -51,7 +51,7 @@ import java.util.TreeSet; */ public class PhoneticEngine { - static class PhonemeBuilder { + static final class PhonemeBuilder { public static PhonemeBuilder empty(Languages.LanguageSet languages) { return new PhonemeBuilder(Collections.singleton(new Rule.Phoneme("", languages))); @@ -108,7 +108,7 @@ public class PhoneticEngine { } } - private static class RulesApplication { + private static final class RulesApplication { private final List<Rule> finalRules; private final CharSequence input; @@ -176,6 +176,32 @@ public class PhoneticEngine { "de la", "della", "des", "di", "do", "dos", "du", "van", "von")))); } + private static CharSequence cacheSubSequence(final CharSequence cached) { + // return cached; + final CharSequence[][] cache = new CharSequence[cached.length()][cached.length()]; + return new CharSequence() { + public char charAt(int index) { + return cached.charAt(index); + } + + public int length() { + return cached.length(); + } + + public CharSequence subSequence(int start, int end) { + if (start == end) + return ""; + + CharSequence res = cache[start][end - 1]; + if (res == null) { + res = cached.subSequence(start, end); + cache[start][end - 1] = res; + } + return res; + } + }; + } + private static String join(Iterable<String> strings, String sep) { StringBuilder sb = new StringBuilder(); Iterator<String> si = strings.iterator(); @@ -229,7 +255,7 @@ public class PhoneticEngine { for (Rule.Phoneme phoneme : phonemeBuilder.getPhonemes()) { PhonemeBuilder subBuilder = PhonemeBuilder.empty(phoneme.getLanguages()); - CharSequence phonemeText = phoneme.getPhonemeText(); + CharSequence phonemeText = cacheSubSequence(phoneme.getPhonemeText()); // System.err.println("Expanding: " + phonemeText); for (int i = 0; i < phonemeText.length();) { @@ -248,7 +274,7 @@ public class PhoneticEngine { } // System.err.println("Expanded to: " + subBuilder.makeString()); - + // System.err.println("phenomes in collection of type: " + subBuilder.getPhonemes().getClass()); phonemes.addAll(subBuilder.getPhonemes()); } @@ -345,8 +371,9 @@ public class PhoneticEngine { PhonemeBuilder phonemeBuilder = PhonemeBuilder.empty(languageSet); // loop over each char in the input - we will handle the increment manually - for (int i = 0; i < input.length();) { - RulesApplication rulesApplication = new RulesApplication(rules, input, phonemeBuilder, i).invoke(); + CharSequence inputCache = cacheSubSequence(input); + for (int i = 0; i < inputCache.length();) { + RulesApplication rulesApplication = new RulesApplication(rules, inputCache, phonemeBuilder, i).invoke(); i = rulesApplication.getI(); phonemeBuilder = rulesApplication.getPhonemeBuilder(); // System.err.println(input + " " + i + ": " + phonemeBuilder.makeString()); Modified: commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Rule.java URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Rule.java?rev=1155135&r1=1155134&r2=1155135&view=diff ============================================================================== --- commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Rule.java (original) +++ commons/proper/codec/trunk/src/java/org/apache/commons/codec/language/bm/Rule.java Mon Aug 8 22:47:24 2011 @@ -80,68 +80,7 @@ import java.util.regex.Pattern; */ public class Rule { - private static class AppendableCharSeqeuence implements CharSequence { - - private final CharSequence left; - private final CharSequence right; - private final int length; - private String contentCache = null; - - private AppendableCharSeqeuence(CharSequence left, CharSequence right) { - this.left = left; - this.right = right; - this.length = left.length() + right.length(); - } - - public void buildString(StringBuilder sb) { - if (left instanceof AppendableCharSeqeuence) { - ((AppendableCharSeqeuence) left).buildString(sb); - } else { - sb.append(left); - } - if (right instanceof AppendableCharSeqeuence) { - ((AppendableCharSeqeuence) right).buildString(sb); - } else { - sb.append(right); - } - } - - public char charAt(int index) { - // int lLength = left.length(); - // if(index < lLength) return left.charAt(index); - // else return right.charAt(index - lLength); - return toString().charAt(index); - } - - public int length() { - return length; - } - - public CharSequence subSequence(int start, int end) { - // int lLength = left.length(); - // if(start > lLength) return right.subSequence(start - lLength, end - lLength); - // else if(end <= lLength) return left.subSequence(start, end); - // else { - // CharSequence newLeft = left.subSequence(start, lLength); - // CharSequence newRight = right.subSequence(0, end - lLength); - // return new AppendableCharSeqeuence(newLeft, newRight); - // } - return toString().subSequence(start, end); - } - - @Override - public String toString() { - if (contentCache == null) { - StringBuilder sb = new StringBuilder(); - buildString(sb); - contentCache = sb.toString(); - // System.err.println("Materialized string: " + contentCache); - } - return contentCache; - } - } - - public static class Phoneme implements PhonemeExpr, Comparable<Phoneme> { + public static final class Phoneme implements PhonemeExpr, Comparable<Phoneme> { private final CharSequence phonemeText; private final Languages.LanguageSet languages; @@ -152,7 +91,7 @@ public class Rule { } public Phoneme append(CharSequence str) { - return new Phoneme(new AppendableCharSeqeuence(this.phonemeText, str), this.languages); + return new Phoneme(this.phonemeText.toString() + str.toString(), this.languages); } public int compareTo(Phoneme o) { @@ -186,7 +125,7 @@ public class Rule { } public Phoneme join(Phoneme right) { - return new Phoneme(new AppendableCharSeqeuence(this.phonemeText, right.phonemeText), this.languages.restrictTo(right.languages)); + return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(), this.languages.restrictTo(right.languages)); } } @@ -194,7 +133,7 @@ public class Rule { Iterable<Phoneme> getPhonemes(); } - public static class PhonemeList implements PhonemeExpr { + public static final class PhonemeList implements PhonemeExpr { private final List<Phoneme> phonemes; public PhonemeList(List<Phoneme> phonemes) { @@ -207,44 +146,17 @@ public class Rule { } /** - * A minimal wrapper around the functionality of Matcher that we use, to allow for alternate implementations. + * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations. */ - public static interface RMatcher { - boolean find(); + public static interface RPattern { + boolean isMatch(CharSequence input); } - /** - * Always returns true. - */ - private static class TrueRMatcher implements RMatcher { - - private static TrueRMatcher INSTANCE = new TrueRMatcher(); - - /** - * Always returns true. - */ - public boolean find() { + public static final RPattern ALL_STRINGS_RMATCHER = new RPattern() { + public boolean isMatch(CharSequence input) { return true; } - - } - - private static class AllStringsRMatcher implements RPattern { - - private static AllStringsRMatcher INSTANCE = new AllStringsRMatcher(); - - public RMatcher matcher(CharSequence input) { - return TrueRMatcher.INSTANCE; - } - - } - - /** - * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations. - */ - public static interface RPattern { - RMatcher matcher(CharSequence input); - } + }; public static final String ALL = "ALL"; @@ -500,48 +412,32 @@ public class Rule { if (content.length() == 0) { // empty return new RPattern() { - public RMatcher matcher(final CharSequence input) { - return new RMatcher() { - public boolean find() { - return input.length() == 0; - } - }; + public boolean isMatch(CharSequence input) { + return input.length() == 0; } }; } else { return new RPattern() { - public RMatcher matcher(final CharSequence input) { - return new RMatcher() { - public boolean find() { - return input.equals(content); - } - }; + public boolean isMatch(CharSequence input) { + return input.equals(content); } }; } } else if ((startsWith || endsWith) && content.length() == 0) { // matches every string - return AllStringsRMatcher.INSTANCE; + return ALL_STRINGS_RMATCHER; } else if (startsWith) { // matches from start return new RPattern() { - public RMatcher matcher(final CharSequence input) { - return new RMatcher() { - public boolean find() { - return startsWith(input, content); - } - }; + public boolean isMatch(CharSequence input) { + return startsWith(input, content); } }; } else if (endsWith) { // matches from start return new RPattern() { - public RMatcher matcher(final CharSequence input) { - return new RMatcher() { - public boolean find() { - return endsWith(input, content); - } - }; + public boolean isMatch(CharSequence input) { + return endsWith(input, content); } }; } @@ -563,34 +459,22 @@ public class Rule { if (startsWith && endsWith) { // exact match return new RPattern() { - public RMatcher matcher(final CharSequence input) { - return new RMatcher() { - public boolean find() { - return input.length() == 1 && (contains(bContent, input.charAt(0)) == shouldMatch); - } - }; + public boolean isMatch(CharSequence input) { + return input.length() == 1 && (contains(bContent, input.charAt(0)) == shouldMatch); } }; } else if (startsWith) { // first char return new RPattern() { - public RMatcher matcher(final CharSequence input) { - return new RMatcher() { - public boolean find() { - return input.length() > 0 && (contains(bContent, input.charAt(0)) == shouldMatch); - } - }; + public boolean isMatch(CharSequence input) { + return input.length() > 0 && (contains(bContent, input.charAt(0)) == shouldMatch); } }; } else if (endsWith) { // last char return new RPattern() { - public RMatcher matcher(final CharSequence input) { - return new RMatcher() { - public boolean find() { - return input.length() > 0 && (contains(bContent, input.charAt(input.length() - 1)) == shouldMatch); - } - }; + public boolean isMatch(CharSequence input) { + return input.length() > 0 && (contains(bContent, input.charAt(input.length() - 1)) == shouldMatch); } }; } @@ -602,13 +486,9 @@ public class Rule { return new RPattern() { Pattern pattern = Pattern.compile(regex); - public RMatcher matcher(CharSequence input) { - final Matcher matcher = pattern.matcher(input); - return new RMatcher() { - public boolean find() { - return matcher.find(); - } - }; + public boolean isMatch(CharSequence input) { + Matcher matcher = pattern.matcher(input); + return matcher.find(); } }; } @@ -723,8 +603,8 @@ public class Rule { } boolean patternMatches = input.subSequence(i, ipl).equals(this.pattern); - boolean rContextMatches = this.rContext.matcher(input.subSequence(ipl, input.length())).find(); - boolean lContextMatches = this.lContext.matcher(input.subSequence(0, i)).find(); + boolean rContextMatches = this.rContext.isMatch(input.subSequence(ipl, input.length())); + boolean lContextMatches = this.lContext.isMatch(input.subSequence(0, i)); return patternMatches && rContextMatches && lContextMatches; } Modified: commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/BeiderMorseEncoderTest.java URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/BeiderMorseEncoderTest.java?rev=1155135&r1=1155134&r2=1155135&view=diff ============================================================================== --- commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/BeiderMorseEncoderTest.java (original) +++ commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/BeiderMorseEncoderTest.java Mon Aug 8 22:47:24 2011 @@ -19,8 +19,6 @@ package org.apache.commons.codec.languag import static org.junit.Assert.assertEquals; -import java.util.Random; - import org.apache.commons.codec.EncoderException; import org.apache.commons.codec.StringEncoder; import org.apache.commons.codec.StringEncoderAbstractTest; @@ -165,7 +163,7 @@ public class BeiderMorseEncoderTest exte } /** - * (Un)luckily, the worse performing test because of the data in {@link TEST_CHARS} + * (Un)luckily, the worse performing test because of the data in {@link #TEST_CHARS} * * @throws EncoderException */ @@ -183,34 +181,23 @@ public class BeiderMorseEncoderTest exte } } - /** - * Another odd performance edge case. - * - * @throws EncoderException - */ - @Test(/* timeout = 20000L */) - public void testSpeedCheckAZ() throws EncoderException { + @Test + public void testSpeedCheck2() throws EncoderException { BeiderMorseEncoder bmpm = this.createGenericApproxEncoder(); - String phrase = "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz"; + String phrase = "ItstheendoftheworldasweknowitandIfeelfine"; + for (int i = 1; i <= phrase.length(); i++) { bmpm.encode(phrase.subSequence(0, i)); } } - /** - * Runs between 1.6 and 13 seconds at length 40 for me (Gary Gregory, 2011/08/06) - * - * @throws EncoderException - */ - @Test(/* timeout = 20000L */) - public void testSpeedCheckRandom() throws EncoderException { + @Test + public void testSpeedCheck3() throws EncoderException { BeiderMorseEncoder bmpm = this.createGenericApproxEncoder(); - StringBuffer stringBuffer = new StringBuffer(); - Random rand = new Random(); - stringBuffer.append(TEST_CHARS[rand.nextInt(TEST_CHARS.length)]); - for (int i = 0; i < 40; i++) { - bmpm.encode(stringBuffer.toString()); - stringBuffer.append(TEST_CHARS[rand.nextInt(TEST_CHARS.length)]); + String phrase = "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz"; + + for (int i = 1; i <= phrase.length(); i++) { + bmpm.encode(phrase.subSequence(0, i)); } } } Modified: commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/RuleTest.java URL: http://svn.apache.org/viewvc/commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/RuleTest.java?rev=1155135&r1=1155134&r2=1155135&view=diff ============================================================================== --- commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/RuleTest.java (original) +++ commons/proper/codec/trunk/src/test/org/apache/commons/codec/language/bm/RuleTest.java Mon Aug 8 22:47:24 2011 @@ -80,4 +80,68 @@ public class RuleTest { } } } + + @Test + public void subSequenceWorks() { + // AppendableCharSequence is private to Rule. We can only make it through a Phoneme. + + Rule.Phoneme a = new Rule.Phoneme("a", null); + Rule.Phoneme b = new Rule.Phoneme("b", null); + Rule.Phoneme cd = new Rule.Phoneme("cd", null); + Rule.Phoneme ef = new Rule.Phoneme("ef", null); + Rule.Phoneme ghi = new Rule.Phoneme("ghi", null); + Rule.Phoneme jkl = new Rule.Phoneme("jkl", null); + + assertEquals('a', a.getPhonemeText().charAt(0)); + assertEquals('b', b.getPhonemeText().charAt(0)); + assertEquals('c', cd.getPhonemeText().charAt(0)); + assertEquals('d', cd.getPhonemeText().charAt(1)); + assertEquals('e', ef.getPhonemeText().charAt(0)); + assertEquals('f', ef.getPhonemeText().charAt(1)); + assertEquals('g', ghi.getPhonemeText().charAt(0)); + assertEquals('h', ghi.getPhonemeText().charAt(1)); + assertEquals('i', ghi.getPhonemeText().charAt(2)); + assertEquals('j', jkl.getPhonemeText().charAt(0)); + assertEquals('k', jkl.getPhonemeText().charAt(1)); + assertEquals('l', jkl.getPhonemeText().charAt(2)); + + Rule.Phoneme a_b = a.append(b.getPhonemeText()); + assertEquals('a', a_b.getPhonemeText().charAt(0)); + assertEquals('b', a_b.getPhonemeText().charAt(1)); + assertEquals("ab", a_b.getPhonemeText().subSequence(0, 2).toString()); + assertEquals("a", a_b.getPhonemeText().subSequence(0, 1).toString()); + assertEquals("b", a_b.getPhonemeText().subSequence(1, 2).toString()); + + Rule.Phoneme cd_ef = cd.append(ef.getPhonemeText()); + assertEquals('c', cd_ef.getPhonemeText().charAt(0)); + assertEquals('d', cd_ef.getPhonemeText().charAt(1)); + assertEquals('e', cd_ef.getPhonemeText().charAt(2)); + assertEquals('f', cd_ef.getPhonemeText().charAt(3)); + assertEquals("c", cd_ef.getPhonemeText().subSequence(0, 1).toString()); + assertEquals("d", cd_ef.getPhonemeText().subSequence(1, 2).toString()); + assertEquals("e", cd_ef.getPhonemeText().subSequence(2, 3).toString()); + assertEquals("f", cd_ef.getPhonemeText().subSequence(3, 4).toString()); + assertEquals("cd", cd_ef.getPhonemeText().subSequence(0, 2).toString()); + assertEquals("de", cd_ef.getPhonemeText().subSequence(1, 3).toString()); + assertEquals("ef", cd_ef.getPhonemeText().subSequence(2, 4).toString()); + assertEquals("cde", cd_ef.getPhonemeText().subSequence(0, 3).toString()); + assertEquals("def", cd_ef.getPhonemeText().subSequence(1, 4).toString()); + assertEquals("cdef", cd_ef.getPhonemeText().subSequence(0, 4).toString()); + + Rule.Phoneme a_b_cd = a.append(b.getPhonemeText()).append(cd.getPhonemeText()); + assertEquals('a', a_b_cd.getPhonemeText().charAt(0)); + assertEquals('b', a_b_cd.getPhonemeText().charAt(1)); + assertEquals('c', a_b_cd.getPhonemeText().charAt(2)); + assertEquals('d', a_b_cd.getPhonemeText().charAt(3)); + assertEquals("a", a_b_cd.getPhonemeText().subSequence(0, 1).toString()); + assertEquals("b", a_b_cd.getPhonemeText().subSequence(1, 2).toString()); + assertEquals("c", a_b_cd.getPhonemeText().subSequence(2, 3).toString()); + assertEquals("d", a_b_cd.getPhonemeText().subSequence(3, 4).toString()); + assertEquals("ab", a_b_cd.getPhonemeText().subSequence(0, 2).toString()); + assertEquals("bc", a_b_cd.getPhonemeText().subSequence(1, 3).toString()); + assertEquals("cd", a_b_cd.getPhonemeText().subSequence(2, 4).toString()); + assertEquals("abc", a_b_cd.getPhonemeText().subSequence(0, 3).toString()); + assertEquals("bcd", a_b_cd.getPhonemeText().subSequence(1, 4).toString()); + assertEquals("abcd", a_b_cd.getPhonemeText().subSequence(0, 4).toString()); + } }