Oooooh, I like that one: public static void getRomanization(Appendable builder, CharSequence s) ... ToStringUtil.getRomanization(termAttr.setEmpty(), reading);
That’s hilarious! Uwe ----- Uwe Schindler H.-H.-Meier-Allee 63, D-28213 Bremen http://www.thetaphi.de eMail: [email protected] > -----Original Message----- > From: [email protected] [mailto:[email protected]] > Sent: Sunday, March 25, 2012 4:17 PM > To: [email protected] > Subject: svn commit: r1305046 - in /lucene/dev/trunk: lucene/contrib/ > modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/ > modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ > modules/analysis/kuromoji/src/test/org/apa... > > Author: cm > Date: Sun Mar 25 14:17:23 2012 > New Revision: 1305046 > > URL: http://svn.apache.org/viewvc?rev=1305046&view=rev > Log: > Added KuromojiReadingFormFilter (LUCENE-3915) > > Added: > > lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/anal > ysis/kuromoji/KuromojiReadingFormFilter.java (with props) > > lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/anal > ysis/kuromoji/TestKuromojiReadingFormFilter.java (with props) > > lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/KuromojiReadin > gFormFilterFactory.java (with props) > Modified: > lucene/dev/trunk/lucene/contrib/CHANGES.txt > > lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/anal > ysis/kuromoji/util/ToStringUtil.java > > Modified: lucene/dev/trunk/lucene/contrib/CHANGES.txt > URL: > http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/CHANGES.txt?r > ev=1305046&r1=1305045&r2=1305046&view=diff > ================================================================ > ============== > --- lucene/dev/trunk/lucene/contrib/CHANGES.txt (original) > +++ lucene/dev/trunk/lucene/contrib/CHANGES.txt Sun Mar 25 14:17:23 2012 > @@ -176,6 +176,9 @@ New Features > with/without trailing long vowel marks. The filter is used in both > KuromojiAnalyzer > and the "text_ja" field type in schema.xml. 
(Christian Moen) > > + * LUCENE-3915: Add Japanese filter to replace a term attribute with its > reading. > + (Koji Sekiguchi, Robert Muir, Christian Moen) > + > * LUCENE-3685: Add ToChildBlockJoinQuery and renamed previous > BlockJoinQuery to ToParentBlockJoinQuery, so that you can now do > joins in both parent to child and child to parent directions. > > Added: > lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/anal > ysis/kuromoji/KuromojiReadingFormFilter.java > URL: > http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src > /java/org/apache/lucene/analysis/kuromoji/KuromojiReadingFormFilter.java?r > ev=1305046&view=auto > ================================================================ > ============== > --- > lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/anal > ysis/kuromoji/KuromojiReadingFormFilter.java (added) > +++ > lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/anal > ysis/kuromoji/KuromojiReadingFormFilter.java Sun Mar 25 14:17:23 2012 > @@ -0,0 +1,65 @@ > +package org.apache.lucene.analysis.kuromoji; > + > +/** > + * Licensed to the Apache Software Foundation (ASF) under one or more > + * contributor license agreements. See the NOTICE file distributed with > + * this work for additional information regarding copyright ownership. > + * The ASF licenses this file to You under the Apache License, Version 2.0 > + * (the "License"); you may not use this file except in compliance with > + * the License. You may obtain a copy of the License at > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or > implied. > + * See the License for the specific language governing permissions and > + * limitations under the License. 
> + */ > + > +import org.apache.lucene.analysis.TokenFilter; > +import org.apache.lucene.analysis.TokenStream; > +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; > +import org.apache.lucene.analysis.kuromoji.tokenattributes.ReadingAttribute; > +import org.apache.lucene.analysis.kuromoji.util.ToStringUtil; > + > +import java.io.IOException; > + > +/** > + * A {@link org.apache.lucene.analysis.TokenFilter} that replaces the term > + * attribute with the reading of a token in either katakana or romaji form. > + * The default reading form is katakana. > + */ > + > +public final class KuromojiReadingFormFilter extends TokenFilter { > + private final CharTermAttribute termAttr = > addAttribute(CharTermAttribute.class); > + private final ReadingAttribute readingAttr = > addAttribute(ReadingAttribute.class); > + > + private boolean useRomaji; > + > + public KuromojiReadingFormFilter(TokenStream input, boolean useRomaji) { > + super(input); > + this.useRomaji = useRomaji; > + } > + > + public KuromojiReadingFormFilter(TokenStream input) { > + this(input, false); > + } > + > + @Override > + public boolean incrementToken() throws IOException { > + if (input.incrementToken()) { > + String reading = readingAttr.getReading(); > + if (reading != null) { > + if (useRomaji) { > + ToStringUtil.getRomanization(termAttr.setEmpty(), reading); > + } else { > + termAttr.setEmpty().append(reading); > + } > + } > + return true; > + } else { > + return false; > + } > + } > +} > > Modified: > lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/anal > ysis/kuromoji/util/ToStringUtil.java > URL: > http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src > /java/org/apache/lucene/analysis/kuromoji/util/ToStringUtil.java?rev=1305046 > &r1=1305045&r2=1305046&view=diff > ================================================================ > ============== > --- > lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/anal > 
ysis/kuromoji/util/ToStringUtil.java (original) > +++ > lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/anal > ysis/kuromoji/util/ToStringUtil.java Sun Mar 25 14:17:23 2012 > @@ -17,6 +17,7 @@ package org.apache.lucene.analysis.kurom > * limitations under the License. > */ > > +import java.io.IOException; > import java.util.HashMap; > > /** > @@ -239,7 +240,19 @@ public class ToStringUtil { > * Romanize katakana with modified hepburn > */ > public static String getRomanization(String s) { > - StringBuilder builder = new StringBuilder(); > + StringBuilder out = new StringBuilder(); > + try { > + getRomanization(out, s); > + } catch (IOException bogus) { > + throw new RuntimeException(bogus); > + } > + return out.toString(); > + } > + > + /** > + * Romanize katakana with modified hepburn > + */ > + public static void getRomanization(Appendable builder, CharSequence s) > throws IOException { > final int len = s.length(); > for (int i = 0; i < len; i++) { > // maximum lookahead: 3 > @@ -1022,6 +1035,5 @@ public class ToStringUtil { > builder.append(ch); > } > } > - return builder.toString(); > } > } > > Added: > lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/anal > ysis/kuromoji/TestKuromojiReadingFormFilter.java > URL: > http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src > /test/org/apache/lucene/analysis/kuromoji/TestKuromojiReadingFormFilter.jav > a?rev=1305046&view=auto > ================================================================ > ============== > --- > lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/anal > ysis/kuromoji/TestKuromojiReadingFormFilter.java (added) > +++ > lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/anal > ysis/kuromoji/TestKuromojiReadingFormFilter.java Sun Mar 25 14:17:23 2012 > @@ -0,0 +1,64 @@ > +package org.apache.lucene.analysis.kuromoji; > + > +/** > + * Licensed to the Apache Software Foundation (ASF) under one or more 
> + * contributor license agreements. See the NOTICE file distributed with > + * this work for additional information regarding copyright ownership. > + * The ASF licenses this file to You under the Apache License, Version 2.0 > + * (the "License"); you may not use this file except in compliance with > + * the License. You may obtain a copy of the License at > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or > implied. > + * See the License for the specific language governing permissions and > + * limitations under the License. > + */ > + > +import org.apache.lucene.analysis.Analyzer; > +import org.apache.lucene.analysis.BaseTokenStreamTestCase; > +import org.apache.lucene.analysis.Tokenizer; > + > +import java.io.IOException; > +import java.io.Reader; > + > +/** > + * Tests for {@link TestKuromojiReadingFormFilter} > + */ > +public class TestKuromojiReadingFormFilter extends > BaseTokenStreamTestCase { > + private Analyzer katakanaAnalyzer = new Analyzer() { > + @Override > + protected TokenStreamComponents createComponents(String fieldName, > Reader reader) { > + Tokenizer tokenizer = new KuromojiTokenizer(reader, null, true, > KuromojiTokenizer.Mode.SEARCH); > + return new TokenStreamComponents(tokenizer, new > KuromojiReadingFormFilter(tokenizer, false)); > + } > + }; > + > + private Analyzer romajiAnalyzer = new Analyzer() { > + @Override > + protected TokenStreamComponents createComponents(String fieldName, > Reader reader) { > + Tokenizer tokenizer = new KuromojiTokenizer(reader, null, true, > KuromojiTokenizer.Mode.SEARCH); > + return new TokenStreamComponents(tokenizer, new > KuromojiReadingFormFilter(tokenizer, true)); > + } > + }; > + > + > + public void testKatakanaReadings() throws IOException { > + 
assertAnalyzesTo(katakanaAnalyzer, > "今夜はロバート先生と話した", > + new String[] { "コンヤ", "ハ", "ロバート", "センセイ", > "ト", "ハナシ", "タ" } > + ); > + } > + > + public void testRomajiReadings() throws IOException { > + assertAnalyzesTo(romajiAnalyzer, > "今夜はロバート先生と話した", > + new String[] { "kon'ya", "ha", "robato", "sensei", "to", "hanashi", > "ta" } > + ); > + } > + > + public void testRandomData() throws IOException { > + checkRandomData(random, katakanaAnalyzer, > 1000*RANDOM_MULTIPLIER); > + checkRandomData(random, romajiAnalyzer, 1000*RANDOM_MULTIPLIER); > + } > +} > > Added: > lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/KuromojiReadin > gFormFilterFactory.java > URL: > http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/ > solr/analysis/KuromojiReadingFormFilterFactory.java?rev=1305046&view=auto > ================================================================ > ============== > --- > lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/KuromojiReadin > gFormFilterFactory.java (added) > +++ > lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/KuromojiReadin > gFormFilterFactory.java Sun Mar 25 14:17:23 2012 > @@ -0,0 +1,50 @@ > +package org.apache.solr.analysis; > + > +/** > + * Licensed to the Apache Software Foundation (ASF) under one or more > + * contributor license agreements. See the NOTICE file distributed with > + * this work for additional information regarding copyright ownership. > + * The ASF licenses this file to You under the Apache License, Version 2.0 > + * (the "License"); you may not use this file except in compliance with > + * the License. 
You may obtain a copy of the License at > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or > implied. > + * See the License for the specific language governing permissions and > + * limitations under the License. > + */ > + > +import org.apache.lucene.analysis.TokenStream; > +import org.apache.lucene.analysis.kuromoji.KuromojiReadingFormFilter; > + > +import java.util.Map; > + > +/** > + * Factory for {@link KuromojiReadingFormFilter}. > + * <pre class="prettyprint"> > + * <fieldType name="text_ja" class="solr.TextField"> > + * <analyzer> > + * <tokenizer class="solr.KuromojiTokenizerFactory"/> > + * <filter class="solr.KuromojiReadingFormFilterFactory" > + * useRomaji="false"/> > + * </analyzer> > + * </fieldType> > + * </pre> > + */ > +public class KuromojiReadingFormFilterFactory extends > BaseTokenFilterFactory { > + private static final String ROMAJI_PARAM = "useRomaji"; > + private boolean useRomaji; > + > + @Override > + public void init(Map<String, String> args) { > + super.init(args); > + useRomaji = getBoolean(ROMAJI_PARAM, false); > + } > + > + public TokenStream create(TokenStream input) { > + return new KuromojiReadingFormFilter(input, useRomaji); > + } > +} --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
