For the time being I seem to be able to do this by using a custom TokenFilterFactory class as follows.
If there is a better approach, or if this approach seems flawed, let me know. Thanks. package com.wolfram.textsearch; import java.io.IOException; import java.io.Reader; import java.nio.charset.StandardCharsets; import java.util.Map; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.ar.ArabicAnalyzer; import org.apache.lucene.analysis.bg.BulgarianAnalyzer; import org.apache.lucene.analysis.ca.CatalanAnalyzer; import org.apache.lucene.analysis.cjk.CJKAnalyzer; import org.apache.lucene.analysis.ckb.SoraniAnalyzer; import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.cz.CzechAnalyzer; import org.apache.lucene.analysis.el.GreekAnalyzer; import org.apache.lucene.analysis.eu.BasqueAnalyzer; import org.apache.lucene.analysis.fa.PersianAnalyzer; import org.apache.lucene.analysis.ga.IrishAnalyzer; import org.apache.lucene.analysis.gl.GalicianAnalyzer; import org.apache.lucene.analysis.hi.HindiAnalyzer; import org.apache.lucene.analysis.hy.ArmenianAnalyzer; import org.apache.lucene.analysis.id.IndonesianAnalyzer; import org.apache.lucene.analysis.lt.LithuanianAnalyzer; import org.apache.lucene.analysis.lv.LatvianAnalyzer; import org.apache.lucene.analysis.ro.RomanianAnalyzer; import org.apache.lucene.analysis.snowball.SnowballFilter; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.th.ThaiAnalyzer; import org.apache.lucene.analysis.tr.TurkishAnalyzer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.TokenFilterFactory; import org.apache.lucene.analysis.util.WordlistLoader; import org.apache.lucene.util.IOUtils; public class MultiLanguageStopWordFilterFactory extends TokenFilterFactory { String language = "English"; private CharArraySet stopWords; private final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; public MultiLanguageStopWordFilterFactory(Map<String,String> args) 
throws IOException { super(args); language = get(args, "language"); if (!args.isEmpty()) { throw new IllegalArgumentException("Unknown parameters: " + args); } int stopwordStyle = 0; String commentChar = "#"; Class<? extends Analyzer> analyzerClass = null; String stopwordFile = DEFAULT_STOPWORD_FILE; switch(language) { case "Arabic": analyzerClass = ArabicAnalyzer.class; break; case "Bulgarian": analyzerClass = BulgarianAnalyzer.class; break; case "Catalan": analyzerClass = CatalanAnalyzer.class; break; case "Chinese": analyzerClass = CJKAnalyzer.class; break; case "Japanese": analyzerClass = CJKAnalyzer.class; break; case "Korean": analyzerClass = CJKAnalyzer.class; break; case "KurdishCentral": analyzerClass = SoraniAnalyzer.class; break; case "Czech": analyzerClass = CzechAnalyzer.class; break; case "Danish": stopwordStyle = 1; stopwordFile = "danish_stop.txt"; break; case "German": stopwordStyle = 1; stopwordFile = "german_stop.txt"; break; case "Greek": analyzerClass = GreekAnalyzer.class; break; case "English": stopwordStyle = 2; break; case "Spanish": stopwordStyle = 1; stopwordFile = "spanish_stop.txt"; break; case "Basque": analyzerClass = BasqueAnalyzer.class; break; case "Persian": analyzerClass = PersianAnalyzer.class; break; case "Finnish": stopwordStyle = 1; stopwordFile = "finnish_stop.txt"; break; case "French": stopwordStyle = 1; stopwordFile = "french_stop.txt"; break; case "GaelicIrish": analyzerClass = IrishAnalyzer.class; break; case "Galician": analyzerClass = GalicianAnalyzer.class; break; case "Hindi": analyzerClass = HindiAnalyzer.class; break; case "Hungarian": stopwordStyle = 1; stopwordFile = "hungarian_stop.txt"; break; case "Armenian": analyzerClass = ArmenianAnalyzer.class; break; case "Indonesian": analyzerClass = IndonesianAnalyzer.class; break; case "Italian": stopwordStyle = 1; stopwordFile = "italian_stop.txt"; break; case "Lithuanian": analyzerClass = LithuanianAnalyzer.class; break; case "Latvian": analyzerClass = 
LatvianAnalyzer.class; break; case "Dutch": stopwordStyle = 1; stopwordFile = "dutch_stop.txt"; break; case "Norwegian": stopwordStyle = 1; stopwordFile = "norwegian_stop.txt"; break; case "Portuguese": stopwordStyle = 1; stopwordFile = "portuguese_stop.txt"; break; case "Romanian": analyzerClass = RomanianAnalyzer.class; break; case "Russian": stopwordStyle = 1; stopwordFile = "russian_stop.txt"; break; case "Swedish": stopwordStyle = 1; stopwordFile = "swedish_stop.txt"; break; case "Thai": analyzerClass = ThaiAnalyzer.class; break; case "Turkish": analyzerClass = TurkishAnalyzer.class; break; } if (stopwordStyle == 0) { stopWords = loadStopwordSet(false, analyzerClass, stopwordFile, commentChar); } else if (stopwordStyle == 1) { stopWords = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(SnowballFilter.class, stopwordFile, StandardCharsets.UTF_8)); } else if (stopwordStyle == 2) { stopWords = StandardAnalyzer.STOP_WORDS_SET; } } /** * Load a stop word set. * * @param aClass the associated analyzer. * @param resource the file. * @param comment the character used in the file to indicate a comment. * * @return a set of stopwords. * * @throws IOException */ static CharArraySet loadStopwordSet( boolean ignoreCase, final Class<? extends Analyzer> aClass, final String resource, final String comment) throws IOException { Reader reader = null; try { reader = IOUtils.getDecodingReader(aClass.getResourceAsStream(resource), StandardCharsets.UTF_8); return WordlistLoader.getWordSet(reader, comment, new CharArraySet(16, ignoreCase)); } finally { IOUtils.close(reader); } } @Override public TokenStream create(TokenStream input) { StopFilter stopFilter = new StopFilter(input, stopWords); return stopFilter; } } ----- On May 5, 2016, at 2:02 PM, danielb <dani...@wolfram.com> wrote: > I'd like to use CustomAnalyzer to create an analyzer that is much like > the FrenchAnalyzer. > In doing that, I'm using StopFilterFactory. 
> But I'm unsure how to point it to use "french_stop.txt", i.e., what
> FrenchAnalyzer is using here:
>
> public final class FrenchAnalyzer extends StopwordAnalyzerBase {
>   public final static String DEFAULT_STOPWORD_FILE = "french_stop.txt";
>   ...
>
> The typical use of StopFilterFactory:
>
> .addTokenFilter(StopFilterFactory.class, "ignoreCase", "false", "words",
>     "french_stop.txt", "format", "wordset")
>
> But this looks for a file "french_stop.txt" and can't find it.
> (presumably it's looking in a completely different location from
> FrenchAnalyzer)
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: java-user-unsubscr...@lucene.apache.org
> For additional commands, e-mail: java-user-h...@lucene.apache.org