For the time being I seem to be able to do this by using a custom
TokenFilterFactory class as follows.
If there is a better approach, or if this approach seems flawed, let me know.
Thanks.
package com.wolfram.textsearch;
import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ar.ArabicAnalyzer;
import org.apache.lucene.analysis.bg.BulgarianAnalyzer;
import org.apache.lucene.analysis.ca.CatalanAnalyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.ckb.SoraniAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.cz.CzechAnalyzer;
import org.apache.lucene.analysis.el.GreekAnalyzer;
import org.apache.lucene.analysis.eu.BasqueAnalyzer;
import org.apache.lucene.analysis.fa.PersianAnalyzer;
import org.apache.lucene.analysis.ga.IrishAnalyzer;
import org.apache.lucene.analysis.gl.GalicianAnalyzer;
import org.apache.lucene.analysis.hi.HindiAnalyzer;
import org.apache.lucene.analysis.hy.ArmenianAnalyzer;
import org.apache.lucene.analysis.id.IndonesianAnalyzer;
import org.apache.lucene.analysis.lt.LithuanianAnalyzer;
import org.apache.lucene.analysis.lv.LatvianAnalyzer;
import org.apache.lucene.analysis.ro.RomanianAnalyzer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.th.ThaiAnalyzer;
import org.apache.lucene.analysis.tr.TurkishAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
/**
 * Token filter factory that wraps a stream in a {@link StopFilter} whose stop
 * words are taken from the word list shipped with Lucene for a given language.
 *
 * <p>The language is selected with the optional {@code language} argument
 * (for example {@code "French"}); when it is omitted, {@code "English"} is
 * used. Language names are matched exactly (case-sensitive). Depending on the
 * language, the stop words come from one of three places:
 * <ul>
 *   <li>a {@code stopwords.txt} resource resolved relative to the language's
 *       analyzer class,</li>
 *   <li>a Snowball-format {@code <language>_stop.txt} resource resolved
 *       relative to {@link SnowballFilter}, or</li>
 *   <li>{@link StandardAnalyzer#STOP_WORDS_SET} for English.</li>
 * </ul>
 */
public class MultiLanguageStopWordFilterFactory extends TokenFilterFactory
{
    /** Language used when the {@code language} argument is not supplied. */
    private static final String DEFAULT_LANGUAGE = "English";

    /** Resource name of the word list stored next to each language analyzer class. */
    private final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";

    /** Comment marker used in the per-analyzer {@code stopwords.txt} resources. */
    private static final String DEFAULT_COMMENT_MARKER = "#";

    /** Name of the language whose stop words this factory filters out. */
    String language = DEFAULT_LANGUAGE;

    /** Stop word set resolved once at construction time and shared by every created filter. */
    private final CharArraySet stopWords;

    /**
     * Creates the factory and eagerly loads the stop words for the requested language.
     *
     * @param args factory arguments; the only recognized key is {@code language},
     *             which defaults to {@code "English"} when absent
     *
     * @throws IllegalArgumentException if {@code args} contains unrecognized keys
     *                                  or names a language this factory does not support
     * @throws IOException if the stop word resource cannot be read
     */
    public MultiLanguageStopWordFilterFactory(Map<String,String> args) throws
    IOException
    {
        super(args);
        // Use the defaulting overload: the two-argument get() returns null for a
        // missing key, which would make the switch on the language throw an NPE.
        language = get(args, "language", DEFAULT_LANGUAGE);
        if (!args.isEmpty())
        {
            throw new IllegalArgumentException("Unknown parameters: " + args);
        }
        stopWords = resolveStopWords(language);
    }

    /**
     * Maps a language name to its stop word set.
     *
     * @param language the language name, e.g. {@code "German"}.
     *
     * @return the stop words for that language.
     *
     * @throws IllegalArgumentException if the language is not supported.
     * @throws IOException if the stop word resource cannot be read.
     */
    private static CharArraySet resolveStopWords(String language) throws IOException
    {
        // Exactly one of these is set by the switch for every non-English language.
        Class<? extends Analyzer> analyzerClass = null;
        String snowballFile = null;

        switch (language)
        {
            case "English":
                return StandardAnalyzer.STOP_WORDS_SET;

            // Languages whose word list is a stopwords.txt next to the analyzer class.
            case "Arabic":
                analyzerClass = ArabicAnalyzer.class;
                break;
            case "Armenian":
                analyzerClass = ArmenianAnalyzer.class;
                break;
            case "Basque":
                analyzerClass = BasqueAnalyzer.class;
                break;
            case "Bulgarian":
                analyzerClass = BulgarianAnalyzer.class;
                break;
            case "Catalan":
                analyzerClass = CatalanAnalyzer.class;
                break;
            case "Chinese":
            case "Japanese":
            case "Korean":
                analyzerClass = CJKAnalyzer.class;
                break;
            case "Czech":
                analyzerClass = CzechAnalyzer.class;
                break;
            case "GaelicIrish":
                analyzerClass = IrishAnalyzer.class;
                break;
            case "Galician":
                analyzerClass = GalicianAnalyzer.class;
                break;
            case "Greek":
                analyzerClass = GreekAnalyzer.class;
                break;
            case "Hindi":
                analyzerClass = HindiAnalyzer.class;
                break;
            case "Indonesian":
                analyzerClass = IndonesianAnalyzer.class;
                break;
            case "KurdishCentral":
                analyzerClass = SoraniAnalyzer.class;
                break;
            case "Latvian":
                analyzerClass = LatvianAnalyzer.class;
                break;
            case "Lithuanian":
                analyzerClass = LithuanianAnalyzer.class;
                break;
            case "Persian":
                analyzerClass = PersianAnalyzer.class;
                break;
            case "Romanian":
                analyzerClass = RomanianAnalyzer.class;
                break;
            case "Thai":
                analyzerClass = ThaiAnalyzer.class;
                break;
            case "Turkish":
                analyzerClass = TurkishAnalyzer.class;
                break;

            // Languages whose word list is a Snowball-format file next to SnowballFilter.
            case "Danish":
                snowballFile = "danish_stop.txt";
                break;
            case "Dutch":
                snowballFile = "dutch_stop.txt";
                break;
            case "Finnish":
                snowballFile = "finnish_stop.txt";
                break;
            case "French":
                snowballFile = "french_stop.txt";
                break;
            case "German":
                snowballFile = "german_stop.txt";
                break;
            case "Hungarian":
                snowballFile = "hungarian_stop.txt";
                break;
            case "Italian":
                snowballFile = "italian_stop.txt";
                break;
            case "Norwegian":
                snowballFile = "norwegian_stop.txt";
                break;
            case "Portuguese":
                snowballFile = "portuguese_stop.txt";
                break;
            case "Russian":
                snowballFile = "russian_stop.txt";
                break;
            case "Spanish":
                snowballFile = "spanish_stop.txt";
                break;
            case "Swedish":
                snowballFile = "swedish_stop.txt";
                break;

            default:
                // Fail with a clear message instead of an NPE on a null analyzer class.
                throw new IllegalArgumentException("Unsupported language: " + language);
        }

        if (snowballFile != null)
        {
            return WordlistLoader.getSnowballWordSet(
                IOUtils.getDecodingReader(SnowballFilter.class, snowballFile,
                    StandardCharsets.UTF_8));
        }
        return loadStopwordSet(false, analyzerClass, DEFAULT_STOPWORD_FILE,
            DEFAULT_COMMENT_MARKER);
    }

    /**
     * Load a stop word set from a UTF-8 resource located relative to a class.
     *
     * @param ignoreCase whether the returned set should ignore case.
     * @param aClass the associated analyzer; the resource is resolved relative to it.
     * @param resource the file.
     * @param comment the character used in the file to indicate a comment.
     *
     * @return a set of stopwords.
     *
     * @throws IOException if the resource cannot be read.
     */
    static CharArraySet loadStopwordSet(
        boolean ignoreCase,
        final Class<? extends Analyzer> aClass, final String resource,
        final String comment) throws IOException
    {
        // The Class-based overload opens the resource itself, so the underlying
        // stream is not left open if creating the reader fails; try-with-resources
        // then closes the reader on every path.
        try (Reader reader = IOUtils.getDecodingReader(aClass, resource,
                StandardCharsets.UTF_8))
        {
            return WordlistLoader.getWordSet(reader, comment,
                new CharArraySet(16, ignoreCase));
        }
    }

    /**
     * Wraps the given stream in a {@link StopFilter} that uses this factory's stop words.
     *
     * @param input the token stream to filter.
     *
     * @return the filtering token stream.
     */
    @Override
    public TokenStream create(TokenStream input)
    {
        return new StopFilter(input, stopWords);
    }
}
----- On May 5, 2016, at 2:02 PM, danielb <[email protected]> wrote:
> I'd like to use CustomAnalyzer to create an analyzer that is much like
> the FrenchAnalyzer.
> In doing that, I'm using StopFilterFactory.
> But I'm unsure how to point it at "french_stop.txt", i.e. the file that
> FrenchAnalyzer is using here:
> public final class FrenchAnalyzer extends StopwordAnalyzerBase {
> public final static String DEFAULT_STOPWORD_FILE = "french_stop.txt";
> ...
> The typical use of StopFilterFactory:
> .addTokenFilter(StopFilterFactory.class, "ignoreCase", "false", "words",
> "french_stop.txt", "format", "wordset")
> But this looks for a file "french_stop.txt" and can't find it.
> (presumably it's looking in a completely different location from
> FrenchAnalyzer)
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: [email protected]
> For additional commands, e-mail: [email protected]