Hello, I am trying collect stemming changes in my search index during the indexing time. So I could collect a list of stemmed word -> [variety original word] (e.g: plot -> [plots, plotting, plotted]) for a later use.
I am using k-stem filter + KeywordRepeatFilter + RemoveDuplicatesTokenFilter to produce the tokens. I am wondering what's the best way to collecting such information? I am think by comparing the term buffer, is this the right way to do it? import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import java.io.IOException; import java.util.*; public class FilterChangeWrapper extends TokenFilter { private final TokenFilter fWrappedFilter; private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class); private final Map<String,Set<String>> fMappings = new HashMap<>(); public FilterChangeWrapper(TokenStream in, TokenFilter wrappedFilter) { super(in); fWrappedFilter = wrappedFilter; } @Override public boolean incrementToken() throws IOException { char[] startingTerm = termAttribute.buffer(); boolean result = fWrappedFilter.incrementToken(); char[] endingTerm = termAttribute.buffer(); if (!Arrays.equals(startingTerm, endingTerm)) { addMapping(startingTerm, endingTerm); } return result; } private void addMapping(char[] startingTerm, char[] endingTerm) { String startingString = new String(startingTerm); String endingString = new String(endingTerm); if (!fMappings.containsKey(startingString)) { fMappings.put(startingString, new HashSet<String>()); } fMappings.get(startingString).add(endingString); } public Map<String,Set<String>> getMappings() { return Collections.unmodifiableMap(fMappings); } } Thanks, Xiaolong