Oh sorry, please ignore my previous code snippet; what I meant to ask was: should I do this by checking the position increment instead?
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.*;

import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;

/**
 * Pass-through token filter that records which surface forms were reduced to which stem.
 *
 * <p>Every token is forwarded unchanged. Whenever a token arrives with a position increment of
 * {@code 0}, it is treated as the stemmed variant of the token that immediately preceded it in
 * the same stream, and the pair is added to a shared {@code stem -> originals} mapping.
 *
 * <p>NOTE(review): this assumes the upstream chain emits the original token first and the
 * stemmed token stacked at the same position (as described for the KeywordRepeatFilter +
 * k-stem + RemoveDuplicatesTokenFilter chain). Any other filter that stacks tokens at
 * increment 0 (e.g. synonyms) would be recorded as well -- confirm against the analyzer chain.
 */
public class MwKStemCollectFilter extends TokenFilter {

    /**
     * Mapping collected across all filter instances: stem -> original forms, in insertion
     * order. Every access must hold the monitor of this map.
     */
    private static final Map<String, Set<String>> stemWordMapping = new LinkedHashMap<>();

    private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
    private final PositionIncrementAttribute posAttribute =
            addAttribute(PositionIncrementAttribute.class);

    /**
     * Term text of the previous token of THIS stream. Kept per instance: a static field would
     * be shared by every filter instance, so interleaved streams could pair unrelated tokens.
     */
    private String previousTerm = "";

    MwKStemCollectFilter(TokenStream in) {
        super(in);
    }

    /**
     * Advances the wrapped stream by one token and, if that token is stacked on the previous
     * one (position increment {@code 0}), records {@code current -> previous} in the mapping.
     *
     * @return {@code true} if a token was produced, {@code false} at end of stream.
     * @throws IOException If there is a low-level I/O error.
     */
    @Override
    public final boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
            return false;
        }

        String currentTerm = termAttribute.toString();
        if (posAttribute.getPositionIncrement() == 0 && !previousTerm.isEmpty()) {
            recordMapping(currentTerm, previousTerm);
        }
        previousTerm = currentTerm;
        return true;
    }

    /**
     * Resets the wrapped stream and forgets the previous term, so that the first token of a new
     * stream is never paired with the last token of the stream processed before it.
     *
     * @throws IOException If the wrapped stream fails to reset.
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        previousTerm = "";
    }

    /**
     * Returns a snapshot of everything collected so far.
     *
     * @return an independent copy of the {@code stem -> originals} mapping; changing it does
     *     not affect the collector.
     */
    public static Map<String, Set<String>> getStemWordMapping() {
        synchronized (stemWordMapping) {
            Map<String, Set<String>> snapshot = new LinkedHashMap<>();
            for (Map.Entry<String, Set<String>> entry : stemWordMapping.entrySet()) {
                snapshot.put(entry.getKey(), new LinkedHashSet<>(entry.getValue()));
            }
            return snapshot;
        }
    }

    /** Adds {@code original} to the set of surface forms recorded for {@code stem}. */
    private static void recordMapping(String stem, String original) {
        synchronized (stemWordMapping) {
            Set<String> originals = stemWordMapping.get(stem);
            if (originals == null) {
                originals = new LinkedHashSet<>();
                stemWordMapping.put(stem, originals);
            }
            originals.add(original);
        }
    }
}

Sincerely, --Xiaolong On Fri, Feb 3, 2017 at 1:16 PM, Xiaolong Zheng <zhengxiaol...@gmail.com> wrote: > Hello, > > I am trying collect stemming changes in my search index during the > indexing time. So I could collect a list of stemmed word -> [variety > original word] (e.g: plot -> [plots, plotting, plotted]) for a later use. 
> > I am using k-stem filter + KeywordRepeatFilter > + RemoveDuplicatesTokenFilter to produce the tokens. I am wondering what's > the best way to collect such information? > > I am thinking of comparing the term buffer; is this the right way to do it? > > > > import org.apache.lucene.analysis.TokenFilter; > import org.apache.lucene.analysis.TokenStream; > import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; > > import java.io.IOException; > import java.util.*; > > public class FilterChangeWrapper extends TokenFilter { > private final TokenFilter fWrappedFilter; > private final CharTermAttribute termAttribute = > addAttribute(CharTermAttribute.class); > private final Map<String,Set<String>> fMappings = new HashMap<>(); > > public FilterChangeWrapper(TokenStream in, TokenFilter wrappedFilter) { > super(in); > fWrappedFilter = wrappedFilter; > } > > @Override > public boolean incrementToken() throws IOException { > char[] startingTerm = termAttribute.buffer(); > boolean result = fWrappedFilter.incrementToken(); > char[] endingTerm = termAttribute.buffer(); > if (!Arrays.equals(startingTerm, endingTerm)) { > addMapping(startingTerm, endingTerm); > } > return result; > } > > private void addMapping(char[] startingTerm, char[] endingTerm) { > String startingString = new String(startingTerm); > String endingString = new String(endingTerm); > if (!fMappings.containsKey(startingString)) { > fMappings.put(startingString, new HashSet<String>()); > } > > fMappings.get(startingString).add(endingString); > } > > public Map<String,Set<String>> getMappings() { > return Collections.unmodifiableMap(fMappings); > } > } > > > > > Thanks, > Xiaolong > >