Hello all, I am trying to develop a custom tokenizer (please find the code below) and found an issue when adding multiple documents one after another.
it works fine when i add first document and when i add another document it's not calling "create" method from SampleTokeniserFactory.java but it calls directly reset method and then call incrementToken(). any one have an idea on this what's wrong in the code below? please share your thoughts on this. here is the class which extends TokeniserFactory class === SampleTokeniserFactory.java public class SampleTokeniserFactory extends TokenizerFactory { public SampleTokeniserFactory(Map<String, String> args) { super(args); } public SampleTokeniser create(AttributeFactory factory, Reader reader) { return new SampleTokeniser(factory, reader); } } here is the class which extends Tokenizer class ==== package ns.solr.analyser; import java.io.IOException; import java.io.Reader; import java.util.ArrayList; import java.util.List; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; public class SampleTokeniser extends Tokenizer { private List<Token> tokenList = new ArrayList<Token>(); int tokenCounter = -1; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); /** * Object that defines the offset attribute */ private final OffsetAttribute offsetAttribute = (OffsetAttribute) addAttribute(OffsetAttribute.class); /** * Object that defines the position attribute */ private final PositionIncrementAttribute position = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); public SampleTokeniser(AttributeFactory factory, Reader reader) { super(factory, reader); String textToProcess = null; try { textToProcess = readFully(reader); processText(textToProcess); } catch (IOException e) { e.printStackTrace(); } } public String readFully(Reader reader) throws IOException { char[] arr = new char[8 * 
1024]; // 8K at a time StringBuffer buf = new StringBuffer(); int numChars; while ((numChars = reader.read(arr, 0, arr.length)) > 0) { buf.append(arr, 0, numChars); } return buf.toString(); } public void processText(String textToProcess) { String wordsList[] = textToProcess.split(" "); int startOffset = 0, endOffset = 0; for (String word : wordsList) { endOffset = word.length(); Token aToken = new Token("Token." + word, startOffset, endOffset); aToken.setPositionIncrement(1); tokenList.add(aToken); startOffset = endOffset + 1; } } @Override public boolean incrementToken() throws IOException { clearAttributes(); tokenCounter++; if (tokenCounter < tokenList.size()) { Token aToken = tokenList.get(tokenCounter); termAtt.append(aToken); termAtt.setLength(aToken.length()); offsetAttribute.setOffset(correctOffset(aToken.startOffset()), correctOffset(aToken.endOffset())); position.setPositionIncrement(aToken.getPositionIncrement()); return true; } return false; } /** * close object * * @throws IOException */ public void close() throws IOException { super.close(); System.out.println("Close method called"); } /** * called when end method gets called * * @throws IOException */ public void end() throws IOException { super.end(); // setting final offset System.out.println("end called with final offset"); } /** * method reset the record * * @throws IOException */ public void reset() throws IOException { super.reset(); System.out.println("Reset Called"); tokenCounter = -1; } }