Hi All, Objective: I want to create a filter that generates multiple tokens (described below) from the input stream, and I want to put all of the generated tokens at the same position, i.e. position 1.
Although there is already a tokenizer (PathHierarchyTokenizerFactory) for similar purpose but I also want my tokens to be stemmed so to achieve my objective I created a filter, please look at the source code below (I am not an Java expert, so code may not be optimized): // File: ExtendedNameFilter.java // Purpose: To combine multiple tokens such that "apache solr foundation" generates tokens "apachsolrfoundat", "solrfoundat", "foundat" package org.apache.lucene.analysis; import java.io.IOException; import java.util.LinkedList; import java.util.ArrayList; import java.util.Iterator; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.util.CharacterUtils; import org.apache.lucene.util.Version; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; public final class ExtendedNameFilter extends TokenFilter { private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private PositionIncrementAttribute posIncAttr; private OffsetAttribute setOffsetAttr; private final int extendedWordCount; public ExtendedNameFilter(Version matchVersion, TokenStream in, int extendedWordCount) { super(in); CharacterUtils.getInstance(matchVersion); this.extendedWordCount = extendedWordCount; this.posIncAttr = addAttribute(PositionIncrementAttribute.class); this.setOffsetAttr = addAttribute(OffsetAttribute.class); } LinkedList<String> list = new LinkedList<String>(); ArrayList<Integer> startOffsetList = new ArrayList<Integer>(); int endOffset = 0; int count = 0; @Override public final boolean incrementToken() throws IOException { Iterator<String> iterator; int len = 0; while(input.incrementToken()) { list.add(termAtt.toString()); startOffsetList.add(setOffsetAttr.startOffset()); endOffset = setOffsetAttr.endOffset(); } iterator = 
list.iterator(); len = list.size(); if (len > 0 && (extendedWordCount < 0 || count < extendedWordCount)) { generateToken(iterator); return true; } else { return false; } } public void generateToken(Iterator<String> iterator) { termAtt.setEmpty(); while (iterator.hasNext()){ termAtt.append((CharSequence) iterator.next()); } list.removeFirst(); if(count == 0) { posIncAttr.setPositionIncrement(1); } else { posIncAttr.setPositionIncrement(0); } setOffsetAttr.setOffset(startOffsetList.get(count),endOffset); count++; } } // Code Ends On analysis page of solr it worked fine, I've shared screenshot of analysis page on google, anyone can see this by click on below link https://docs.google.com/file/d/0BxNUkIJt2ma3TUN0YUF1dW1Pc2s/edit?usp=sharing<https://docs.google.com/file/d/0BxNUkIJt2ma3SEE2SDBLTkpETE0/edit?usp=sharing> but while indexing documents Solr gives following exception: Apr 6, 2013 12:05:45 PM org.apache.solr.common.SolrException log SEVERE: java.lang.IllegalArgumentException: first position increment must be > 0 (got 0) at org.apache.lucene.index.DocInverterPerField.processFields(DocInverterPerField.java:125) at org.apache.lucene.index.DocFieldProcessor.processDocument(DocFieldProcessor.java:254) at org.apache.lucene.index.DocumentsWriterPerThread.updateDocument(DocumentsWriterPerThread.java:256) at org.apache.lucene.index.DocumentsWriter.updateDocument(DocumentsWriter.java:376) at org.apache.lucene.index.IndexWriter.updateDocument(IndexWriter.java:1473) at org.apache.solr.update.DirectUpdateHandler2.addDoc(DirectUpdateHandler2.java:206) at org.apache.solr.update.processor.RunUpdateProcessor.processAdd(RunUpdateProcessorFactory.java:69) at org.apache.solr.update.processor.UpdateRequestProcessor.processAdd(UpdateRequestProcessor.java:51) at org.apache.solr.update.processor.DistributedUpdateProcessor.versionAdd(DistributedUpdateProcessor.java:477) at org.apache.solr.update.processor.DistributedUpdateProcessor.processAdd(DistributedUpdateProcessor.java:346) at 
org.apache.solr.update.processor.LogUpdateProcessor.processAdd(LogUpdateProcessorFactory.java:100) at org.apache.solr.handler.loader.XMLLoader.processUpdate(XMLLoader.java:246) at org.apache.solr.handler.loader.XMLLoader.load(XMLLoader.java:173) at org.apache.solr.handler.UpdateRequestHandler$1.load(UpdateRequestHandler.java:92) at org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:74) at org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135) at org.apache.solr.core.SolrCore.execute(SolrCore.java:1797) at org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:637) at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:343) at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:141) at org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1307) at org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453) at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137) at org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:560) at org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231) at org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1072) at org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:382) at org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193) at org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1006) at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135) at org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255) at org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154) at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116) at org.eclipse.jetty.server.Server.handle(Server.java:365) at 
org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:485) at org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53) at org.eclipse.jetty.server.AbstractHttpConnection.headerComplete(AbstractHttpConnection.java:926) at org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.headerComplete(AbstractHttpConnection.java:988) at org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:642) at org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:235) at org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72) at org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264) at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608) at org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543) at java.lang.Thread.run(Unknown Source) Am I doing something wrong in the code? Please guide me on how to overcome this exception. I am also not sure whether it is related to the blank starting tokens in the output of the last filter in the chain. -- Regards Abhishek Pratap Singh