Hi,

I have created a custom analyzer with a custom tokenizer that reads ANTLR tokens from a file and converts them into Lucene tokens by setting them on the attribute source.
It works fine if I add one document to the index: I can search with a query and get hits. The problem comes when I add another document: the custom tokenizer still seems to hold the same old reader instance, positioned at the end of the first file, so it never tokenizes the contents of subsequently added files.

My document looks something like this:

Document doc = ...
doc.add(new StringField(FIELD_FILE_PATH, getIndexFilePath(resource), Store.YES));
doc.add(new StringField(FIELD_FILE_TYPE, ifile.getFileExtension().toLowerCase(), Store.YES));

FieldType fieldType = new FieldType();
fieldType.setStoreTermVectors(true);
fieldType.setStoreTermVectorOffsets(true);
fieldType.setIndexed(true);
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
fieldType.setStoreTermVectorPayloads(true);
fieldType.setStoreTermVectorPositions(true);
doc.add(new Field(FIELD_CONTENTS, new FileReader(file), fieldType));

My custom analyzer:

public class FilesAnalyzer extends Analyzer {

    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        return new TokenStreamComponents(new FilesTokenizer(reader));
    }
}

My tokenizer:

public class FilesTokenizer extends Tokenizer {

    /** Tokenizer constants */
    // TODO just the name of the included file, not the path relative to the current propath.
    // TODO also handle the case of spaces: "" or '' appears as {my.i} or similar (including params and other junk).
    public static final String INCLUDE_NAME = "include_name";

    // TODO the name of the procedure to RUN; can this also be given as a path, like an include?
    // TODO also handle the case of spaces: "" or '' appears as RUN myProc.p.
    public static final String PROC_NAME = "proc_name";

    public static final String[] ALL_TOKEN_TYPES = new String[] { INCLUDE_NAME, PROC_NAME };

    boolean done = false;
    private Reader input = null;
    protected Lexer lexer;

    // Token term attributes
    private CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class);
    private OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
    private TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
    private PayloadAttribute payloadAttribute = addAttribute(PayloadAttribute.class);

    protected FilesTokenizer(Reader in) {
        super(in);
        if (in instanceof BufferedReader) {
            this.input = in;
        } else {
            this.input = new BufferedReader(in);
        }
        ASTInfo astInfo = new ASTInfo(null);
        astInfo.setMonitor(new NullProgressMonitor()); // files etc., which is not really needed
        lexer = new Lexer(input) {
            int totalOffset = 0;
            int currentLineOffset = 0;

            @Override
            public antlr.Token nextToken() throws TokenStreamException {
                antlr.Token token = super.nextToken();
                // TODO move to interface IASTToken: methods get/setOffset
                AntlrToken myToken = new AntlrToken(token);
                myToken.setOffset(totalOffset);
                return myToken;
            }

            @Override
            public void consume() throws CharStreamException {
                super.consume();
                currentLineOffset = inputState.getColumn();
            }

            public antlr.Token getTokenObject() {
                return super.getTokenObject();
            }

            @Override
            public void match(char arg0) throws MismatchedCharException, CharStreamException {
                if (arg0 == '\n') {
                    totalOffset += currentLineOffset;
                    currentLineOffset = 0;
                }
                super.match(arg0);
            }

            @Override
            public void newline() {
                super.newline();
            }

            @Override
            public void setColumn(int c) {
                super.setColumn(c);
                currentLineOffset += c;
            }
        };
        lexer.setASTInfo(astInfo);
    }

    public final TokenType next() throws java.io.IOException {
        // We are not interested in all ANTLR tokens: keep consuming them until
        // we find a token of interest, either an include or a RUN.
        antlr.Token nextAntlrToken = null;
        try {
            nextAntlrToken = lexer.nextToken();
            int type = nextAntlrToken.getType();
            while (type != TokenTypes.EOF) {
                if (type == ParserTokenTypes.RUN) { // RUN
                    String text = nextAntlrToken.getText();
                    nextAntlrToken = lexer.nextToken();
                    type = nextAntlrToken.getType();
                    if (type == TokenTypes.IDENT) {
                        // TODO move to interface IASTToken: methods get/setOffset
                        int offset = ((AntlrToken) nextAntlrToken).getOffset();
                        // TODO handle the case of a value expression here
                        text = nextAntlrToken.getText();
                        return new TokenType(text, offset, offset + text.length(), PROC_NAME);
                    }
                }
                // TODO use a proper token type, probably INCLUDE_REF
                if (type == TokenTypes.IDENT) {
                    // Include or run, for building a digraph; we would need to index
                    // all identifiers anyway.
                    // TODO should we include more info (like global)? That would get
                    // pretty complicated and have performance issues: we would need
                    // to "remember" too many tokens.
                    // Case 1: include. The identifier is of the form
                    // {/abc/d/e.p "someArg"}; we need to store both /abc/d/e.p and
                    // e.p. Note that most customer workspaces don't have repeating
                    // filenames.
                    // TODO also handle the case of spaces in the include name.
                    String text = nextAntlrToken.getText();
                    if (text.startsWith("{")) { // include
                        // TODO handle all possible cases in the path (spaces, ...),
                        // preferably with a regex.
                        if (text.contains("/")) {
                            text = text.substring(text.lastIndexOf('/') + 1);
                        }
                        if (text.indexOf(' ') != -1) {
                            text = text.substring(0, text.indexOf(' '));
                        }
                        if (text.startsWith("{")) {
                            text = text.substring(1);
                        }
                        if (text.endsWith("}")) {
                            text = text.substring(0, text.length() - 1);
                        }
                        int lineOffset = ((AntlrToken) nextAntlrToken).getOffset();
                        lineOffset += nextAntlrToken.getText().indexOf(text);
                        // {optional "optional /path / name of file"
                        return new TokenType(text, lineOffset, lineOffset + text.length(), INCLUDE_NAME);
                    } else {
                        nextAntlrToken = lexer.nextToken();
                        type = nextAntlrToken.getType();
                    }
                } else {
                    nextAntlrToken = lexer.nextToken();
                    type = nextAntlrToken.getType();
                }
            }
        } catch (TokenStreamException e) {
            // TODO proper error handling
            e.printStackTrace();
        }
        return null;
    }

    @Override
    public boolean incrementToken() throws IOException {
        clearAttributes();
        TokenType token = next();
        if (token != null) {
            charTermAttribute.append(token.getTokenText());
            offsetAttribute.setOffset(token.getStartOffset(), token.getEndOffset());
            typeAttribute.setType(token.getType());
            payloadAttribute.setPayload(new BytesRef(token.getType().getBytes()));
            return true;
        }
        return false;
    }
}

Should I be doing a reset of the reader input somewhere, or am I missing anything else here?
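For what it's worth, the problem shows up even without an IndexWriter, just by reusing the analyzer the way IndexWriter does: in a quick loop like the sketch below (the file paths are placeholders), only the first file produces tokens for me, matching what I see in the index.

Analyzer analyzer = new FilesAnalyzer();
for (String path : new String[] { "first.p", "second.p" }) {
    Reader reader = new FileReader(path);
    try {
        // tokenStream() reuses the cached TokenStreamComponents and calls
        // Tokenizer.setReader() with the new reader before returning.
        TokenStream ts = analyzer.tokenStream(FIELD_CONTENTS, reader);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(path + " -> " + term);
        }
        ts.end();
        ts.close();
    } finally {
        reader.close();
    }
}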
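From what I can tell from the Lucene 4.x javadocs, the Analyzer caches its TokenStreamComponents and reuses them for every subsequent document, calling Tokenizer.setReader() and then reset() each time, while createComponents() only runs once per thread and field. If that is right, my constructor-built lexer keeps wrapping the first file's reader forever; worse, my private input field shadows Tokenizer's protected input field, so my class never even sees the reader that setReader() swaps in. Is the fix to move the lexer construction into reset(), something like the sketch below? (createLexer() is just a hypothetical helper holding the anonymous-Lexer setup that currently lives in my constructor.)

@Override
public void reset() throws IOException {
    super.reset();
    // super.input is the reader Lucene swapped in via setReader(); my own
    // private 'input' field shadows it and still points at the first file.
    Reader current = super.input;
    this.input = (current instanceof BufferedReader) ? current : new BufferedReader(current);
    // Hypothetical helper: the anonymous-Lexer construction moved out of the constructor.
    this.lexer = createLexer(this.input);
}

Or would it be cleaner not to touch the reader in the constructor at all, and do all the lexer setup lazily here?

TIA,
Nischal Y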