We have a problem with a custom tokenizer for Solr. We developed our own tokenizer that extracts phone numbers from the text and adds extra tokens for them to the token stream. Unfortunately, these additional tokens are not indexed by Solr. For example, the text "Hello (111) 222-33-44 all!" is expanded into the tokens "2223344", "1112223344", "71112223344", "81112223344", "hello", "111", "222", "33", "44", "all", but searching for the tokens "2223344", "1112223344", "71112223344", "81112223344" does not work. What could be the cause? We are using Solr 4.3.1. The sources follow:
public class HJStandardTokenizerFactory extends TokenizerFactory{ private final int maxTokenLength; public HJStandardTokenizerFactory(Map<String, String> args) { super(args); assureMatchVersion(); maxTokenLength = getInt(args, "maxTokenLength", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH); if (!args.isEmpty()) { throw new IllegalArgumentException("Unknown parameters: " + args); } } @Override public Tokenizer create(AttributeSource.AttributeFactory factory, Reader input) { String content = null; HJPhoneNumberHelper hjPhoneNumberHelper = null; StringReader stringReader = null; try { content = IOUtils.toString(input); hjPhoneNumberHelper = new HJPhoneNumberHelper(content); stringReader = new StringReader(content); } catch (IOException e) { } HJStandardTokenizer tokenizer = new HJStandardTokenizer(luceneMatchVersion, factory, stringReader, hjPhoneNumberHelper.getPhoneNumbers()); tokenizer.setMaxTokenLength(maxTokenLength); return tokenizer; } } public class HJStandardTokenizer extends Tokenizer{ private StandardTokenizerInterface scanner; public static final int ALPHANUM = 0; /** @deprecated (3.1) */ @Deprecated public static final int APOSTROPHE = 1; /** @deprecated (3.1) */ @Deprecated public static final int ACRONYM = 2; /** @deprecated (3.1) */ @Deprecated public static final int COMPANY = 3; public static final int EMAIL = 4; /** @deprecated (3.1) */ @Deprecated public static final int HOST = 5; public static final int NUM = 6; /** @deprecated (3.1) */ @Deprecated public static final int CJ = 7; /** @deprecated (3.1) */ @Deprecated public static final int ACRONYM_DEP = 8; public static final int SOUTHEAST_ASIAN = 9; public static final int IDEOGRAPHIC = 10; public static final int HIRAGANA = 11; public static final int KATAKANA = 12; public static final int HANGUL = 13; /** String token types that correspond to token type int constants */ public static final String [] TOKEN_TYPES = new String [] { "<ALPHANUM>", "<APOSTROPHE>", "<ACRONYM>", "<COMPANY>", "<EMAIL>", 
"<HOST>", "<NUM>", "<CJ>", "<ACRONYM_DEP>", "<SOUTHEAST_ASIAN>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>", "<HANGUL>" }; private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH; private static class PhoneTextPosition { public int position; public int length; public LinkedList<String> variants = new LinkedList<String>(); private PhoneTextPosition(int position, int length, Collection<String> phoneVariants) { this.position = position; this.length = length; this.variants.addAll(phoneVariants); } @Override public int hashCode() { return (new Integer(position).hashCode()); } @Override public boolean equals(Object obj) { if (obj instanceof PhoneTextPosition) { PhoneTextPosition otherObj = (PhoneTextPosition)obj; if (position == otherObj.position && length == otherObj.length) return true; } return false; } } private LinkedList<PhoneTextPosition> phoneVariants; /** Set the max allowed token length. Any token longer * than this is skipped. */ public void setMaxTokenLength(int length) { this.maxTokenLength = length; } /** @see #setMaxTokenLength */ public int getMaxTokenLength() { return maxTokenLength; } /** * Creates a new instance of the {@link org.apache.lucene.analysis.standard.StandardTokenizer}. Attaches * the <code>input</code> to the newly created JFlex scanner. 
* * @param input The input reader * * See http://issues.apache.org/jira/browse/LUCENE-1068 */ public HJStandardTokenizer(Version matchVersion, Reader input) { super(input); init(matchVersion); } /** * Creates a new StandardTokenizer with a given {@link org.apache.lucene.util.AttributeSource.AttributeFactory} */ public HJStandardTokenizer(Version matchVersion, AttributeFactory factory, Reader input, Collection<HJPhoneNumber> phones) { super(factory, input); init(matchVersion); phoneVariants = new LinkedList<PhoneTextPosition>(); for (HJPhoneNumber phone : phones) { PhoneTextPosition position = new PhoneTextPosition( phone.getPositionInText(), phone.getLengthInText(), phone.getAllVariants()); phoneVariants.add(position); } } private final void init(Version matchVersion) { this.scanner = new StandardTokenizerImpl(null); } // this tokenizer generates three attributes: // term offset, positionIncrement and type private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); /* * (non-Javadoc) * * @see org.apache.lucene.analysis.TokenStream#next() */ @Override public final boolean incrementToken() throws IOException { clearAttributes(); if (phoneVariants.size() > 0) { PhoneTextPosition p = phoneVariants.peek(); try { String variant = p.variants.poll(); if (StringUtils.isNotEmpty(variant)) { posIncrAtt.setPositionIncrement(1); char[] buf = variant.toCharArray(); termAtt.resizeBuffer(buf.length); termAtt.copyBuffer(buf, 0, buf.length); final int start = p.position; offsetAtt.setOffset(correctOffset(start), correctOffset(start+p.length)); typeAtt.setType(HJStandardTokenizer.TOKEN_TYPES[HJStandardTokenizer.NUM]); return true; } } finally { if (p.variants.size() == 0) { phoneVariants.remove(p); } } } int 
posIncr = 1; while(true) { int tokenType = scanner.getNextToken(); if (tokenType == StandardTokenizerInterface.YYEOF) { return false; } if (scanner.yylength() <= maxTokenLength) { posIncrAtt.setPositionIncrement(posIncr); scanner.getText(termAtt); final int start = scanner.yychar(); offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.length())); // This 'if' should be removed in the next release. For now, it converts // invalid acronyms to HOST. When removed, only the 'else' part should // remain. if (tokenType == HJStandardTokenizer.ACRONYM_DEP) { typeAtt.setType(HJStandardTokenizer.TOKEN_TYPES[HJStandardTokenizer.HOST]); termAtt.setLength(termAtt.length() - 1); // remove extra '.' } else { typeAtt.setType(HJStandardTokenizer.TOKEN_TYPES[tokenType]); } return true; } else // When we skip a too-long term, we still increment the // position increment posIncr++; } } @Override public final void end() { // set final offset int finalOffset = correctOffset(scanner.yychar() + scanner.yylength()); offsetAtt.setOffset(finalOffset, finalOffset); } @Override public void reset() throws IOException { scanner.yyreset(input); } }