See if this helps ... You can supply your own sets of characters for "WordDelimitingChars" and "IgnoreChars".
/// <summary>
/// Analyzer that delegates all tokenization to <see cref="CustomCharTokenizer"/>.
/// </summary>
public class CustomAnalyzer : Lucene.Net.Analysis.Analyzer
{
    public CustomAnalyzer() : base()
    {
    }

    public override Lucene.Net.Analysis.TokenStream TokenStream(String fieldName, System.IO.TextReader reader)
    {
        return new CustomCharTokenizer(reader);
    }
}

/// <summary>
/// Customization of CharTokenizer for private use.
/// Splits the input on a configurable set of delimiter characters and emits
/// tokens that are lower-cased, KD-normalized, stripped of diacritics
/// (non-spacing marks) and stripped of a configurable set of "ignore" chars.
/// </summary>
public class CustomCharTokenizer : Lucene.Net.Analysis.Tokenizer
{
    // Variables used to process the input token stream.
    private int offset = 0;         // absolute position in the overall input (for token offsets)
    private int bufferIndex = 0;    // current read position inside ioBuffer
    private int dataLen = 0;        // chars returned by the last Read(); may be < IO_BUFFER_SIZE near EOS

    private const int MAX_WORD_LEN = 255;
    private const int IO_BUFFER_SIZE = 1024;

    // Accumulates the normalized chars of the token currently being built.
    private char[] buffer = new char[MAX_WORD_LEN];
    // Raw read buffer over the underlying TextReader.
    private char[] ioBuffer = new char[IO_BUFFER_SIZE];

    // Characters that delimit tokens; they are never part of a token.
    private System.Collections.Generic.List<Char> WordDelimitingChars =
        new System.Collections.Generic.List<Char>(new Char[] { ' ', '-' });
    // Characters silently dropped from tokens during normalization.
    private System.Collections.Generic.List<Char> IgnoreChars =
        new System.Collections.Generic.List<Char>(new Char[] { '(', ')', '"' });

    /// <summary>
    /// Constructor.
    /// </summary>
    /// <param name="input">Source of the characters to tokenize.</param>
    public CustomCharTokenizer(System.IO.TextReader input) : base(input)
    {
    }

    /// <summary>Returns true if a character should be included in a token. This
    /// tokenizer generates as tokens adjacent sequences of characters which
    /// satisfy this predicate. Characters for which this is false are used to
    /// define token boundaries and are not included in tokens.
    /// </summary>
    protected internal bool IsTokenChar(char c)
    {
        return !WordDelimitingChars.Contains(c);
    }

    /// <summary>
    /// Returns the base char(s) after normalization.
    /// Performed actions:
    /// 1. To lower case
    /// 2. Normalize (KD: may result in multiple chars)
    /// 3. Skip diacritics (+/- = non-spacing marks) (for more info see www.unicode.org)
    /// 4. Skip "chars to be ignored"
    /// </summary>
    /// <param name="c">Char to normalize.</param>
    /// <returns>Zero or more chars replacing <paramref name="c"/> in the token.</returns>
    public Char[] NormalizeChar(Char c)
    {
        // Invariant lower-casing: an index must not depend on the current
        // culture (e.g. the Turkish dotless-i would corrupt terms).
        c = Char.ToLowerInvariant(c);

        // Compatibility decomposition (KD) can expand one char into several,
        // e.g. 'é' -> 'e' + combining acute accent.
        StringBuilder resultBuilder = new StringBuilder(new string(c, 1).Normalize(NormalizationForm.FormKD));

        Int32 i = 0;
        while (i < resultBuilder.Length)
        {
            if (Char.GetUnicodeCategory(resultBuilder[i]) == System.Globalization.UnicodeCategory.NonSpacingMark)
            {
                // Remove diacritics (non-spacing marks left over from FormKD).
                resultBuilder.Remove(i, 1);
            }
            else if (IgnoreChars.Contains(resultBuilder[i]))
            {
                // Remove chars that should be ignored.
                resultBuilder.Remove(i, 1);
            }
            else
            {
                i++;
            }
        }
        return resultBuilder.ToString().ToCharArray();
    }

    /// <summary>Returns the next token in the stream, or null at EOS.</summary>
    public override Lucene.Net.Analysis.Token Next()
    {
        int sourceLength = 0;   // token length in the ORIGINAL stream (for offsets)
        int tokenLength = 0;    // token length AFTER normalization (may differ)
        int start = offset;

        while (true)
        {
            char c;

            // Advance position in the main token stream.
            offset++;

            // dataLen:
            //   = length of data from the last read operation.
            //     Can be smaller than ioBuffer.Length (e.g. not enough chars
            //     left to read at end of stream).
            // bufferIndex:
            //   = current position in ioBuffer.
            //
            // Read more data when the buffer has been fully consumed.
            if (bufferIndex >= dataLen)
            {
                // NOTE: the original cast ioBuffer to System.Char[] here; it
                // already is one, so the cast was dropped.
                dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
                bufferIndex = 0;
            }

            // Check if there is still data to be processed.
            // If the last read operation did not read any chars:
            //   a. break the loop and return whatever was buffered;
            //   b. if no data was buffered: exit and return null (EOS).
            if (dataLen <= 0)
            {
                if (sourceLength > 0)
                    break;
                else
                    return null;
            }

            // Get the next char from ioBuffer.
            c = ioBuffer[bufferIndex++];

            if (IsTokenChar(c))
            {
                // Start of token in the stream.
                if (sourceLength == 0)
                    start = offset - 1;
                // Length of token in the stream.
                sourceLength += 1;

                // One source char may normalize to zero, one or several chars.
                foreach (Char nc in NormalizeChar(c))
                {
                    // BUG FIX: the guard must run BEFORE each write. The
                    // original checked only after appending all normalized
                    // chars, so a char that expanded to multiple chars near
                    // the limit wrote past the end of 'buffer' (index out of
                    // range) before the overflow check was ever reached.
                    if (tokenLength >= MAX_WORD_LEN)
                        throw new Exception("Token exceeds maximum word length of " + MAX_WORD_LEN.ToString());
                    buffer[tokenLength] = nc;
                    tokenLength += 1;
                }
            }
            else if (sourceLength > 0)
            {
                // Hit a delimiter after collecting at least one token char.
                break;
            }
        }

        // Offsets refer to the original stream; the term text is the
        // normalized form, whose length may differ from sourceLength.
        return new Lucene.Net.Analysis.Token(new System.String(buffer, 0, tokenLength), start, start + sourceLength);
    }
}

You can test the result with something like this ...

private void button1_Click(object sender, EventArgs e)
{
    String input = "1-a (a) \" - \"àáâãäåæçèé-êëìíîïïðñ";
    CustomCharTokenizer tokz = new CustomCharTokenizer(new System.IO.StringReader(input));
    while (true)
    {
        Lucene.Net.Analysis.Token t = tokz.Next();
        if (t == null)
            break;
        MessageBox.Show(t.TermText());
    }
}

________________________________________ From: Karl Geppert [EMAIL PROTECTED] Sent: Friday, June 20, 2008 5:18 AM To: lucene-net-user@incubator.apache.org Subject: Query parser question Hi, Is there any easy way to make a string like methyloctane match 2-methyloctane in the index that was built with the standard anayzer? I can't see any obvious way to do this, except to modify the standard analyzer to store this as [2][methyloctane] Karl ________________________________________________________________________ This email has been scanned for all viruses by the MessageLabs SkyScan service. For more information on a proactive anti-virus service working around the clock, around the globe, visit http://www.messagelabs.com ________________________________________________________________________