See if this helps ...
You can supply your own sets of "WordDelimitingChars" and "IgnoreChars".


    // needed for String, StringBuilder and NormalizationForm below
    using System;
    using System.Text;

    public class CustomAnalyzer : Lucene.Net.Analysis.Analyzer
    {
        public CustomAnalyzer()
            : base()
        { }

        public override Lucene.Net.Analysis.TokenStream TokenStream(String fieldName, System.IO.TextReader reader)
        {
            return new CustomCharTokenizer(reader);
        }
    }

    /// <summary>
    /// Customization of CharTokenizer for private use.
    /// </summary>
    public class CustomCharTokenizer : Lucene.Net.Analysis.Tokenizer
    {
        /// <summary>
        /// Variables used to process the input token stream
        /// </summary>
        private int offset = 0, bufferIndex = 0, dataLen = 0;
        private const int MAX_WORD_LEN = 255;
        private const int IO_BUFFER_SIZE = 1024;
        private char[] buffer = new char[MAX_WORD_LEN];
        private char[] ioBuffer = new char[IO_BUFFER_SIZE];
        private System.Collections.Generic.List<Char> WordDelimitingChars = new System.Collections.Generic.List<Char>(new Char[] { ' ', '-' });
        private System.Collections.Generic.List<Char> IgnoreChars = new System.Collections.Generic.List<Char>(new Char[] { '(', ')', '"' });

        /// <summary>
        /// Constructor
        /// </summary>
        /// <param name="input"></param>
        public CustomCharTokenizer(System.IO.TextReader input)
            : base(input)
        { }
        /// <summary>Returns true if a character should be included in a token.
        /// This tokenizer generates as tokens adjacent sequences of characters
        /// which satisfy this predicate. Characters for which this is false are
        /// used to define token boundaries and are not included in tokens.
        /// </summary>
        protected internal bool IsTokenChar(char c)
        {
            return !WordDelimitingChars.Contains(c);
        }
        /// <summary>
        /// Returns the base char(s) after normalization.
        /// Performed actions:
        ///     1. To lower case
        ///     2. Normalize (KD: may result in multiple chars)
        ///     3. Skip diacritics (i.e. non-spacing marks; for more info see www.unicode.org)
        ///     4. Skip "chars to be ignored"
        /// </summary>
        /// <param name="c">Char to normalize</param>
        /// <returns>The normalized char(s); may be empty</returns>
        public Char[] NormalizeChar(Char c)
        {
            // to lower case
            c = Char.ToLower(c);
            // normalize (KD)
            StringBuilder ResultBuilder = new StringBuilder(new string(c, 1).Normalize(NormalizationForm.FormKD));
            Int32 i = 0;
            while (i < ResultBuilder.Length)
            {
                // remove diacritics
                if (Char.GetUnicodeCategory(ResultBuilder[i]) == System.Globalization.UnicodeCategory.NonSpacingMark)
                    ResultBuilder.Remove(i, 1);
                // remove chars that should be ignored
                else if (IgnoreChars.Contains(ResultBuilder[i]))
                    ResultBuilder.Remove(i, 1);
                else
                    i++;
            }
            return ResultBuilder.ToString().ToCharArray();
        }
        /// <summary>Returns the next token in the stream, or null at EOS.</summary>
        public override Lucene.Net.Analysis.Token Next()
        {
            int sourceLength = 0;
            int tokenLength = 0;
            int start = offset;
            while (true)
            {
                char c;
                // advance position in main tokenstream
                offset++;
                // dataLen:
                //   = length of data from the last read operation;
                //     can be smaller than the length of ioBuffer (e.g. not enough chars left at end of stream)
                // bufferIndex:
                //   = current position in ioBuffer
                //
                // read data when necessary
                if (bufferIndex >= dataLen)
                {
                    dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
                    bufferIndex = 0;
                }
                // Check if there is still data to be processed.
                // if last read operation did not read any bytes:
                //     a. break loop and return whatever was buffered
                //     b. if no data was buffered: exit function and return null
                if (dataLen <= 0)
                {
                    if (sourceLength > 0)
                        break;
                    else
                        return null;
                }
                // get next char from ioBuffer
                c = ioBuffer[bufferIndex++];
                // normalize token chars
                if (IsTokenChar(c))
                {
                    // start of token in stream
                    if (sourceLength == 0)
                        start = offset - 1;
                    // length of token in stream
                    sourceLength += 1;
                    Char[] NormChars = NormalizeChar(c);
                    foreach (Char nc in NormChars)
                    {
                        // check before writing: NormalizeChar can return more than
                        // one char, so the buffer could otherwise overflow mid-loop
                        if (tokenLength >= MAX_WORD_LEN)
                            throw new Exception("Token exceeds maximum word length of " + MAX_WORD_LEN.ToString());
                        // actual length of token
                        buffer[tokenLength] = nc;
                        tokenLength += 1;
                    }
                }
                else
                    if (sourceLength > 0)
                        break;
            }
            // return normalized string as token
            return new Lucene.Net.Analysis.Token(new System.String(buffer, 0, tokenLength), start, start + sourceLength);
        }
    }
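
To answer the original question: use the same analyzer on both the indexing and the query side, so that both sides split "2-methyloctane" into [2] and [methyloctane]. Here is a rough, untested sketch of the wiring (assuming the Lucene.Net 2.x-era API; the demo method and the "name" field are made up for illustration):

    using Lucene.Net.Documents;
    using Lucene.Net.Index;
    using Lucene.Net.QueryParsers;
    using Lucene.Net.Search;
    using Lucene.Net.Store;

    public static void IndexAndSearchDemo()
    {
        CustomAnalyzer analyzer = new CustomAnalyzer();
        RAMDirectory dir = new RAMDirectory();

        // index one document; the custom analyzer splits "2-methyloctane"
        // into the tokens [2] and [methyloctane]
        IndexWriter writer = new IndexWriter(dir, analyzer, true);
        Document doc = new Document();
        doc.Add(new Field("name", "2-methyloctane", Field.Store.YES, Field.Index.TOKENIZED));
        writer.AddDocument(doc);
        writer.Close();

        // parse the query with the same analyzer, then search
        QueryParser parser = new QueryParser("name", analyzer);
        Query query = parser.Parse("methyloctane");
        IndexSearcher searcher = new IndexSearcher(dir);
        Hits hits = searcher.Search(query);
        // hits.Length() should now be 1: "methyloctane" matched "2-methyloctane"
        searcher.Close();
    }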


You can test the result with something like this ...

        private void button1_Click(object sender, EventArgs e)
        {
            String input = "1-a (a) \" - \"àáâãäåæçèé-êëìíîïïðñ";
            CustomCharTokenizer tokz = new CustomCharTokenizer(new System.IO.StringReader(input));
            while (true)
            {
                Lucene.Net.Analysis.Token t = tokz.Next();
                if (t == null)
                    break;
                MessageBox.Show(t.TermText());
            }
        }
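
For the sample input you should see one message box per token: spaces and hyphens act as boundaries (so "1-a" comes out as "1" and "a"), parentheses and quotes are dropped, and the accented characters are folded to their lower-case base letters. Note that a token consisting only of ignored characters (like the lone quote in the sample) comes back with empty text.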
________________________________________
From: Karl Geppert [EMAIL PROTECTED]
Sent: Friday, June 20, 2008 5:18 AM
To: lucene-net-user@incubator.apache.org
Subject: Query parser question

Hi,

Is there any easy way to make a string like methyloctane match
2-methyloctane in the index that was built with the standard analyzer?

I can't see any obvious way to do this, except to modify the standard
analyzer to store this as [2][methyloctane]

Karl

