New Version WordlistLoader.cs

Landrix SE - Sven Harazim Tue, 21 Oct 2008 05:13:15 -0700

I have updated WordlistLoader.cs to the current HeadRevision 

$Id: WordlistLoader.java 706342 2008-10-20 17:19:29Z gsingers $


which supports comments


Sven Harazim

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;

namespace Lucene.Net.Analysis
{
        
        /// <summary> Loader for text files that represent a list of stopwords.
        /// 
        /// 
        /// </summary>
  /// <version>  $Id: WordlistLoader.java 706342 2008-10-20 17:19:29Z gsingers $
        /// </version>
    public class WordlistLoader
    {

        /// <summary> Loads a text file and adds every line as an entry to a Hashtable
        /// (omitting leading and trailing whitespace). Every line of the file should
        /// contain only one word. The words need to be in lowercase if you make use of an
        /// Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
        /// The file is decoded with the system default encoding.
        /// </summary>
        /// <param name="wordfile">File containing the wordlist</param>
        /// <returns> A Hashtable in which every word of the file is stored as both key and value</returns>
        public static System.Collections.Hashtable GetWordSet(System.IO.FileInfo wordfile)
        {
            // The using block disposes the reader even when reading fails; the TextReader
            // overload also closes it, which is harmless because closing twice is a no-op.
            using (System.IO.TextReader reader = new System.IO.StreamReader(wordfile.FullName, System.Text.Encoding.Default))
            {
                return GetWordSet(reader);
            }
        }

        /// <summary> Loads a text file and adds every non-comment line as an entry to a
        /// Hashtable (omitting leading and trailing whitespace). Every line of the file
        /// should contain only one word. The words need to be in lowercase if you make use
        /// of an Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
        /// The file is decoded with the system default encoding.
        /// </summary>
        /// <param name="wordfile">File containing the wordlist</param>
        /// <param name="comment">The comment string to ignore; lines whose trimmed text starts with it are skipped</param>
        /// <returns> A Hashtable in which every word of the file is stored as both key and value</returns>
        public static System.Collections.Hashtable GetWordSet(System.IO.FileInfo wordfile, String comment)
        {
            // Same disposal strategy as the single-argument overload.
            using (System.IO.TextReader reader = new System.IO.StreamReader(wordfile.FullName, System.Text.Encoding.Default))
            {
                return GetWordSet(reader, comment);
            }
        }

        /// <summary> Reads lines from a reader and adds every line as an entry to a Hashtable
        /// (omitting leading and trailing whitespace). Every line of the reader should
        /// contain only one word. The words need to be in lowercase if you make use of an
        /// Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
        /// The reader is closed before this method returns.
        /// </summary>
        /// <param name="reader">Reader containing the wordlist</param>
        /// <returns> A Hashtable in which every word of the reader is stored as both key and value</returns>
        public static System.Collections.Hashtable GetWordSet(System.IO.TextReader reader)
        {
            System.Collections.Hashtable result = new System.Collections.Hashtable();
            try
            {
                System.String word;
                while ((word = reader.ReadLine()) != null)
                {
                    System.String tmp = word.Trim();
                    // Indexer assignment instead of Add(): Add() throws ArgumentException when
                    // the key already exists, so a list containing the same word (or two blank
                    // lines) twice used to fail. Set semantics require duplicates to be ignored.
                    result[tmp] = tmp;
                }
            }
            finally
            {
                if (reader != null)
                {
                    reader.Close();
                }
            }
            return result;
        }

        /// <summary> Reads lines from a reader and adds every non-comment line as an entry to
        /// a Hashtable (omitting leading and trailing whitespace). Every line of the reader
        /// should contain only one word. The words need to be in lowercase if you make use
        /// of an Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
        /// The reader is closed before this method returns.
        /// </summary>
        /// <param name="reader">Reader containing the wordlist</param>
        /// <param name="comment">The string representing a comment; lines whose trimmed text starts with it are skipped</param>
        /// <returns> A Hashtable in which every non-comment word of the reader is stored as both key and value</returns>
        public static System.Collections.Hashtable GetWordSet(System.IO.TextReader reader, String comment)
        {
            System.Collections.Hashtable result = new System.Collections.Hashtable();
            try
            {
                System.String word;
                while ((word = reader.ReadLine()) != null)
                {
                    System.String tmp = word.Trim();
                    // Ordinal comparison: the comment marker is a literal prefix, so the test
                    // must not vary with the current culture.
                    if (!tmp.StartsWith(comment, System.StringComparison.Ordinal))
                    {
                        // Indexer assignment so that duplicate words do not throw (see the
                        // behaviour of Hashtable.Add on an existing key).
                        result[tmp] = tmp;
                    }
                }
            }
            finally
            {
                if (reader != null)
                {
                    reader.Close();
                }
            }
            return result;
        }

        /// <summary> Reads a stem dictionary. Each line contains:
        /// <pre>word<b>\t</b>stem</pre>
        /// (i.e. two tab separated words). Only the first tab splits the line; anything
        /// after it, including further tabs, becomes the stem. Lines that contain no tab
        /// are skipped. When a word occurs more than once the last entry wins.
        /// The file is decoded with the system default encoding.
        /// </summary>
        /// <param name="wordstemfile">File containing the word/stem pairs</param>
        /// <returns> stem dictionary that overrules the stemming algorithm</returns>
        /// <exception cref="System.NullReferenceException">wordstemfile is null</exception>
        /// <exception cref="System.IO.IOException">the file cannot be opened or read</exception>
        public static System.Collections.Hashtable GetStemDict(System.IO.FileInfo wordstemfile)
        {
            // The exception type is kept as-is so that existing catch clauses keep working.
            if (wordstemfile == null)
            {
                throw new System.NullReferenceException("wordstemfile may not be null");
            }

            System.Collections.Hashtable result = new System.Collections.Hashtable();
            // A single reader is sufficient; it owns and closes the underlying file stream.
            using (System.IO.StreamReader br = new System.IO.StreamReader(wordstemfile.FullName, System.Text.Encoding.Default))
            {
                System.String line;
                char[] tab = {'\t'};
                while ((line = br.ReadLine()) != null)
                {
                    System.String[] wordstem = line.Split(tab, 2);
                    // A line without a tab yields a single element; indexing [1] would throw
                    // IndexOutOfRangeException, so such lines (e.g. blank lines) are ignored.
                    if (wordstem.Length < 2)
                    {
                        continue;
                    }
                    result[wordstem[0]] = wordstem[1];
                }
            }
            return result;
        }
    }
}

New Version WordlistLoader.cs

Reply via email to