According to Patrick:
> I've also noticed that if you searched for "finding my email", and
> "my" is in the "badwords" file, it will also be highlighted in the
> excerpt.  May I ask if it is possible that you also append this
> into the patch?  
> 
> Currently, htdig/htsearch truly ignore words in the "badwords" file,
> but they still come up in the excerpt highlighted.

Thanks for the tip.  Here's a revised patch that stops htsearch from
highlighting, in excerpts, any words that are in the bad_word_list file
or that are shorter than minimum_word_length, in addition to skipping
over punctuation when matching:

--- htdig-3.1.2.bak/htlib/StringMatch.h Wed Apr 21 21:47:58 1999
+++ htdig-3.1.2/htlib/StringMatch.h     Mon Aug 23 15:38:31 1999
@@ -98,6 +98,12 @@ public:
     void               IgnoreCase();
 
     //
+    // Build a local translation table which ignores all given punctuation
+    // characters
+    //
+    void               IgnorePunct(char *punct = NULL);
+
+    //
     // Determine if there is a pattern associated with this Match object.
     //
     int                        hasPattern()            {return table[0] != 0;}
--- htdig-3.1.2.bak/htlib/StringMatch.cc        Wed Apr 21 21:47:58 1999
+++ htdig-3.1.2/htlib/StringMatch.cc    Mon Aug 23 16:40:14 1999
@@ -90,6 +90,8 @@ StringMatch::Pattern(char *pattern, char
        table[i] = new int[n];
        memset((unsigned char *) table[i], 0, n * sizeof(int));
     }
+    for (i = 0; i < n; i++)
+       table[0][i] = i;        // "no-op" states for null char, to be ignored
 
     //
     // Set up a standard case translation table if needed.
@@ -127,6 +129,11 @@ StringMatch::Pattern(char *pattern, char
 #endif
 
        chr = trans[(unsigned char)*pattern];
+       if (chr == 0)
+       {
+           pattern++;
+           continue;
+       }
        if (chr == sep)
        {
            //
@@ -504,12 +511,39 @@ void StringMatch::TranslationTable(char 
 //
 void StringMatch::IgnoreCase()
 {
-    if (local_alloc)
-        delete [] trans;
-    trans = new unsigned char[256];
+    if (!local_alloc || !trans)
+    {
+       trans = new unsigned char[256];
+       for (int i = 0; i < 256; i++)
+           trans[i] = (unsigned char)i;
+       local_alloc = 1;
+    }
     for (int i = 0; i < 256; i++)
-       trans[i] = tolower((unsigned char)i);
-    local_alloc = 1;
+       if (isupper((unsigned char)i))
+           trans[i] = tolower((unsigned char)i);
+}
+
+
+//*****************************************************************************
+// void StringMatch::IgnorePunct(char *punct)
+//   Set up the character translation table to ignore punctuation
+//
+void StringMatch::IgnorePunct(char *punct)
+{
+    if (!local_alloc || !trans)
+    {
+       trans = new unsigned char[256];
+       for (int i = 0; i < 256; i++)
+           trans[i] = (unsigned char)i;
+       local_alloc = 1;
+    }
+    if (punct)
+       for (int i = 0; punct[i]; i++)
+           trans[(unsigned char)punct[i]] = 0;
+    else
+       for (int i = 0; i < 256; i++)
+           if (HtIsWordChar(i) && !HtIsStrictWordChar(i))
+               trans[i] = 0;
 }
 
 
--- htdig-3.1.2.bak/htsearch/htsearch.cc        Wed Aug 18 16:40:30 1999
+++ htdig-3.1.2/htsearch/htsearch.cc    Tue Aug 24 12:34:23 1999
@@ -222,9 +222,11 @@ main(int ac, char **av)
     //
     origPattern += logicalPattern;
     searchWordsPattern.IgnoreCase();
-    searchWordsPattern.Pattern(origPattern);
-    if (debug > 2)
-      cout << "Excerpt pattern: " << origPattern << "\n";
+    searchWordsPattern.IgnorePunct();
+    searchWordsPattern.Pattern(logicalPattern);        // this should now be enough
+    //searchWordsPattern.Pattern(origPattern);
+    //if (debug > 2)
+    //  cout << "Excerpt pattern: " << origPattern << "\n";
 
     //
     // If required keywords were given in the search form, we will
@@ -314,7 +316,8 @@ createLogicalWords(List &searchWords, St
        }
        else
            wasHidden = 1;
-       if (ww->weight > 0)                     // Ignore boolean syntax stuff
+       if (ww->weight > 0                      // Ignore boolean syntax stuff
+           && !ww->isIgnore)                   // Ignore short or bad words
        {
            if (pattern.length())
                pattern << '|';


-- 
Gilles R. Detillieux              E-mail: <[EMAIL PROTECTED]>
Spinal Cord Research Centre       WWW:    http://www.scrc.umanitoba.ca/~grdetil
Dept. Physiology, U. of Manitoba  Phone:  (204)789-3766
Winnipeg, MB  R3E 3J7  (Canada)   Fax:    (204)789-3930

------------------------------------
To unsubscribe from the htdig mailing list, send a message to
[EMAIL PROTECTED] containing the single word unsubscribe in
the SUBJECT of the message.

Reply via email to