According to Patrick:
> I've also noticed that if you searched for "finding my email", and
> "my" is in the "badwords" file, it will also be highlighted in the
> excerpt. May I ask if it is possible that you also append this
> into the patch?
>
> Currently, htdig/htsearch truly ignore words in the "badwords" file,
> but they still come up in the excerpt highlighted.
Thanks for the tip. Here's a revised patch that stops htsearch from
highlighting words in excerpts when those words are listed in the
bad_word_list file or are shorter than minimum_word_length, in
addition to skipping over punctuation when matching:
--- htdig-3.1.2.bak/htlib/StringMatch.h Wed Apr 21 21:47:58 1999
+++ htdig-3.1.2/htlib/StringMatch.h Mon Aug 23 15:38:31 1999
@@ -98,6 +98,12 @@ public:
void IgnoreCase();
//
+ // Build a local translation table which ignores all given punctuation
+ // characters
+ //
+ void IgnorePunct(char *punct = NULL);
+
+ //
// Determine if there is a pattern associated with this Match object.
//
int hasPattern() {return table[0] != 0;}
--- htdig-3.1.2.bak/htlib/StringMatch.cc Wed Apr 21 21:47:58 1999
+++ htdig-3.1.2/htlib/StringMatch.cc Mon Aug 23 16:40:14 1999
@@ -90,6 +90,8 @@ StringMatch::Pattern(char *pattern, char
table[i] = new int[n];
memset((unsigned char *) table[i], 0, n * sizeof(int));
}
+ for (i = 0; i < n; i++)
+ table[0][i] = i; // "no-op" states for null char, to be ignored
//
// Set up a standard case translation table if needed.
@@ -127,6 +129,11 @@ StringMatch::Pattern(char *pattern, char
#endif
chr = trans[(unsigned char)*pattern];
+ if (chr == 0)
+ {
+ pattern++;
+ continue;
+ }
if (chr == sep)
{
//
@@ -504,12 +511,39 @@ void StringMatch::TranslationTable(char
//
void StringMatch::IgnoreCase()
{
- if (local_alloc)
- delete [] trans;
- trans = new unsigned char[256];
+ if (!local_alloc || !trans)
+ {
+ trans = new unsigned char[256];
+ for (int i = 0; i < 256; i++)
+ trans[i] = (unsigned char)i;
+ local_alloc = 1;
+ }
for (int i = 0; i < 256; i++)
- trans[i] = tolower((unsigned char)i);
- local_alloc = 1;
+ if (isupper((unsigned char)i))
+ trans[i] = tolower((unsigned char)i);
+}
+
+
+//*****************************************************************************
+// void StringMatch::IgnorePunct(char *punct)
+// Set up the character translation table to ignore punctuation
+//
+void StringMatch::IgnorePunct(char *punct)
+{
+ if (!local_alloc || !trans)
+ {
+ trans = new unsigned char[256];
+ for (int i = 0; i < 256; i++)
+ trans[i] = (unsigned char)i;
+ local_alloc = 1;
+ }
+ if (punct)
+ for (int i = 0; punct[i]; i++)
+ trans[(unsigned char)punct[i]] = 0;
+ else
+ for (int i = 0; i < 256; i++)
+ if (HtIsWordChar(i) && !HtIsStrictWordChar(i))
+ trans[i] = 0;
}
--- htdig-3.1.2.bak/htsearch/htsearch.cc Wed Aug 18 16:40:30 1999
+++ htdig-3.1.2/htsearch/htsearch.cc Tue Aug 24 12:34:23 1999
@@ -222,9 +222,11 @@ main(int ac, char **av)
//
origPattern += logicalPattern;
searchWordsPattern.IgnoreCase();
- searchWordsPattern.Pattern(origPattern);
- if (debug > 2)
- cout << "Excerpt pattern: " << origPattern << "\n";
+ searchWordsPattern.IgnorePunct();
+ searchWordsPattern.Pattern(logicalPattern); // this should now be enough
+ //searchWordsPattern.Pattern(origPattern);
+ //if (debug > 2)
+ // cout << "Excerpt pattern: " << origPattern << "\n";
//
// If required keywords were given in the search form, we will
@@ -314,7 +316,8 @@ createLogicalWords(List &searchWords, St
}
else
wasHidden = 1;
- if (ww->weight > 0) // Ignore boolean syntax stuff
+ if (ww->weight > 0 // Ignore boolean syntax stuff
+ && !ww->isIgnore) // Ignore short or bad words
{
if (pattern.length())
pattern << '|';
--
Gilles R. Detillieux E-mail: <[EMAIL PROTECTED]>
Spinal Cord Research Centre WWW: http://www.scrc.umanitoba.ca/~grdetil
Dept. Physiology, U. of Manitoba Phone: (204)789-3766
Winnipeg, MB R3E 3J7 (Canada) Fax: (204)789-3930
------------------------------------
To unsubscribe from the htdig mailing list, send a message to
[EMAIL PROTECTED] containing the single word unsubscribe in
the SUBJECT of the message.