Hi, folks.  Among the many changes I've committed to the 3.1.x source
tree is the one below.  I'm a bit nervous about it, because it's in
a pretty critical piece of code, but as far as I can tell it's doing
what it should.  I'd appreciate a few more eyeballs looking over it,
and perhaps also testing it, just to be sure.

In the process of adding support for img alt text, and fixing the meta
description parsing, I also uncovered what appear to be a few other
problems.

1) The handling of meta keywords and meta descriptions kept right on
going even if doindex was 0, so a noindex tag had no effect on these.

2) The handling of meta keywords and meta descriptions didn't
consider word offsets in the document - it used a relative offset of
1 for everything.  While not really a problem for the keywords tags,
it seemed wrong to me that it did that for meta descriptions too.
I changed the latter.

3) The relative offset calculation was done by dividing by the total
size of the document, before any stripping of comments, JavaScript, etc.
That meant that the more stuff got stripped out, the lower the offset
(and the higher the importance) of remaining words.  I've changed it to
use the size after stripping.

It appears that (2) and (3) are no longer an issue in 3.2, but (1)
still is.  Anyway, I'd appreciate some feedback on these fixes, as
well as the img alt handling and my HtWordtoken() function.  It all
seems to work, as far as I can tell, but I may have missed something.


Fri Dec  3 10:52:57 1999  Gilles Detillieux  <[EMAIL PROTECTED]>

        * htdig/HTML.cc(parse, do_tag): Add handling of <img alt=...> text,
        fix parsing of words in meta tags, disable indexing of meta tags
        when "noindex" state in effect, fix calculations of word positions
        to more accurately reflect relative positions.
        * htlib/HtWordType.h, htlib/HtWordType.cc: Add HtWordToken() function,
        to replace strtok() in HTML parser.

--- htdig-3.1.3/htdig/HTML.cc   Fri Nov 26 17:10:36 1999
+++ htdig-3.1.4-dev/htdig/HTML.cc       Fri Dec  3 10:05:49 1999
@@ -27,6 +27,8 @@ static StringMatch    attrs;
 static StringMatch     srcMatch;
 static StringMatch     hrefMatch;
 static StringMatch     keywordsMatch;
+static int             offset;
+static int             totlength;
 
 
 //*****************************************************************************
@@ -139,7 +141,6 @@ HTML::parse(Retriever &retriever, URL &b
     // We have some variables which will contain the various items we
     // are looking for
     //
-    int                        offset = 0;
     int                        in_space;
     int                        in_punct;
     unsigned char      *q, *start;
@@ -149,6 +150,7 @@ HTML::parse(Retriever &retriever, URL &b
     static char         *skip_start = config["noindex_start"];
     static char         *skip_end = config["noindex_end"];
 
+    offset = 0;
     title = 0;
     head = 0;
     meta_dsc = 0;
@@ -268,6 +270,7 @@ HTML::parse(Retriever &retriever, URL &b
         }
       }
       *ptext++ = '\0';
+      totlength = ptext - text;
 
       position = text;
       start = position;
@@ -388,7 +391,7 @@ HTML::parse(Retriever &retriever, URL &b
            if (word.length() >= minimumWordLength && doindex)
            {
              retriever.got_word(word,
-                                int(offset * 1000 / contents->length()),
+                                int(offset * 1000 / totlength),
                                 in_heading);
            }
        }
@@ -662,6 +665,33 @@ HTML::do_tag(Retriever &retriever, Strin
 
        case 18:        // "img"
        {
+           // Handle alt parameter
+           Configuration       conf;
+           conf.NameValueSeparators("=");
+           conf.Add(position+length);
+           if (conf["alt"])
+           {
+               char    *alttxt = transSGML(conf["alt"]);
+               if (doindex && in_title)
+                   title << alttxt << " ";
+               if (in_ref && description.length() < max_description_length)
+                   description << alttxt << " ";
+               if (doindex && !in_title && head.length() < max_head_length)
+                   head << alttxt << " ";
+               char    *w = HtWordToken(alttxt);
+               while (w && doindex)
+               {
+                   if (strlen(w) >= minimumWordLength)
+                     retriever.got_word(w,
+                                int((offset+(w-alttxt)) * 1000
+                                       / totlength),
+                                in_heading);
+                   w = HtWordToken(0);
+               }
+               w = '\0';
+           }
+
+           // Handle src parameter
            which = -1;
            int pos = attrs.FindFirstWord(position, which, length);
            if (pos < 0 || which != 0)
@@ -759,12 +789,12 @@ HTML::do_tag(Retriever &retriever, Strin
                char    *keywords = conf["htdig-keywords"];
                if (!keywords)
                    keywords = conf["keywords"];
-               char    *w = strtok(transSGML(keywords), " ,\t\r\n");
-               while (w)
+               char    *w = HtWordToken(transSGML(keywords));
+               while (w && doindex)
                {
                    if (strlen(w) >= minimumWordLength)
                      retriever.got_word(w, 1, 10);
-                   w = strtok(0, " ,\t\r\n");
+                   w = HtWordToken(0);
                }
                w = '\0';
            }
@@ -826,24 +856,28 @@ HTML::do_tag(Retriever &retriever, Strin
                   // (slot 11 is the new slot for this)
                   //
 
-                  char        *w = strtok(transSGML(conf["content"]), " \t\r\n");
-                   while (w)
+                  char        *words = HtWordToken(transSGML(conf["content"]));
+                  char        *w = words;
+                   while (w && doindex)
                     {
                        if (strlen(w) >= minimumWordLength)
-                         retriever.got_word(w, 1, 11);
-                       w = strtok(0, " \t\r\n");
+                         retriever.got_word(w,
+                                int((offset+(w-words)) * 1000
+                                       / totlength),
+                                11);
+                       w = HtWordToken(0);
                     }
                 w = '\0';
                }
 
                if (keywordsMatch.CompareWord(cache))
                {
-                   char        *w = strtok(transSGML(conf["content"]), " ,\t\r\n");
-                   while (w)
+                   char        *w = HtWordToken(transSGML(conf["content"]));
+                   while (w && doindex)
                    {
                        if (strlen(w) >= minimumWordLength)
                          retriever.got_word(w, 1, 10);
-                       w = strtok(0, " ,\t\r\n");
+                       w = HtWordToken(0);
                    }
                    w = '\0';
                }
--- htdig-3.1.3/htlib/HtWordType.h      Wed Sep 22 11:18:43 1999
+++ htdig-3.1.4-dev/htlib/HtWordType.h  Fri Dec  3 11:15:33 1999
@@ -76,5 +76,8 @@ HtStripPunctuation(String &s)
 {
   s.remove(HtWordType::statics.valid_punctuation);
 }
+
+// Like strtok(), but using our rules for word separation.
+char *HtWordToken(char *s);
 
 #endif /* __HtWordType_h */
--- htdig-3.1.3/htlib/HtWordType.cc     Wed Sep 22 11:18:43 1999
+++ htdig-3.1.4-dev/htlib/HtWordType.cc Fri Dec  3 11:15:57 1999
@@ -36,4 +36,30 @@ HtWordType::Initialize(Configuration &co
     if (strchr(valid_punct, i))
        HtWordType::statics.chrtypes[i] |= HtWt_ValidPunct;
   }
+}
+
+
+//  Much like strtok(): destructive of the source string, and it keeps its
+//  position in a static (pass 0 to continue), but splits by our word rules.
+char *
+HtWordToken(char *str)
+{
+    unsigned char              *text = (unsigned char *)str;
+    char                       *ret = 0;
+    static unsigned char       *prev = 0;
+
+    if (!text)
+       text = prev;
+    while (text && *text && !HtIsStrictWordChar(*text))
+       text++;
+    if (text && *text)
+    {
+       ret = (char *)text;
+       while (*text && HtIsWordChar(*text))
+           text++;
+       if (*text)
+           *text++ = '\0';
+    }
+    prev = text;
+    return ret;
 }


-- 
Gilles R. Detillieux              E-mail: <[EMAIL PROTECTED]>
Spinal Cord Research Centre       WWW:    http://www.scrc.umanitoba.ca/~grdetil
Dept. Physiology, U. of Manitoba  Phone:  (204)789-3766
Winnipeg, MB  R3E 3J7  (Canada)   Fax:    (204)789-3930

------------------------------------
To unsubscribe from the htdig3-dev mailing list, send a message to
[EMAIL PROTECTED] 
You will receive a message to confirm this. 

Reply via email to