According to Benjamin Smedberg:
> Having seen the argument about comment parsing both here and on the regular
> list, I think that there is a solution that will please everybody:
> 
> Once you've started a comment, check each dash: if it is followed by only
> dashes and whitespace up to the next right bracket, then end the comment.
> Otherwise, keep going.
> 
> This follows the standard and allows for quite a bit of non-standard coding
> as well.

Yes, this is more or less what we had come to agree upon.  I've finally
taken the time to implement it.  Here is the new code, first as a snippet,
so you can see what it does, and then as a patch to the 3.1.1 source, so you
can give it a try.  I'd appreciate it if everyone who participated in this
discussion, or who had problems with parsing comments in the past, would give
this a try.  It passed the few test cases I threw at it, but I'd like people
to hammer away at it for a while.

      // Skip an entire markup declaration, i.e. anything that begins with
      // "<!".  Two cases are handled: a comment declaration ("<!--" ... ">")
      // and any other declaration, which is skipped up to the next '>'.
      // On exit `position` points just past the declaration, or at a NUL
      // written into the buffer if the declaration is never closed.
      if (strncmp((char *)position, "<!", 2) == 0)
        {
          //
          // Possible comment declaration (but could be DTD declaration!)
          // A comment can contain other '<' and '>':
          // we have to ignore complete comment declarations
          // but of course also DTD declarations.
          //
          position += 2;        // Get past declaration start
          if (strncmp((char *)position, "--", 2) == 0)
            {
              // Found start of comment - now find the end
              position += 2;
              // Each pass looks for the next "--" and then checks whether
              // only dashes and whitespace separate it from a '>'.  If so,
              // the declaration ends there; otherwise the "--" was part of
              // the comment text (or opened another comment) and we search
              // again from the current position.
              do
                {
                  q = (unsigned char*)strstr((char *)position, "--");
                  if (!q)
                    {
                      // No closing "--" anywhere: write a NUL at the current
                      // position so that everything after it is discarded.
                      // NOTE(review): presumably the enclosing loop (not
                      // shown in this snippet) stops at the NUL - confirm.
                      *position = '\0';
                      break;    // Rest of document seems to be a comment...
                    }
                  else
                    {
                      // Step over the "--" that was just found
                      position = q + 2;
                      // Skip dashes after a badly formed comment
                      while (*position == '-')
                          position++;
                      // Skip whitespace after an individual comment
                      while (isspace(*position))
                          position++;
                    }
                  // if comment declaration hasn't ended, skip another comment
                }
              // Keep going until we hit end of buffer or the closing '>'
              while (*position && *position != '>');
              if (*position == '>')
                {
                  position++;   // End of comment declaration
                }
            }
          else
            {
              // Not a comment declaration after all
              // but possibly DTD: get to the end
              q = (unsigned char*)strstr((char *)position, ">");
              if (q)
                {
                  position = q + 1;
                  // End of (whatever) declaration
                }
              else
                {
                  // No '>' found: write a NUL here so the rest is discarded
                  *position = '\0'; // Rest of document is DTD?
                }
            }
          // Declaration consumed; go back to the enclosing loop, which is
          // outside this snippet.
          continue;
        }


Apply this patch to htdig/HTML.cc from ht://Dig 3.1.1 (or later).

--- htdig/HTML.cc.commentbug    Wed Feb 24 12:36:59 1999
+++ htdig/HTML.cc       Wed Mar 17 11:01:08 1999
@@ -140,54 +140,54 @@ HTML::parse(Retriever &retriever, URL &b
          //
          // Possible comment declaration (but could be DTD declaration!)
          // A comment can contain other '<' and '>':
-         // we have to ignore a complete comment declarations
+         // we have to ignore complete comment declarations
          // but of course also DTD declarations.
          //
          position += 2;        // Get past declaration start
-         while (*position)
+         if (strncmp((char *)position, "--", 2) == 0)
            {
-             // Let's see if the declaration ends here
-             if (*position == '>')
-               {
-                 position++;
-                 break;        // End of comment declaration
-               }
-             // Not the end of the declaration yet:
-             // we'll try to find an actual comment
-             if (strncmp((char *)position, "--", 2) == 0)
+             // Found start of comment - now find the end
+             position += 2;
+             do
                {
-                 // Found start of comment - now find the end
-                 position += 2;
                  q = (unsigned char*)strstr((char *)position, "--");
                  if (!q)
                    {
                      *position = '\0';
                      break;    // Rest of document seems to be a comment...
                    }
-                 position = q + 2;
-               }
-             else
-               {
-                 // Not a comment declaration after all
-                 // but possibly DTD: get to the end
-                 q = (unsigned char*)strstr((char *)position, ">");
-                 if (q)
-                   {
-                     position = q + 1;
-                     break;
-                     // End of (whatever) declaration
-                   }
                  else
                    {
-                     *position = '\0'; // Rest of document is DTD?
-                     break;
+                     position = q + 2;
+                     // Skip extra dashes after a badly formed comment
+                     while (*position == '-')
+                         position++;
+                     // Skip whitespace after an individual comment
+                     while (isspace(*position))
+                         position++;
                    }
-                 
+                 // if comment declaration hasn't ended, skip another comment
+               }
+             while (*position && *position != '>');
+             if (*position == '>')
+               {
+                 position++;   // End of comment declaration
+               }
+           }
+         else
+           {
+             // Not a comment declaration after all
+             // but possibly DTD: get to the end
+             q = (unsigned char*)strstr((char *)position, ">");
+             if (q)
+               {
+                 position = q + 1;
+                 // End of (whatever) declaration
+               }
+             else
+               {
+                 *position = '\0'; // Rest of document is DTD?
                }
-             
-             // Skip whitespace after an individual comment
-             while (isspace(*position))
-               position++;
            }
          continue;
        }

-- 
Gilles R. Detillieux              E-mail: <[EMAIL PROTECTED]>
Spinal Cord Research Centre       WWW:    http://www.scrc.umanitoba.ca/~grdetil
Dept. Physiology, U. of Manitoba  Phone:  (204)789-3766
Winnipeg, MB  R3E 3J7  (Canada)   Fax:    (204)789-3930
------------------------------------
To unsubscribe from the htdig3-dev mailing list, send a message to
[EMAIL PROTECTED] containing the single word "unsubscribe" in
the SUBJECT of the message.

Reply via email to