Here's a quickie that someone else might like to verify if they've 
run into the same problem.  When htdig encounters an entity that it 
doesn't know about (say &#146; - which should really be &#8217; but 
that's another issue) it copies it verbatim to the extract - so far
so good.  When the extract is sent out in Display::hilight, the
extract is decoded with HtSGMLCodec to transform the unsigned char
characters to entities, and, as well as the characters above 160, it
translates & to &amp;, which is fine except when the & is the start of 
an entity.  This is what leaves things like &amp;#146; in extracts.
Here's a patch to HtSGMLCodec::decode to make sure that it doesn't break
real entities.


======================================
diff -rup htdig/htcommon/HtSGMLCodec.cc htdig-patch2/htcommon/HtSGMLCodec.cc
--- htdig/htcommon/HtSGMLCodec.cc       Fri Oct 20 16:40:55 2000
+++ htdig-patch2/htcommon/HtSGMLCodec.cc        Tue Oct 16 15:37:05 2001
@@ -19,6 +19,8 @@
 
 #include "HtSGMLCodec.h"
 
+#include <ctype.h>
+
 // Constructor: parses the appropriate parameters using the
 // encapsulated HtWordCodec class.
 // Only used in privacy.
@@ -106,5 +108,92 @@ HtSGMLCodec::instance()
 
   return _instance;
 }
+
+
+// ***********************************************
+int
+HtSGMLCodec::IsEntity( const String &entity ) const
+{
+    // Returns 1 when `entity` is one complete entity reference, else 0.
+    // Accepted forms (the trailing semi-colon is mandatory by design):
+    //   &name;   one or more alphanumeric characters
+    //   &#ddd;   one or more decimal digits
+    //   &#xhh;   one or more hexadecimal digits ('x' or 'X')
+    const int len   = entity.length();
+    int is_decimal  = 0;
+    int is_hex      = 0;
+    int start       = 1;    // index of the first character to validate
+
+    // Failing ANY of these disqualifies the text, so they are or-ed; the
+    // length test comes first so entity[len-1] is never read when len is 0.
+    if ( len < 3 || entity[0] != '&' || entity[len-1] != ';' )
+        return 0;
+
+    if ( entity[1] == '#' )
+    {
+        if ( entity[2] == 'x' || entity[2] == 'X' ) {
+            is_hex = 1;
+            start = 3;
+        } else {
+            is_decimal = 1;
+            start = 2;
+        }
+    }
+
+    // At least one character must sit between the prefix and the ';'
+    // (rejects "&#;" and "&#x;").
+    if ( start >= len - 1 )
+        return 0;
+
+    for ( int i = start; i < len - 1; i++ )   // stop before the final ';'
+    {
+        // <ctype.h> wants an unsigned char value; plain char may be negative.
+        const unsigned char ch = static_cast<unsigned char>( entity[i] );
+        if ( is_hex ? !isxdigit( ch ) : is_decimal ? !isdigit( ch ) : !isalnum( ch ) )
+            return 0;
+    }
+    return 1;
+}
+
+
+// ***********************************************
+String HtSGMLCodec::decode(const String &coded) const
+{
+    // Decodes `coded` through myTextWordCodec, except that every span
+    // IsEntity() accepts is copied to the result untouched, so the '&' of
+    // a real entity is never itself rewritten.
+    String out;
+    int last_pos = 0;   // start of the text not yet written to `out`
+
+    for (;;)
+    {
+        const int amp_pos = coded.indexOf( '&', last_pos );
+        if ( amp_pos == -1 )
+            break;      // no '&' left
+        int semi_pos = coded.indexOf( ';', amp_pos+1 );
+        if ( semi_pos == -1 )
+            break;      // no ';' left, so no entity can follow
+        semi_pos++;     // jump over the semi-colon
+
+        if ( IsEntity( coded.sub( amp_pos, semi_pos - amp_pos ) ) )
+        {
+            // Decode the text before the entity, then copy the entity as is.
+            out << myTextWordCodec->decode( coded.sub( last_pos, amp_pos - last_pos ) );
+            out << coded.sub( amp_pos, semi_pos - amp_pos );
+            last_pos = semi_pos;
+        }
+        else
+        {
+            // Stray '&': decode up to and including it only, so that an
+            // entity starting at a later '&' before this ';' is still found.
+            out << myTextWordCodec->decode( coded.sub( last_pos, amp_pos + 1 - last_pos ) );
+            last_pos = amp_pos + 1;
+        }
+    }
+    // Whatever remains holds no entity; decode it in one go.
+    out << myTextWordCodec->decode( coded.sub( last_pos ) );
+    return out;
+}
+
 
 // End of HtSGMLCodec.cc

diff -rup htdig/htcommon/HtSGMLCodec.h htdig-patch2/htcommon/HtSGMLCodec.h
--- htdig/htcommon/HtSGMLCodec.h        Fri Oct 20 16:40:55 2000
+++ htdig-patch2/htcommon/HtSGMLCodec.h Tue Oct 16 15:33:08 2001
@@ -33,8 +33,8 @@ public:
   { return myTextWordCodec->encode(myNumWordCodec->encode(uncoded)); }
 
   // But we only want to decode into one form i.e. &foo; NOT &#nnn;
-  String decode(const String &coded) const
-  { return myTextWordCodec->decode(coded); }
+  // but we don't want to decode & if it's part of an entity.
+  String decode(const String &coded) const;
 
   // If an error was discovered during the parsing of
   // entities, this returns an error message
@@ -54,6 +54,9 @@ private:
   HtSGMLCodec();
   HtSGMLCodec(const HtSGMLCodec &);
   void operator= (const HtSGMLCodec &);
+
+  //! returns true if the parameter is an entity.
+  int IsEntity( const String &entity ) const;
 
   HtWordCodec *myTextWordCodec; // For &foo;
   HtWordCodec *myNumWordCodec; // For &#foo;


======================================



Jamie Anstice
Search Engineer
S.L.I. Systems
[EMAIL PROTECTED]
ph:  64 961 3262
mobile: 64 21 264 9347

_______________________________________________
htdig-general mailing list <[EMAIL PROTECTED]>
To unsubscribe, send a message to <[EMAIL PROTECTED]> with a 
subject of unsubscribe
FAQ: http://htdig.sourceforge.net/FAQ.html

Reply via email to