I intended to sit on this patch until after the release, but
I don't see the point in keeping this to myself until then,
at least for purposes of review. And just because I post it here
and now does not make it necessary to put it into the release.
Using straightforward approaches it makes db.words.db (slightly
less than) half the size (compared to just previously defining
NO_WORD_COUNT). By the way, it probably matters that my example
database has less than 64K documents - you may see less savings,
down to (wildly guessing) maybe 20% with significantly larger
databases and using the default #undef of NO_WORD_COUNT. I did
not test this with #undef NO_WORD_COUNT, but will do so ASAP.
There is also a backward-compatibility problem; you cannot drop
in a new htsearch with an old db.words.db without doing a
htmerge using the new htmerge. On the other hand, other recent
changes such as not lowercasing the urls in the database also
lose backward compatibility, so that's moot by now. (I'm not
saying that change wasn't the Right Thing; I think it was).
If Geoff so pleases, I can commit this. Or he can just sneeze
at this clumsy attempt to avert priorities for the release. ;-)
Or whatever.
Sat Jan 30 16:40:38 1999 Hans-Peter Nilsson <[EMAIL PROTECTED]>
* htmerge/words.cc (mergeWords): Pack WordRecords in db.
* htsearch/parser.cc (perform_push): Unpack WordRecords from db.
* htlib/HtPack.cc: New file.
* htlib/HtPack.h: New file.
* htlib/Makefile.in (OBJS): Add corresponding *.o files.
Index: htlib/Makefile.in
===================================================================
RCS file: /opt/htdig/cvs/htdig3/htlib/Makefile.in,v
retrieving revision 1.12
diff -p -c -c -3 -p -b -r1.12 Makefile.in
*** htlib/Makefile.in 1999/01/21 13:42:32 1.12
--- htlib/Makefile.in 1999/01/30 19:05:33
*************** OBJS= Configuration.o Connection.o Datab
*** 15,21 ****
URL.o URLTrans.o cgi.o \
good_strtok.o io.o strcasecmp.o \
strptime.o mytimegm.o HtCodec.o HtWordCodec.o \
! HtURLCodec.o
TARGET= libht.a
--- 15,21 ----
URL.o URLTrans.o cgi.o \
good_strtok.o io.o strcasecmp.o \
strptime.o mytimegm.o HtCodec.o HtWordCodec.o \
! HtURLCodec.o HtPack.o
TARGET= libht.a
Index: htmerge/words.cc
===================================================================
RCS file: /opt/htdig/cvs/htdig3/htmerge/words.cc,v
retrieving revision 1.10
diff -p -c -c -3 -p -b -r1.10 words.cc
*** htmerge/words.cc 1999/01/25 04:55:54 1.10
--- htmerge/words.cc 1999/01/30 19:06:16
*************** static char RCSid[] = "$Id: words.cc,v 1
*** 43,48 ****
--- 43,49 ----
#endif
#include "htmerge.h"
+ #include "HtPack.h"
//*****************************************************************************
*************** mergeWords(char *wordtmp, char *wordfile
*** 63,68 ****
--- 64,70 ----
int word_count = 0;
WordRecord wr, last_wr;
String last_word;
+ String compressed_data;
//
// Check for file access errors
*************** mergeWords(char *wordtmp, char *wordfile
*** 239,251 ****
// going to use (shorts and ints)
//
if (currentWord.length() == 0)
{
//
// First word. Special case.
//
out = 0;
! out.append((char *) &last_wr, sizeof(last_wr));
currentWord = last_word;
}
else if (strcmp(last_word, currentWord) == 0)
--- 241,261 ----
// going to use (shorts and ints)
//
+ // Or rather, a compressed form thereof.
+ compressed_data = htPack(
+ #ifdef NO_WORD_COUNT
+ "i4"
+ #else
+ "i5"
+ #endif
+ , (char *) &last_wr);
if (currentWord.length() == 0)
{
//
// First word. Special case.
//
out = 0;
! out.append(compressed_data);
currentWord = last_word;
}
else if (strcmp(last_word, currentWord) == 0)
*************** mergeWords(char *wordtmp, char *wordfile
*** 253,259 ****
//
// Add to current record
//
! out.append((char *) &last_wr, sizeof(last_wr));
}
else
{
--- 263,269 ----
//
// Add to current record
//
! out.append(compressed_data);
}
else
{
*************** mergeWords(char *wordtmp, char *wordfile
*** 265,271 ****
currentWord = last_word;
out = 0;
! out.append((char *) &last_wr, sizeof(last_wr));
word_count++;
if (verbose && word_count == 1)
{
--- 275,281 ----
currentWord = last_word;
out = 0;
! out.append(compressed_data);
word_count++;
if (verbose && word_count == 1)
{
*************** mergeWords(char *wordtmp, char *wordfile
*** 315,327 ****
}
putc('\n', wordlist);
if (currentWord.length() == 0)
{
//
// First word. Special case.
//
out = 0;
! out.append((char *) &last_wr, sizeof(last_wr));
currentWord = last_word;
}
else if (strcmp(last_word, currentWord) == 0)
--- 325,344 ----
}
putc('\n', wordlist);
+ compressed_data = htPack(
+ #ifdef NO_WORD_COUNT
+ "i4"
+ #else
+ "i5"
+ #endif
+ , (char *) &last_wr);
if (currentWord.length() == 0)
{
//
// First word. Special case.
//
out = 0;
! out.append(compressed_data);
currentWord = last_word;
}
else if (strcmp(last_word, currentWord) == 0)
*************** mergeWords(char *wordtmp, char *wordfile
*** 329,335 ****
//
// Add to current record
//
! out.append((char *) &last_wr, sizeof(last_wr));
}
else
{
--- 346,352 ----
//
// Add to current record
//
! out.append(compressed_data);
}
else
{
*************** mergeWords(char *wordtmp, char *wordfile
*** 341,347 ****
currentWord = last_word;
out = 0;
! out.append((char *) &last_wr, sizeof(last_wr));
word_count++;
if (verbose && word_count == 1)
{
--- 358,364 ----
currentWord = last_word;
out = 0;
! out.append(compressed_data);
word_count++;
if (verbose && word_count == 1)
{
Index: htsearch/parser.cc
===================================================================
RCS file: /opt/htdig/cvs/htdig3/htsearch/parser.cc,v
retrieving revision 1.6
diff -p -c -c -3 -p -b -r1.6 parser.cc
*** htsearch/parser.cc 1998/12/06 18:45:10 1.6
--- htsearch/parser.cc 1999/01/30 19:07:06
*************** static char RCSid[] = "$Id: parser.cc,v
*** 32,37 ****
--- 32,38 ----
#endif
#include "parser.h"
+ #include "HtPack.h"
#define WORD 1000
#define DONE 1001
*************** Parser::perform_push()
*** 192,197 ****
--- 193,199 ----
{
String temp = current->word.get();
String data;
+ String decompressed;
char *p;
ResultList *list = new ResultList;
WordRecord wr;
*************** Parser::perform_push()
*** 213,223 ****
if (dbf->Get(p, data) == OK)
{
p = data.get();
! for (unsigned int i = 0; i < data.length() / sizeof(WordRecord); i++)
{
! p = data.get() + i * sizeof(WordRecord);
! memcpy((char *) &wr, p, sizeof(WordRecord));
//
// ******* Compute the score for the document
//
--- 215,230 ----
if (dbf->Get(p, data) == OK)
{
p = data.get();
! char *p_end = p + data.length();
! while (p < p_end)
{
! decompressed = htUnpack(
! #ifdef NO_WORD_COUNT
! "i4"
! #else
! "i5"
! #endif
! , p);
//
// ******* Compute the score for the document
brgds, H-P
--
Hans-Peter Nilsson, Axis Communications AB, S - 223 70 LUND, SWEDEN
[EMAIL PROTECTED] | Tel +46 462701867,2701800
Fax +46 46136130 | RFC 1855 compliance implemented; report loss of brain.
------------------------------------
To unsubscribe from the htdig3-dev mailing list, send a message to
[EMAIL PROTECTED] containing the single word "unsubscribe" in
the SUBJECT of the message.