These are some rough patches to reject duplicate documents in htdig
by using MD5 checksums; they seem to work. Any comments on them are
welcome. They still need a few things added: a configuration file
parameter, a compile-time test for the library, etc.

Note:
        They require libmhash. I have it on my Debian Linux system,
        but I have no idea how common this library is.

        You need to run automake and autoconf to get the compile to work

        The md5.cc file goes into the htdig directory

        The md5hash file gets written to the current directory.

-- 
 Toivo Pedaste                        Email:  [EMAIL PROTECTED]
 University Communications Services,  Phone:  +61 8 9 380 2605
 University of Western Australia      Fax:    +61 8 9 380 1109
"The time has come", the Walrus said, "to talk of many things"...
diff -u -r htdig3/htdig/Makefile.am htdig3.new/htdig/Makefile.am
--- htdig3/htdig/Makefile.am    Tue Mar 28 09:44:51 2000
+++ htdig3.new/htdig/Makefile.am        Sun Aug 20 22:38:22 2000
@@ -6,11 +6,11 @@
 htdig_SOURCES = Document.cc HTML.cc \
        Parsable.cc Plaintext.cc \
        Retriever.cc Server.cc ExternalTransport.cc \
-       URLRef.cc htdig.cc ExternalParser.cc 
+       URLRef.cc htdig.cc ExternalParser.cc md5.cc
 
 noinst_HEADERS = Document.h ExternalParser.h HTML.h \
        Parsable.h Plaintext.h Retriever.h Server.h  URLRef.h htdig.h \
        ExternalTransport.h
 htdig_DEPENDENCIES = $(HTLIBS)
-htdig_LDFLAGS = $(PROFILING)
+htdig_LDFLAGS = $(PROFILING) -lmhash
 htdig_LDADD = $(HTLIBS)
diff -u -r htdig3/htdig/Retriever.cc htdig3.new/htdig/Retriever.cc
--- htdig3/htdig/Retriever.cc   Wed Jun 14 09:49:53 2000
+++ htdig3.new/htdig/Retriever.cc       Sun Aug 20 22:37:45 2000
@@ -107,6 +107,12 @@
         unlink((char*)filelog);
     }
     
+    d_md5 = Database::getDatabaseInstance(DB_HASH);
+
+    if (d_md5->OpenReadWrite("md5hash", 0666) != OK) {
+      cerr << "DocumentDB::Open: " << "md5hash" << " " << strerror(errno) << "\n";
+    }
+
 }
 
 
@@ -115,6 +121,7 @@
 //
 Retriever::~Retriever()
 {
+    d_md5->Close();
     delete doc;
 }
 
@@ -559,10 +566,32 @@
     // Determine what to do by looking at the status code returned by
     // the Document retrieval process.
     //
+
+#define MD5_LENGTH 16
+    void md5(char *buf, int len, char * rhash);
+
+    String shash;
+    String sx;
+    char bhash[16];
+
     switch (status)
     {
+
        case Transport::Document_ok:
            trackWords = 1;
+
+           md5(doc->Contents(),doc->Length(),bhash);
+           shash.append(bhash,MD5_LENGTH);
+
+           d_md5->Get(shash,sx);
+           if (!sx.empty()) {
+             if (debug) {
+               cout << "DUP\n";
+             }
+             break;
+           }
+           d_md5->Put(shash,"x");
+
            if (old_document)
            {
              if (doc->ModTime() == ref->DocTime())
diff -u -r htdig3/htdig/Retriever.h htdig3.new/htdig/Retriever.h
--- htdig3/htdig/Retriever.h    Tue Mar 28 09:49:12 2000
+++ htdig3.new/htdig/Retriever.h        Sun Aug 20 22:43:24 2000
@@ -24,6 +24,7 @@
 #include "HtWordReference.h"
 #include "List.h"
 #include "StringList.h"
+#include "DocumentDB.h"
 
 class URL;
 class Document;
@@ -129,6 +130,8 @@
     // we reuse.
     //
     Document           *doc;
+
+    Database           *d_md5;
 
     String             notFound;
 
#include <mhash.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

extern int debug;
                    
#define MD5_LENGTH 16

void md5(char *buf, int len, char *rhash)
{

  int i;
  MHASH td;    
  char *hash;  

  if (MD5_LENGTH != mhash_get_block_size(MHASH_MD5)) {
    printf("Error MD5 Length\n");
  }
  td = mhash_init(MHASH_MD5);

  mhash(td, buf, len);
  hash = (char *)mhash_end(td);
  memcpy(rhash,hash,MD5_LENGTH);
             
  if (debug) {
    printf(" ");
    for (i = 0; i < MD5_LENGTH; i++) {
      printf("%.2x", hash[i]);
    }
    printf(" ");
  }
}  

------------------------------------
To unsubscribe from the htdig3-dev mailing list, send a message to
[EMAIL PROTECTED] 
You will receive a message to confirm this. 

Reply via email to