These are some rough patches to reject duplicates in htdig
by using MD5 checksums; they seem to work. Any comments on them are welcome.
They still need a few things added, such as a configuration file parameter,
a compile test, etc.
Note:
They require libmhash. I have it on my Debian Linux system,
but I have no idea how common this library is.
You need to run automake and autoconf to get the compile to work.
The md5.cc file goes into the htdig directory.
The md5hash file gets written to the current directory.
--
Toivo Pedaste Email: [EMAIL PROTECTED]
University Communications Services, Phone: +61 8 9 380 2605
University of Western Australia Fax: +61 8 9 380 1109
"The time has come", the Walrus said, "to talk of many things"...
diff -u -r htdig3/htdig/Makefile.am htdig3.new/htdig/Makefile.am
--- htdig3/htdig/Makefile.am Tue Mar 28 09:44:51 2000
+++ htdig3.new/htdig/Makefile.am Sun Aug 20 22:38:22 2000
@@ -6,11 +6,11 @@
htdig_SOURCES = Document.cc HTML.cc \
Parsable.cc Plaintext.cc \
Retriever.cc Server.cc ExternalTransport.cc \
- URLRef.cc htdig.cc ExternalParser.cc
+ URLRef.cc htdig.cc ExternalParser.cc md5.cc
noinst_HEADERS = Document.h ExternalParser.h HTML.h \
Parsable.h Plaintext.h Retriever.h Server.h URLRef.h htdig.h \
ExternalTransport.h
htdig_DEPENDENCIES = $(HTLIBS)
-htdig_LDFLAGS = $(PROFILING)
+htdig_LDFLAGS = $(PROFILING) -lmhash
htdig_LDADD = $(HTLIBS)
diff -u -r htdig3/htdig/Retriever.cc htdig3.new/htdig/Retriever.cc
--- htdig3/htdig/Retriever.cc Wed Jun 14 09:49:53 2000
+++ htdig3.new/htdig/Retriever.cc Sun Aug 20 22:37:45 2000
@@ -107,6 +107,12 @@
unlink((char*)filelog);
}
+ d_md5 = Database::getDatabaseInstance(DB_HASH);
+
+ if (d_md5->OpenReadWrite("md5hash", 0666) != OK) {
+ cerr << "DocumentDB::Open: " << "md5hash" << " " << strerror(errno) << "\n";
+ }
+
}
@@ -115,6 +121,7 @@
//
Retriever::~Retriever()
{
+ d_md5->Close();
delete doc;
}
@@ -559,10 +566,32 @@
// Determine what to do by looking at the status code returned by
// the Document retrieval process.
//
+
+#define MD5_LENGTH 16
+ void md5(char *buf, int len, char * rhash);
+
+ String shash;
+ String sx;
+ char bhash[16];
+
switch (status)
{
+
case Transport::Document_ok:
trackWords = 1;
+
+ md5(doc->Contents(),doc->Length(),bhash);
+ shash.append(bhash,MD5_LENGTH);
+
+ d_md5->Get(shash,sx);
+ if (!sx.empty()) {
+ if (debug) {
+ cout << "DUP\n";
+ }
+ break;
+ }
+ d_md5->Put(shash,"x");
+
if (old_document)
{
if (doc->ModTime() == ref->DocTime())
diff -u -r htdig3/htdig/Retriever.h htdig3.new/htdig/Retriever.h
--- htdig3/htdig/Retriever.h Tue Mar 28 09:49:12 2000
+++ htdig3.new/htdig/Retriever.h Sun Aug 20 22:43:24 2000
@@ -24,6 +24,7 @@
#include "HtWordReference.h"
#include "List.h"
#include "StringList.h"
+#include "DocumentDB.h"
class URL;
class Document;
@@ -129,6 +130,8 @@
// we reuse.
//
Document *doc;
+
+ Database *d_md5;
String notFound;
#include <mhash.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
extern int debug;
#define MD5_LENGTH 16
//
// md5 -- compute the MD5 digest of a memory buffer using libmhash.
//
//   buf   : data to hash
//   len   : number of bytes of buf to hash
//   rhash : caller-supplied output buffer; receives MD5_LENGTH raw
//           digest bytes (binary, not NUL-terminated)
//
// When the global `debug` flag is non-zero the digest is also written
// to stdout as lowercase hex, surrounded by single spaces.
//
void md5(char *buf, int len, char *rhash)
{
    int i;
    MHASH td;
    unsigned char *hash;

    // Sanity check: the size libmhash reports for MD5 should match the
    // fixed MD5_LENGTH that callers size their buffers with.
    if (MD5_LENGTH != mhash_get_block_size(MHASH_MD5)) {
        printf("Error MD5 Length\n");
    }

    td = mhash_init(MHASH_MD5);
    mhash(td, buf, len);

    // mhash_end() finishes the computation and hands back a buffer it
    // allocated with malloc(); ownership passes to us.
    hash = (unsigned char *)mhash_end(td);
    memcpy(rhash, hash, MD5_LENGTH);

    if (debug) {
        printf(" ");
        for (i = 0; i < MD5_LENGTH; i++) {
            // Bytes are read as unsigned char: a plain (possibly signed)
            // char >= 0x80 would be sign-extended when promoted to int
            // and print as "ffffffXX" instead of two hex digits.
            printf("%.2x", hash[i]);
        }
        printf(" ");
    }

    // Release the digest buffer; without this every hashed document
    // leaks one allocation.
    free(hash);
}
------------------------------------
To unsubscribe from the htdig3-dev mailing list, send a message to
[EMAIL PROTECTED]
You will receive a message to confirm this.