Hi,
I just finished my "accents" fuzzy class. I don't think I'll go
further at this point. It's only for French because the translation table
is hardcoded but it would be pretty easy to make it file-based I suppose.
Now that I have done it, I must agree with you: It is better this way than
to go the "case" way if only because it is easier to migrate from one
release to the other :-)! It is also easy to activate or deactivate
on a "per search" basis with the new build_select_list parameter.
So thanks again for your support, both you and Geoff.
My patches for release 3.1.5 are included at the end of the post. I have
not updated the documentation for now.
Patches for the "accents" fuzzy algorithm.
htcommon directory:
diff defaults.cc.orig defaults.cc
29a30
> {"accents_db", "${database_base}.accents.db"},
=========================
htfuzzy directory:
diff Fuzzy.cc.orig Fuzzy.cc
15a16
> #include "Accents.h"
173a175,176
> else if (mystrcasecmp(name, "accents") == 0)
> return new Accents();
=========================
diff htfuzzy.cc.orig htfuzzy.cc
45a46
> #include "Accents.h"
110a112,115
> else if (mystrcasecmp(av[i], "accents") == 0)
> {
> wordAlgorithms.Add(new Accents);
> }
239a245
> cout << "\taccents\n";
=========================
diff Makefile.in.orig Makefile.in
13c13
< Substring.o Prefix.o
---
> Substring.o Prefix.o Accents.o
17c17
< Substring.o Prefix.o
---
> Substring.o Prefix.o Accents.o
==========================
cat Accents.h
//
// Accents.h
//
// $Id: $
//
//
#ifndef _Accents_h_
#define _Accents_h_
#include "Fuzzy.h"
//
// Accents
//   Fuzzy algorithm registered under the name "accents".  Words are
//   grouped under a key produced by folding each byte through an
//   ISO-Latin-1 lower-casing / accent-stripping table.
//
class Accents : public Fuzzy
{
public:
    //
    // Construction/Destruction
    //
    Accents();
    virtual ~Accents();

    // Write the collected word lists to the "<name>_db" database.
    virtual int writeDB(Configuration &config);

    // Compute the folded key for `word` into `key`.
    virtual void generateKey(char *word, String &key);

    // File `word` under its folded key in the in-memory dictionary.
    virtual void addWord(char *word);
};
#endif
===================
cat Accents.cc
//
// Accents.cc
//
// Implementation of Accents
//
//
//
#if RELEASE
static char RCSid[] = "$Id: $";
#endif
#include "Configuration.h"
#include "htconfig.h"
#include "Accents.h"
#include "Dictionary.h"
#include <ctype.h>
#include <fstream.h>
extern int debug;
/*---------------------------------------------------------------.
| Added by Robert Marchand to allow proper handling of ISO-Latin |
| text (derived from code by Pierre Rosa)                        |
`---------------------------------------------------------------*/
/*--------------------------------------------------------------------.
| ISO-Latin-1 folding table: indexed by byte value (0..255), yields   |
| the lower-cased, accent-stripped byte.  Bytes that are not letters  |
| map to themselves.                                                  |
|                                                                     |
| Entries >= 0x80 are written as hex character literals so that the   |
| initializer is not a narrowing int -> char conversion on platforms  |
| where plain char is signed.  Callers must index with an unsigned    |
| byte value.                                                         |
`--------------------------------------------------------------------*/
static char MinusculeISOLAT1[256] = {
    // 0x00 - 0x3F: control characters, punctuation, digits (identity)
    0, 1, 2, 3, 4, 5, 6, 7,
    8, 9, 10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23,
    24, 25, 26, 27, 28, 29, 30, 31,
    32, 33, 34, 35, 36, 37, 38, 39,
    40, 41, 42, 43, 44, 45, 46, 47,
    48, 49, 50, 51, 52, 53, 54, 55,
    56, 57, 58, 59, 60, 61, 62, 63,
    // 0x40 - 0x5F: upper-case ASCII letters fold to lower case
    64, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
    'x', 'y', 'z', 91, 92, 93, 94, 95,
    // 0x60 - 0x7F: lower-case ASCII letters (identity)
    96, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
    'x', 'y', 'z', 123, 124, 125, 126, 127,
    // 0x80 - 0xBF: C1 controls and Latin-1 symbols (identity).
    // Slot 0xA9 previously held 168 (a typo); it now maps to itself.
    '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
    '\x88', '\x89', '\x8A', '\x8B', '\x8C', '\x8D', '\x8E', '\x8F',
    '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
    '\x98', '\x99', '\x9A', '\x9B', '\x9C', '\x9D', '\x9E', '\x9F',
    '\xA0', '\xA1', '\xA2', '\xA3', '\xA4', '\xA5', '\xA6', '\xA7',
    '\xA8', '\xA9', '\xAA', '\xAB', '\xAC', '\xAD', '\xAE', '\xAF',
    '\xB0', '\xB1', '\xB2', '\xB3', '\xB4', '\xB5', '\xB6', '\xB7',
    '\xB8', '\xB9', '\xBA', '\xBB', '\xBC', '\xBD', '\xBE', '\xBF',
    // 0xC0 - 0xDF: accented upper-case letters fold to the base letter.
    // 0xD7 is the multiplication sign, not a letter, so it is left as is.
    'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c',
    'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
    '\xD0', 'n', 'o', 'o', 'o', 'o', 'o', '\xD7',
    'o', 'u', 'u', 'u', 'u', 'y', '\xDE', '\xDF',
    // 0xE0 - 0xFF: accented lower-case letters fold to the base letter.
    // 0xF7 is the division sign (left as is); 0xFF is y-diaeresis -> 'y'.
    'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c',
    'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
    '\xF0', 'n', 'o', 'o', 'o', 'o', 'o', '\xF7',
    'o', 'u', 'u', 'u', 'u', 'y', '\xFE', 'y'};
//*****************************************************************************
// Accents::Accents()
//   Sets the algorithm name to "accents"; writeDB() appends "_db" to this
//   name to form the configuration key of its database file.
//
Accents::Accents()
{
    name = "accents";
}
//*****************************************************************************
// Accents::~Accents()
//   Nothing to release here.
//
Accents::~Accents()
{
    // intentionally empty
}
//*****************************************************************************
// int Accents::writeDB(Configuration &config)
//   Opens the database whose file name is the configuration value
//   "<name>_db" and stores every dictionary entry (key -> word list) in it.
//   Entries whose key compares equal, ignoring case, to the stored word
//   list are skipped.
//
//   Returns NOTOK when the database cannot be opened read/write, OK
//   otherwise.  Progress is reported on cout according to `debug`.
//
int
Accents::writeDB(Configuration &config)
{
    String var = name;
    var << "_db";
    String filename = config[var];

    index = Database::getDatabaseInstance();
    if (index->OpenReadWrite(filename, 0664) == NOTOK)
        return NOTOK;

    int count = 0;

    // The dictionary is only created by addWord(); when no word was ever
    // added there is nothing to iterate and dict must not be dereferenced.
    if (dict)
    {
        char *fuzzyKey;

        dict->Start_Get();
        while ((fuzzyKey = dict->Get_Next()))
        {
            String *s = (String *) dict->Find(fuzzyKey);

            // Only add if meaningful list
            if (mystrcasecmp(fuzzyKey, s->get()) == 0)
                continue;

            index->Put(fuzzyKey, *s);
            if (debug > 1)
            {
                cout << "htfuzzy: '" << fuzzyKey << "' ==> '" << s->get() << "'\n";
            }
            count++;
            if ((count % 100) == 0 && debug == 1)
            {
                cout << "htfuzzy: keys: " << count << '\n';
                cout.flush();
            }
        }
    }

    if (debug == 1)
    {
        cout << "htfuzzy:Total keys: " << count << "\n";
    }
    return OK;
}
//*****************************************************************************
// void Accents::generateKey(char *word, String &key)
//   Builds the fuzzy key for `word` by passing every byte of the word
//   through the MinusculeISOLAT1 folding table and appending the result
//   to `key`.
//
//   word: NUL-terminated word; when null or empty, `key` is left untouched.
//   key:  receives the generated key.
//
void
Accents::generateKey(char *word, String &key)
{
    if (!word || !*word)
        return;

    // NOTE(review): the key is seeded from the character '0'.  Whether this
    // leaves a literal "0" prefix or just resets the string depends on
    // String's conversions, which are not visible here -- confirm.
    key = '0';

    for (; *word; ++word)
    {
        // Plain char may be signed: bytes >= 0x80 (every accented letter)
        // would then be negative and index before the start of the table.
        // Go through unsigned char so the index is always in 0..255.
        key << MinusculeISOLAT1[static_cast<unsigned char>(*word)];
    }
}
//*****************************************************************************
// void Accents::addWord(char *word)
//   Files `word` under its folded key in the in-memory dictionary, which is
//   created on first use.  Words sharing a key are accumulated in one
//   space-separated String.
//
//   word: NUL-terminated word; null or empty words are ignored.
//
void
Accents::addWord(char *word)
{
    if (!dict)
    {
        dict = new Dictionary;
    }

    // generateKey() produces no key for a null or empty word; filing such a
    // word would also hand a null pointer to String.  Skip it instead.
    if (!word || !*word)
        return;

    String key;
    generateKey(word, key);

    String *s = (String *) dict->Find(key);
    if (s)
    {
        // Key already known: append to the existing list.
        // if (mystrcasestr(s->get(), word) != 0)
        (*s) << ' ' << word;
    }
    else
    {
        // First word for this key; the new String is handed to the dictionary.
        dict->Add(key, new String(word));
    }
}
==========================
-------
Robert Marchand tél: 343-6111 poste 5210
DiTER-SDI e-mail: [EMAIL PROTECTED]
Université de Montréal Montréal, Canada
------------------------------------
To unsubscribe from the htdig mailing list, send a message to
[EMAIL PROTECTED]
You will receive a message to confirm this.