I posted this many moons ago to pgsql-hackers. 'Guess nobody noticed.
Tim
Josh Berkus wrote:
>Folks,
>
>For many of my programs, it would be extremely useful to have some form
>of "fuzzy matching" for VARCHAR fields. There are two kinds of fuzzy
>matching for words that I know of:
>
>1. Phonetic matching, which would be nice but will have to wait for
>someone's $100,000 project;
>
>2. Textual mathcing, which I will outline below.
>
>The way textual fuzzy matching should work is as follows:
>The developer supplies two VARCHARs to match and a number/percent of
>character mis-match that is acceptable:
>
>Fuzzy_match('Thornton','Tornton',1)
>
>And the fuzzy_match should return True if the two phrases are no more
>than that number of characters different. Thus, we should get:
>
>fuzzy_match('Thornton','Tornton',1) = TRUE
>fuzzy_match('Thornton','Torntin',1) = FALSE
>fuzzy_match('Thornton','Torntin',2) = TRUE
>
>Unfortunately, I cannot think of a way to make this happen in a function
>without cycling through all the possible permutations of characters for
>both words or doing some character-by-character comparison with
>elaborate logic for placement. Either of these approaches would be very
>slow, and completely unsuitable for column comparisons on large tables.
>
>Can anyone suggest some shortcuts here? Perhaps using pl/perl or
>something similar?
>
>Grazie!
>
>-Josh Berkus
>
>______AGLIO DATABASE SOLUTIONS___________________________
> Josh Berkus
> Complete information technology [EMAIL PROTECTED]
> and data management solutions (415) 565-7293
> for law firms, small businesses fax 621-2533
> and non-profit organizations. San Francisco
>
>
>------------------------------------------------------------------------
>
>
>
>------------------------------------------------------------------------
>
>
>
>------------------------------------------------------------------------
>
>
>
>------------------------------------------------------------------------
>
>
>---------------------------(end of broadcast)---------------------------
>TIP 2: you can get off all lists at once with the unregister command
> (send "unregister YourEmailAddressHere" to [EMAIL PROTECTED])
>
> Part 1.2
>
> Content-Type:
>
> text/plain
> Content-Encoding:
>
> base64
>
>
> ------------------------------------------------------------------------
> Part 1.3
>
> Content-Type:
>
> text/plain
> Content-Encoding:
>
> base64
>
>
> ------------------------------------------------------------------------
> Part 1.4
>
> Content-Type:
>
> text/plain
> Content-Encoding:
>
> base64
>
>
> ------------------------------------------------------------------------
> Part 1.5
>
> Content-Type:
>
> text/plain
> Content-Encoding:
>
> binary
>
>
--
Timothy H. Keitt
Department of Ecology and Evolution
State University of New York at Stony Brook
Stony Brook, New York 11794 USA
Phone: 631-632-1101, FAX: 631-632-7626
http://life.bio.sunysb.edu/ee/keitt/
/* Copyright (c) 2000 Timothy H. Keitt */
/* Licence: GPL version 2 or higher (see http://www.gnu.org/) */
#include "postgres.h"
#define STATIC_SIZE 32
/* This must be changed if STATIC_SIZE is changed */
static int4 static_array[STATIC_SIZE][STATIC_SIZE] = {
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
{1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{27, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{28, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{31, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0}};
/* Fast version for strings up to STATIC_SIZE characters */
int4
levenshtein_distance(text *s1, text *s2) {
register int i, j, l, m, n, add, rows, columns;
columns = VARSIZE(s1) - VARHDRSZ + 1;
rows = VARSIZE(s2) - VARHDRSZ + 1;
/* Use slower dynamically allocated version for larger strings */
if (columns > STATIC_SIZE || rows > STATIC_SIZE)
return levenshtein_distance_dynamic(s1, s2);
for (j=1; j<rows; ++j) {
for (i=1; i<columns; ++i) {
if (VARDATA(s1)[i-1] == VARDATA(s2)[j-1]) add=0; else add=1;
m = 1 + static_array[j-1][i];
l = 1 + static_array[j][i-1];
n = add + static_array[j-1][i-1];
static_array[j][i] = (m < l ? (m < n ? m : n): (l < n ? l : n));
} /* next column (i) */
} /* next row (j) */
return static_array[rows-1][columns-1];
} /* end levenshtein_distance(text *s1, text *s2) */
int4
levenshtein_distance_dynamic(text *s1, text *s2) {
register int i, j, l, m, n, add, rows, columns, out;
int4 *dynamic_array;
columns = VARSIZE(s1) - VARHDRSZ + 1;
rows = VARSIZE(s2) - VARHDRSZ + 1;
dynamic_array = (int4 *) palloc(rows * columns * sizeof(int4));
if (dynamic_array == NULL) return -1;
for (i=0; i<columns; ++i) dynamic_array[i] = i;
for (j=0; j<rows; ++j) dynamic_array[j*columns] = j;
for (j=1; j<rows; ++j) {
for (i=1; i<columns; ++i) {
if (VARDATA(s1)[i-1] == VARDATA(s2)[j-1]) add=0; else add=1;
m = 1 + dynamic_array[(j-1)*columns+i];
l = 1 + dynamic_array[j*columns+i-1];
n = add + dynamic_array[(j-1)*columns+i-1];
dynamic_array[j*columns+i] =
(m < l ? (m < n ? m : n): (l < n ? l : n));
} /* next column (i) */
} /* next row (j) */
out = dynamic_array[(rows-1)*columns+columns-1];
pfree(dynamic_array);
return out;
} /* end levenshtein_distance_dynamic(text *s1, text *s2) */
---------------------------(end of broadcast)---------------------------
TIP 4: Don't 'kill -9' the postmaster