-----BEGIN PGP SIGNED MESSAGE----- Hash: SHA1 On 06/14/2013 12:08 PM, Liming Hu wrote: >>>>>>> I have implemented the code according to Joe's >>>>>>> suggestion, and put the code at: >>>>>>> https://github.com/liminghu/fuzzystrmatch/tree/fuzzystrmatchv1.1 >>>>>>> > >>>>>>> Please submit a proper patch so it can be seen on our mailing list >>>>>> archives. >>>>>> >>>>> Hi Alvaro, >>>>> >>>>> I am kind of new to the Postgresql hacker community, Can >>>>> you please help me on submit the patch?
I have not done much in the way of real review yet, but here at least is a context diff against git master. Joe - -- Joe Conway credativ LLC: http://www.credativ.us Linux, PostgreSQL, and general Open Source Training, Service, Consulting, & 24x7 Support -----BEGIN PGP SIGNATURE----- Version: GnuPG v1.4.12 (GNU/Linux) Comment: Using GnuPG with undefined - http://www.enigmail.net/ iQIcBAEBAgAGBQJRyOw2AAoJEDfy90M199hlFhEP/2o/d08Fq4CHA9iI05PxDwQM AHF6PWS5gJToNLtiTevyEamWyzlULjX3yJ9gTAhxc+ltxE9Xuf3/bfMcJQSbJ5c9 6GSH7CWpaY1DpfAwvYiwTJ9EPBZ11u8VZaMrb1VU2DS8rioOOL3llzpk+D6/1no5 y327vlH1aOuqk3Hwu0c/WN8RAcf1HLbhafph2KruUfr/ba3uILBQZtzpyK4uCDew OJA+cktXHv3ho7w4N//xVQs3sZ/EoLahOt/y4syd3dN+Cq/8kj/uJaVgT2QY8rDQ HIBpYvm+b3DsYpjw2qrSVBriIupF2a0UPkLfC5cu3om8VvBilShydE6uTI+f+55p a12NuzepwgDnpZ1jOZkkmnBslpfoZI9TEo1UDeZ8x/TSZDW72FhwRtWq9kO9CFIK cJvG9B9E5zhyZx9V1C2S0qpvIJAj/Gns4ymvYU1lm5UUnpPSpTQoUK8ybrfnHoTE B0DEVjqxbrk9SSJ5LI3KojAaSMUIQDcjMQFCsghR1s5/yRhpZ7xEPvcL63ARwDcv ZeeL6r5G+nmKAfRAjGbmWi/Y1ANI0ucO5XxnfhSDb+TCQow4U6IgNvkYjN1hTNEI //9mQHDHi5qsuGcRcgvFLLeR4eVSGewseYLOR9XELMYTam4IUClwPr6WHuMqE/aE jvMZafqZ/1EhQ2SeqCo4 =wcGM -----END PGP SIGNATURE-----
diff -cNr /opt/src/pgsql-git/master/contrib/fuzzystrmatch/fuzzystrmatch--1.0.sql fuzzystrmatch/fuzzystrmatch--1.0.sql
*** /opt/src/pgsql-git/master/contrib/fuzzystrmatch/fuzzystrmatch--1.0.sql 2012-02-25 20:24:23.000000000 -0600
--- fuzzystrmatch/fuzzystrmatch--1.0.sql 2013-06-11 04:09:01.000000000 -0500
***************
*** 1,4 ****
! /* contrib/fuzzystrmatch/fuzzystrmatch--1.0.sql */
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION fuzzystrmatch" to load this file. \quit
--- 1,4 ----
! /* contrib/fuzzystrmatch/fuzzystrmatch--1.1.sql */
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION fuzzystrmatch" to load this file. \quit
***************
*** 19,24 ****
--- 19,44 ----
AS 'MODULE_PATHNAME','levenshtein_less_equal_with_costs'
LANGUAGE C IMMUTABLE STRICT;
+ CREATE FUNCTION dameraulevenshteinnocompatible (text,text,int,int,int,int) RETURNS int
+ AS 'MODULE_PATHNAME','dameraulevenshtein_with_costs_noncompatible'
+ LANGUAGE C IMMUTABLE STRICT;
+
+ CREATE FUNCTION dameraulevenshtein (text,text) RETURNS int
+ AS 'MODULE_PATHNAME','dameraulevenshtein'
+ LANGUAGE C IMMUTABLE STRICT;
+
+ CREATE FUNCTION dameraulevenshtein (text,text,int,int,int,int) RETURNS int
+ AS 'MODULE_PATHNAME','dameraulevenshtein_with_costs'
+ LANGUAGE C IMMUTABLE STRICT;
+
+ CREATE FUNCTION dameraulevenshtein_less_equal (text,text,int) RETURNS int
+ AS 'MODULE_PATHNAME','dameraulevenshtein_less_equal'
+ LANGUAGE C IMMUTABLE STRICT;
+
+ CREATE FUNCTION dameraulevenshtein_less_equal (text,text,int,int,int,int,int) RETURNS int
+ AS 'MODULE_PATHNAME','dameraulevenshtein_less_equal_with_costs'
+ LANGUAGE C IMMUTABLE STRICT;
+
CREATE FUNCTION metaphone (text,int) RETURNS text
AS 'MODULE_PATHNAME','metaphone'
LANGUAGE C IMMUTABLE STRICT;
diff -cNr /opt/src/pgsql-git/master/contrib/fuzzystrmatch/fuzzystrmatch--1.1.sql fuzzystrmatch/fuzzystrmatch--1.1.sql
*** /opt/src/pgsql-git/master/contrib/fuzzystrmatch/fuzzystrmatch--1.1.sql 1969-12-31 18:00:00.000000000 -0600
--- fuzzystrmatch/fuzzystrmatch--1.1.sql 2013-06-11 04:09:01.000000000 -0500
***************
*** 0 ****
--- 1,60 ----
+ /* contrib/fuzzystrmatch/fuzzystrmatch--1.1.sql */
+
+ -- complain if script is sourced in psql, rather than via CREATE EXTENSION
+ \echo Use "CREATE EXTENSION fuzzystrmatch" to load this file. \quit
+
+ CREATE FUNCTION levenshtein (text,text) RETURNS int
+ AS 'MODULE_PATHNAME','levenshtein'
+ LANGUAGE C IMMUTABLE STRICT;
+
+ CREATE FUNCTION levenshtein (text,text,int,int,int) RETURNS int
+ AS 'MODULE_PATHNAME','levenshtein_with_costs'
+ LANGUAGE C IMMUTABLE STRICT;
+
+ CREATE FUNCTION levenshtein_less_equal (text,text,int) RETURNS int
+ AS 'MODULE_PATHNAME','levenshtein_less_equal'
+ LANGUAGE C IMMUTABLE STRICT;
+
+ CREATE FUNCTION levenshtein_less_equal (text,text,int,int,int,int) RETURNS int
+ AS 'MODULE_PATHNAME','levenshtein_less_equal_with_costs'
+ LANGUAGE C IMMUTABLE STRICT;
+
+ CREATE FUNCTION dameraulevenshtein (text,text) RETURNS int
+ AS 'MODULE_PATHNAME','dameraulevenshtein'
+ LANGUAGE C IMMUTABLE STRICT;
+
+ CREATE FUNCTION dameraulevenshtein (text,text,int,int,int,int) RETURNS int
+ AS 'MODULE_PATHNAME','dameraulevenshtein_with_costs'
+ LANGUAGE C IMMUTABLE STRICT;
+
+ CREATE FUNCTION dameraulevenshtein_less_equal (text,text,int) RETURNS int
+ AS 'MODULE_PATHNAME','dameraulevenshtein_less_equal'
+ LANGUAGE C IMMUTABLE STRICT;
+
+ CREATE FUNCTION dameraulevenshtein_less_equal (text,text,int,int,int,int,int) RETURNS int
+ AS 'MODULE_PATHNAME','dameraulevenshtein_less_equal_with_costs'
+ LANGUAGE C IMMUTABLE STRICT;
+
+ CREATE FUNCTION metaphone (text,int) RETURNS text
+ AS 'MODULE_PATHNAME','metaphone'
+ LANGUAGE C IMMUTABLE STRICT;
+
+ CREATE FUNCTION soundex(text) RETURNS text
+ AS 'MODULE_PATHNAME', 'soundex'
+ LANGUAGE C IMMUTABLE STRICT;
+
+ CREATE FUNCTION text_soundex(text) RETURNS text
+ AS 'MODULE_PATHNAME', 'soundex'
+ LANGUAGE C IMMUTABLE STRICT;
+
+ CREATE FUNCTION difference(text,text) RETURNS int
+ AS 'MODULE_PATHNAME', 'difference'
+ LANGUAGE C IMMUTABLE STRICT;
+
+ CREATE FUNCTION dmetaphone (text) RETURNS text
+ AS 'MODULE_PATHNAME', 'dmetaphone'
+ LANGUAGE C IMMUTABLE STRICT;
+
+ CREATE FUNCTION dmetaphone_alt (text) RETURNS text
+ AS 'MODULE_PATHNAME', 'dmetaphone_alt'
+ LANGUAGE C IMMUTABLE STRICT;
diff -cNr /opt/src/pgsql-git/master/contrib/fuzzystrmatch/fuzzystrmatch.c fuzzystrmatch/fuzzystrmatch.c
*** /opt/src/pgsql-git/master/contrib/fuzzystrmatch/fuzzystrmatch.c 2013-02-03 11:09:42.000000000 -0600
--- fuzzystrmatch/fuzzystrmatch.c 2013-06-11 04:09:01.000000000 -0500
***************
*** 53,58 ****
--- 53,64 ----
extern Datum levenshtein(PG_FUNCTION_ARGS);
extern Datum levenshtein_less_equal_with_costs(PG_FUNCTION_ARGS);
extern Datum levenshtein_less_equal(PG_FUNCTION_ARGS);
+
+ extern Datum dameraulevenshtein_with_costs(PG_FUNCTION_ARGS);
+ extern Datum dameraulevenshtein(PG_FUNCTION_ARGS);
+ extern Datum dameraulevenshtein_less_equal_with_costs(PG_FUNCTION_ARGS);
+ extern Datum dameraulevenshtein_less_equal(PG_FUNCTION_ARGS);
+
extern Datum metaphone(PG_FUNCTION_ARGS);
extern Datum soundex(PG_FUNCTION_ARGS);
extern Datum difference(PG_FUNCTION_ARGS);
***************
*** 193,199 ****
int del_c = PG_GETARG_INT32(3);
int sub_c = PG_GETARG_INT32(4);
! PG_RETURN_INT32(levenshtein_internal(src, dst, ins_c, del_c, sub_c));
}
--- 199,205 ----
int del_c = PG_GETARG_INT32(3);
int sub_c = PG_GETARG_INT32(4);
! PG_RETURN_INT32(levenshtein_internal(src, dst, ins_c, del_c, sub_c, 0));
}
***************
*** 204,210 ****
text *src = PG_GETARG_TEXT_PP(0);
text *dst = PG_GETARG_TEXT_PP(1);
! PG_RETURN_INT32(levenshtein_internal(src, dst, 1, 1, 1));
}
--- 210,216 ----
text *src = PG_GETARG_TEXT_PP(0);
text *dst = PG_GETARG_TEXT_PP(1);
! PG_RETURN_INT32(levenshtein_internal(src, dst, 1, 1, 1, 0));
}
***************
*** 219,225 ****
int sub_c = PG_GETARG_INT32(4);
int max_d = PG_GETARG_INT32(5);
! PG_RETURN_INT32(levenshtein_less_equal_internal(src, dst, ins_c, del_c, sub_c, max_d));
}
--- 225,231 ----
int sub_c = PG_GETARG_INT32(4);
int max_d = PG_GETARG_INT32(5);
! PG_RETURN_INT32(levenshtein_less_equal_internal(src, dst, ins_c, del_c, sub_c, 0, max_d));
}
***************
*** 231,240 ****
text *dst = PG_GETARG_TEXT_PP(1);
int max_d = PG_GETARG_INT32(2);
! PG_RETURN_INT32(levenshtein_less_equal_internal(src, dst, 1, 1, 1, max_d));
}
/*
* Calculates the metaphone of an input string.
* Returns number of characters requested
--- 237,298 ----
text *dst = PG_GETARG_TEXT_PP(1);
int max_d = PG_GETARG_INT32(2);
! PG_RETURN_INT32(levenshtein_less_equal_internal(src, dst, 1, 1, 1, 0, max_d));
! }
!
! PG_FUNCTION_INFO_V1(dameraulevenshtein_with_costs);
! Datum
! dameraulevenshtein_with_costs(PG_FUNCTION_ARGS)
! {
! text *src = PG_GETARG_TEXT_PP(0);
! text *dst = PG_GETARG_TEXT_PP(1);
! int ins_c = PG_GETARG_INT32(2);
! int del_c = PG_GETARG_INT32(3);
! int sub_c = PG_GETARG_INT32(4);
! int trans_c = PG_GETARG_INT32(5);
!
! PG_RETURN_INT32(levenshtein_internal(src, dst, ins_c, del_c, sub_c, trans_c));
! }
!
!
! PG_FUNCTION_INFO_V1(dameraulevenshtein);
! Datum
! dameraulevenshtein(PG_FUNCTION_ARGS)
! {
! text *src = PG_GETARG_TEXT_PP(0);
! text *dst = PG_GETARG_TEXT_PP(1);
!
! PG_RETURN_INT32(levenshtein_internal(src, dst, 1, 1, 1, 1));
}
+ PG_FUNCTION_INFO_V1(dameraulevenshtein_less_equal_with_costs);
+ Datum
+ dameraulevenshtein_less_equal_with_costs(PG_FUNCTION_ARGS)
+ {
+ text *src = PG_GETARG_TEXT_PP(0);
+ text *dst = PG_GETARG_TEXT_PP(1);
+ int ins_c = PG_GETARG_INT32(2);
+ int del_c = PG_GETARG_INT32(3);
+ int sub_c = PG_GETARG_INT32(4);
+ int trans_c = PG_GETARG_INT32(5);
+ int max_d = PG_GETARG_INT32(6);
+
+ PG_RETURN_INT32(levenshtein_less_equal_internal(src, dst, ins_c, del_c, sub_c, trans_c, max_d));
+ }
+
+
+ PG_FUNCTION_INFO_V1(dameraulevenshtein_less_equal);
+ Datum
+ dameraulevenshtein_less_equal(PG_FUNCTION_ARGS)
+ {
+ text *src = PG_GETARG_TEXT_PP(0);
+ text *dst = PG_GETARG_TEXT_PP(1);
+ int max_d = PG_GETARG_INT32(2);
+
+ PG_RETURN_INT32(levenshtein_less_equal_internal(src, dst, 1, 1, 1, 1, max_d));
+ }
+
/*
* Calculates the metaphone of an input string.
* Returns number of characters requested
diff -cNr /opt/src/pgsql-git/master/contrib/fuzzystrmatch/fuzzystrmatch.control fuzzystrmatch/fuzzystrmatch.control
*** /opt/src/pgsql-git/master/contrib/fuzzystrmatch/fuzzystrmatch.control 2011-02-17 11:19:41.000000000 -0600
--- fuzzystrmatch/fuzzystrmatch.control 2013-06-11 04:09:01.000000000 -0500
***************
*** 1,5 ****
# fuzzystrmatch extension
comment = 'determine similarities and distance between strings'
! default_version = '1.0'
module_pathname = '$libdir/fuzzystrmatch'
relocatable = true
--- 1,5 ----
# fuzzystrmatch extension
comment = 'determine similarities and distance between strings'
! default_version = '1.1'
module_pathname = '$libdir/fuzzystrmatch'
relocatable = true
diff -cNr /opt/src/pgsql-git/master/contrib/fuzzystrmatch/fuzzystrmatch--unpackaged--1.0.sql fuzzystrmatch/fuzzystrmatch--unpackaged--1.0.sql
*** /opt/src/pgsql-git/master/contrib/fuzzystrmatch/fuzzystrmatch--unpackaged--1.0.sql 2012-02-25 20:24:23.000000000 -0600
--- fuzzystrmatch/fuzzystrmatch--unpackaged--1.0.sql 2013-06-11 04:09:01.000000000 -0500
***************
*** 1,10 ****
! /* contrib/fuzzystrmatch/fuzzystrmatch--unpackaged--1.0.sql */
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION fuzzystrmatch" to load this file. \quit
ALTER EXTENSION fuzzystrmatch ADD function levenshtein(text,text);
ALTER EXTENSION fuzzystrmatch ADD function levenshtein(text,text,integer,integer,integer);
ALTER EXTENSION fuzzystrmatch ADD function metaphone(text,integer);
ALTER EXTENSION fuzzystrmatch ADD function soundex(text);
ALTER EXTENSION fuzzystrmatch ADD function text_soundex(text);
--- 1,14 ----
! /* contrib/fuzzystrmatch/fuzzystrmatch--unpackaged--1.1.sql */
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION fuzzystrmatch" to load this file. \quit
ALTER EXTENSION fuzzystrmatch ADD function levenshtein(text,text);
ALTER EXTENSION fuzzystrmatch ADD function levenshtein(text,text,integer,integer,integer);
+
+ ALTER EXTENSION fuzzystrmatch ADD function dameraulevenshteinnoncompatible(text,text,integer,integer,integer,integer);
+ ALTER EXTENSION fuzzystrmatch ADD function dameraulevenshtein(text,text);
+ ALTER EXTENSION fuzzystrmatch ADD function dameraulevenshtein(text,text,integer,integer,integer,integer);
ALTER EXTENSION fuzzystrmatch ADD function metaphone(text,integer);
ALTER EXTENSION fuzzystrmatch ADD function soundex(text);
ALTER EXTENSION fuzzystrmatch ADD function text_soundex(text);
***************
*** 21,23 ****
--- 25,39 ----
CREATE FUNCTION levenshtein_less_equal (text,text,int,int,int,int) RETURNS int
AS 'MODULE_PATHNAME','levenshtein_less_equal_with_costs'
LANGUAGE C IMMUTABLE STRICT;
+
+ CREATE FUNCTION dameraulevenshtein_less_equal (text,text,int) RETURNS int
+ AS 'MODULE_PATHNAME','dameraulevenshtein_less_equal'
+ LANGUAGE C IMMUTABLE STRICT;
+
+ CREATE FUNCTION dameraulevenshtein_less_equal (text,text,int,int,int,int,int) RETURNS int
+ AS 'MODULE_PATHNAME','dameraulevenshtein_less_equal_with_costs'
+ LANGUAGE C IMMUTABLE STRICT;
+
+ CREATE FUNCTION dameraulevenshtein_less_equal_noncompatible (text,text,int,int,int,int,int) RETURNS int
+ AS 'MODULE_PATHNAME','dameraulevenshtein_less_equal_with_costs_noncompatible'
+ LANGUAGE C IMMUTABLE STRICT;
diff -cNr /opt/src/pgsql-git/master/contrib/fuzzystrmatch/fuzzystrmatch--unpackaged--1.1.sql fuzzystrmatch/fuzzystrmatch--unpackaged--1.1.sql
*** /opt/src/pgsql-git/master/contrib/fuzzystrmatch/fuzzystrmatch--unpackaged--1.1.sql 1969-12-31 18:00:00.000000000 -0600
--- fuzzystrmatch/fuzzystrmatch--unpackaged--1.1.sql 2013-06-11 04:09:01.000000000 -0500
***************
*** 0 ****
--- 1,33 ----
+ /* contrib/fuzzystrmatch/fuzzystrmatch--unpackaged--1.1.sql */
+
+ -- complain if script is sourced in psql, rather than via CREATE EXTENSION
+ \echo Use "CREATE EXTENSION fuzzystrmatch" to load this file. \quit
+
+ ALTER EXTENSION fuzzystrmatch ADD function levenshtein(text,text);
+ ALTER EXTENSION fuzzystrmatch ADD function levenshtein(text,text,integer,integer,integer);
+ ALTER EXTENSION fuzzystrmatch ADD function levenshtein_less_equal (text,text,int);
+ ALTER EXTENSION fuzzystrmatch ADD function levenshtein_less_equal (text,text,int,int,int,int);
+ ALTER EXTENSION fuzzystrmatch ADD function metaphone(text,integer);
+ ALTER EXTENSION fuzzystrmatch ADD function soundex(text);
+ ALTER EXTENSION fuzzystrmatch ADD function text_soundex(text);
+ ALTER EXTENSION fuzzystrmatch ADD function difference(text,text);
+ ALTER EXTENSION fuzzystrmatch ADD function dmetaphone(text);
+ ALTER EXTENSION fuzzystrmatch ADD function dmetaphone_alt(text);
+
+ -- these functions were not in 9.3
+
+ CREATE FUNCTION dameraulevenshtein (text,text) RETURNS int
+ AS 'MODULE_PATHNAME','dameraulevenshtein'
+ LANGUAGE C IMMUTABLE STRICT;
+
+ CREATE FUNCTION dameraulevenshtein (text,text,int,int,int,int) RETURNS int
+ AS 'MODULE_PATHNAME','dameraulevenshtein_with_costs'
+ LANGUAGE C IMMUTABLE STRICT;
+
+ CREATE FUNCTION dameraulevenshtein_less_equal (text,text,int) RETURNS int
+ AS 'MODULE_PATHNAME','dameraulevenshtein_less_equal'
+ LANGUAGE C IMMUTABLE STRICT;
+
+ CREATE FUNCTION dameraulevenshtein_less_equal (text,text,int,int,int,int,int) RETURNS int
+ AS 'MODULE_PATHNAME','dameraulevenshtein_less_equal_with_costs'
+ LANGUAGE C IMMUTABLE STRICT;
diff -cNr /opt/src/pgsql-git/master/contrib/fuzzystrmatch/levenshtein.c fuzzystrmatch/levenshtein.c
*** /opt/src/pgsql-git/master/contrib/fuzzystrmatch/levenshtein.c 2013-02-03 11:09:42.000000000 -0600
--- fuzzystrmatch/levenshtein.c 2013-06-11 04:09:01.000000000 -0500
***************
*** 16,21 ****
--- 16,27 ----
* inspiration.
* Configurable penalty costs extension is introduced by Volkan
* YAZICI <[email protected]>.
+ * Damerau levenshtein variant
+ * Liming Hu <[email protected]>
+ * based on description of the algorithm:
+ * http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
+ * and:
+ * http://tomoyo.sourceforge.jp/cgi-bin/lxr/source/tools/perf/util/levenshtein.c
*/
/*
***************
*** 23,41 ****
*/
#ifdef LEVENSHTEIN_LESS_EQUAL
static int levenshtein_less_equal_internal(text *s, text *t,
! int ins_c, int del_c, int sub_c, int max_d);
#else
static int levenshtein_internal(text *s, text *t,
! int ins_c, int del_c, int sub_c);
#endif
#define MAX_LEVENSHTEIN_STRLEN 255
/*
! * Calculates Levenshtein distance metric between supplied strings. Generally
! * (1, 1, 1) penalty costs suffices for common cases, but your mileage may
! * vary.
*
* One way to compute Levenshtein distance is to incrementally construct
* an (m+1)x(n+1) matrix where cell (i, j) represents the minimum number
--- 29,52 ----
*/
#ifdef LEVENSHTEIN_LESS_EQUAL
static int levenshtein_less_equal_internal(text *s, text *t,
! int ins_c, int del_c, int sub_c, int trans_c, int max_d);
! static int dameraulevenshtein_less_equal_internal(text *s, text *t,
! int ins_c, int del_c, int sub_c, int trans_c, int max_d);
!
#else
static int levenshtein_internal(text *s, text *t,
! int ins_c, int del_c, int sub_c, int trans_c);
! static int dameraulevenshtein_internal(text *s, text *t,
! int ins_c, int del_c, int sub_c, int trans_c);
#endif
#define MAX_LEVENSHTEIN_STRLEN 255
/*
! * Calculates [Damerau ]Levenshtein distance metric between supplied strings. Generally
! * (1, 1, 1 [, 1]) penalty costs suffices for common cases, but your mileage
! * may vary.
*
* One way to compute Levenshtein distance is to incrementally construct
* an (m+1)x(n+1) matrix where cell (i, j) represents the minimum number
***************
*** 43,53 ****
* the first j characters of t. The last column of the final row is the
* answer.
*
! * We use that algorithm here with some modification. In lieu of holding
! * the entire array in memory at once, we'll just use two arrays of size
! * m+1 for storing accumulated values. At each step one array represents
! * the "previous" row and one is the "current" row of the notional large
! * array.
*
* If max_d >= 0, we only need to provide an accurate answer when that answer
* is less than or equal to the bound. From any cell in the matrix, there is
--- 54,75 ----
* the first j characters of t. The last column of the final row is the
* answer.
*
! * We use that algorithm here with some modification. In the case of Levenshtein
! * distance: in lieu of holding the entire array in memory at once, we'll just use
! * two arrays of size m+1 for storing accumulated values. At each step one array
! * represents the "previous" row and one is the "current" row of the notional large
! * array. In the case of Damerau-Levenshtein distance, to avoid a large space complexity,
! * only the last three rows are kept in memory (if swaps had the same or higher cost
! * as one deletion plus one insertion, only two rows would be needed).
! * At any stage, "i + 1" denotes the length of the current substring of
! * s_data that the distance is calculated for.
! *
! * row2 holds the current row, row1 the previous row (i.e. for the substring
! * of s_data of length "i"), and row0 the row before that.
! *
! * In other words, at the start of the big loop, row2[j + 1] contains the
! * Damerau-Levenshtein distance between the substring of s_data of length
! * "i" and the substring of t_data of length "j + 1".
*
* If max_d >= 0, we only need to provide an accurate answer when that answer
* is less than or equal to the bound. From any cell in the matrix, there is
***************
*** 66,77 ****
static int
#ifdef LEVENSHTEIN_LESS_EQUAL
levenshtein_less_equal_internal(text *s, text *t,
! int ins_c, int del_c, int sub_c, int max_d)
#else
levenshtein_internal(text *s, text *t,
! int ins_c, int del_c, int sub_c)
#endif
{
int m,
n,
s_bytes,
--- 88,112 ----
static int
#ifdef LEVENSHTEIN_LESS_EQUAL
levenshtein_less_equal_internal(text *s, text *t,
! int ins_c, int del_c, int sub_c, int trans_c, int max_d)
#else
levenshtein_internal(text *s, text *t,
! int ins_c, int del_c, int sub_c, int trans_c)
#endif
{
+
+
+
+ #ifdef LEVENSHTEIN_LESS_EQUAL
+ if (trans_c!=0)
+ return dameraulevenshtein_less_equal_internal(s,t, ins_c, del_c, sub_c, trans_c, max_d);
+ #else
+ if (trans_c!=0)
+ return dameraulevenshtein_internal(s,t, ins_c, del_c, sub_c, trans_c);
+ #endif
+
+
+
int m,
n,
s_bytes,
***************
*** 275,280 ****
--- 310,316 ----
int ins;
int del;
int sub;
+ int trans;
int x_char_len = s_char_len[i - 1];
/*
***************
*** 310,315 ****
--- 346,352 ----
int ins;
int del;
int sub;
+ int trans;
/* Calculate costs for insertion, deletion, and substitution. */
ins = prev[i] + ins_c;
***************
*** 401,403 ****
--- 438,504 ----
*/
return prev[m - 1];
}
+
+ static int
+ #ifdef LEVENSHTEIN_LESS_EQUAL
+ dameraulevenshtein_less_equal_internal(text *s, text *t,
+ int ins_c, int del_c, int sub_c, int trans_c, int max_d)
+ #else
+ dameraulevenshtein_internal(text *s, text *t,
+ int ins_c, int del_c, int sub_c, int trans_c)
+ #endif
+ {
+
+ /* Extract a pointer to the actual character data. */
+ const char *s_data = VARDATA_ANY(s);
+ const char *t_data = VARDATA_ANY(t);
+
+ /* Determine length of each string in bytes and characters. */
+ int s_bytes = VARSIZE_ANY_EXHDR(s);
+ int t_bytes = VARSIZE_ANY_EXHDR(t);
+ /* returns the length (counted in wchars) of a multibyte string
+ * (not necessarily NULL terminated)
+ */
+ int length1 = pg_mbstrlen_with_len(s_data, s_bytes);
+ int length2 = pg_mbstrlen_with_len(t_data, t_bytes);
+
+ int *row0 = (int*) palloc(sizeof(int) * (length2 + 1)); /*the row before that.*/
+ int *row1 = (int*) palloc(sizeof(int) * (length2 + 1)); /*the previous row, for the substring of s_data of length i*/
+ int *row2 = (int*) palloc(sizeof(int) * (length2 + 1)); /*current row.*/
+ int i, j;
+ int distance = 0;
+
+ for (j = 0; j <= length2; j++) /*length2+1*/
+ row1[j] = j * ins_c;
+
+ for (i = 0; i < length1; i++) { /*length1: determines the partial minimum-cost paths.*/
+ int *dummy;
+
+ row2[0] = (i + 1) * del_c;
+ for (j = 0; j < length2; j++) { /*length2*/
+
+ row2[j + 1] = row1[j] + sub_c * (s_data[i] != t_data[j]); /* substitution */
+
+ if (i > 0 && j > 0 && s_data[i - 1] == t_data[j] &&
+ s_data[i] == t_data[j - 1] && /* transposition */
+ row2[j + 1] > row0[j - 1] + trans_c)
+ row2[j + 1] = row0[j - 1] + trans_c;
+
+ if (row2[j + 1] > row1[j + 1] + del_c) /* deletion */
+ row2[j + 1] = row1[j + 1] + del_c;
+
+ if (row2[j + 1] > row2[j] + ins_c) /* insertion */
+ row2[j + 1] = row2[j] + ins_c;
+ }
+
+ dummy = row0;
+ row0 = row1;
+ row1 = row2;
+ row2 = dummy;
+ }
+
+ distance = row1[length2];
+ return distance;
+ }
+
+
diff -cNr /opt/src/pgsql-git/master/contrib/fuzzystrmatch/Makefile fuzzystrmatch/Makefile
*** /opt/src/pgsql-git/master/contrib/fuzzystrmatch/Makefile 2011-05-06 01:09:32.000000000 -0500
--- fuzzystrmatch/Makefile 2013-06-11 04:09:01.000000000 -0500
***************
*** 4,10 ****
OBJS = fuzzystrmatch.o dmetaphone.o
EXTENSION = fuzzystrmatch
! DATA = fuzzystrmatch--1.0.sql fuzzystrmatch--unpackaged--1.0.sql
ifdef USE_PGXS
PG_CONFIG = pg_config
--- 4,10 ----
OBJS = fuzzystrmatch.o dmetaphone.o
EXTENSION = fuzzystrmatch
! DATA = fuzzystrmatch--1.1.sql fuzzystrmatch--unpackaged--1.1.sql
ifdef USE_PGXS
PG_CONFIG = pg_config
diff -cNr /opt/src/pgsql-git/master/contrib/fuzzystrmatch/README.md fuzzystrmatch/README.md
*** /opt/src/pgsql-git/master/contrib/fuzzystrmatch/README.md 1969-12-31 18:00:00.000000000 -0600
--- fuzzystrmatch/README.md 2013-06-11 04:09:01.000000000 -0500
***************
*** 0 ****
--- 1,4 ----
+ fuzzystrmatch
+ =============
+
+ add Damerau-Levenshtein algorithm to fuzzystrmatch in PostgreSql
Levenshtein-Damerau.diff.sig
Description: PGP signature
-- Sent via pgsql-hackers mailing list ([email protected]) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers
