Re: [HACKERS] improvements for dict_xsyn extended synonym dictionary - RRR

2009-08-05 Thread Tom Lane
I wrote:
 kar...@sao.ru (Sergey V. Karpov) writes:
 Andres Freund and...@anarazel.de writes:
 Looks nice. The only small gripe I have is that the patch adds trailing 
 whitespaces at a lot of places...

 My fault. Please check the patch version attached - I've tried to fix
 all those.

 I did some minor cleanup on this patch:

I've committed this version.

regards, tom lane

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] improvements for dict_xsyn extended synonym dictionary - RRR

2009-07-30 Thread Sergey V. Karpov
Andres Freund and...@anarazel.de writes:

Hi Andres,

 Looks nice. The only small gripe I have is that the patch adds trailing 
 whitespaces at a lot of places...

 Except maybe that I do see no need for changes anymore...

My fault. Please check the patch version attached - I've tried to fix
all those.

Thanks,
Sergey

Index: contrib/dict_xsyn/dict_xsyn.c
===
RCS file: /projects/cvsroot/pgsql/contrib/dict_xsyn/dict_xsyn.c,v
retrieving revision 1.6
diff -u -r1.6 dict_xsyn.c
--- contrib/dict_xsyn/dict_xsyn.c	1 Jan 2009 17:23:32 -	1.6
+++ contrib/dict_xsyn/dict_xsyn.c	30 Jul 2009 07:37:54 -
@@ -26,6 +26,7 @@
 	char	   *key;			/* Word */
 	char	   *value;			/* Unparsed list of synonyms, including the
  * word itself */
+	int pos;/* Position of key word in original string */
 } Syn;
 
 typedef struct
@@ -33,7 +34,10 @@
 	int			len;
 	Syn		   *syn;
 
+	bool		matchorig;
 	bool		keeporig;
+	bool		matchsynonyms;
+	bool		keepsynonyms;
 } DictSyn;
 
 
@@ -88,6 +92,7 @@
 	{
 		char	   *value;
 		char	   *key;
+		char   *pos;
 		char	   *end = NULL;
 
 		if (*line == '\0')
@@ -96,26 +101,39 @@
 		value = lowerstr(line);
 		pfree(line);
 
-		key = find_word(value, end);
-		if (!key)
-		{
-			pfree(value);
-			continue;
-		}
+		pos = value;
 
-		if (cur == d-len)
+		while((key = find_word(pos, end)) != NULL)
 		{
-			d-len = (d-len  0) ? 2 * d-len : 16;
-			if (d-syn)
-d-syn = (Syn *) repalloc(d-syn, sizeof(Syn) * d-len);
-			else
-d-syn = (Syn *) palloc(sizeof(Syn) * d-len);
-		}
+			if (cur == d-len)
+			{
+d-len = (d-len  0) ? 2 * d-len : 16;
+if (d-syn)
+	d-syn = (Syn *) repalloc(d-syn, sizeof(Syn) * d-len);
+else
+	d-syn = (Syn *) palloc(sizeof(Syn) * d-len);
+			}
+
+			/* Read first word only if we will match it */
+			if (pos != value || d-matchorig)
+			{
+d-syn[cur].key = pnstrdup(key, end - key);
+d-syn[cur].value = pstrdup(value);
+d-syn[cur].pos = key - value;
 
-		d-syn[cur].key = pnstrdup(key, end - key);
-		d-syn[cur].value = value;
+cur++;
+			}
+
+			pos = end;
+
+			/* Don't read synonyms if we do not match them */
+			if (!d-matchsynonyms)
+			{
+break;
+			}
+		}
 
-		cur++;
+		pfree(value);
 	}
 
 	tsearch_readline_end(trst);
@@ -133,23 +151,40 @@
 	List	   *dictoptions = (List *) PG_GETARG_POINTER(0);
 	DictSyn*d;
 	ListCell   *l;
+	char   *filename = NULL;
 
 	d = (DictSyn *) palloc0(sizeof(DictSyn));
 	d-len = 0;
 	d-syn = NULL;
+	d-matchorig = true;
 	d-keeporig = true;
+	d-matchsynonyms = false;
+	d-keepsynonyms = true;
 
 	foreach(l, dictoptions)
 	{
 		DefElem*defel = (DefElem *) lfirst(l);
 
-		if (pg_strcasecmp(defel-defname, KEEPORIG) == 0)
+		if (pg_strcasecmp(defel-defname, MATCHORIG) == 0)
+		{
+			d-matchorig = defGetBoolean(defel);
+		}
+		else if (pg_strcasecmp(defel-defname, KEEPORIG) == 0)
 		{
 			d-keeporig = defGetBoolean(defel);
 		}
+		else if (pg_strcasecmp(defel-defname, MATCHSYNONYMS) == 0)
+		{
+			d-matchsynonyms = defGetBoolean(defel);
+		}
+		else if (pg_strcasecmp(defel-defname, KEEPSYNONYMS) == 0)
+		{
+			d-keepsynonyms = defGetBoolean(defel);
+		}
 		else if (pg_strcasecmp(defel-defname, RULES) == 0)
 		{
-			read_dictionary(d, defGetString(defel));
+			/* we can't read the rules before parsing all options! */
+			filename = pstrdup(defGetString(defel));
 		}
 		else
 		{
@@ -160,6 +195,12 @@
 		}
 	}
 
+	if(filename)
+	{
+		read_dictionary(d, filename);
+		pfree(filename);
+	}
+
 	PG_RETURN_POINTER(d);
 }
 
@@ -198,7 +239,6 @@
 		int			value_length = strlen(value);
 		char	   *pos = value;
 		int			nsyns = 0;
-		bool		is_first = true;
 
 		res = palloc(0);
 
@@ -214,8 +254,8 @@
 			res = repalloc(res, sizeof(TSLexeme) * (nsyns + 2));
 			res[nsyns].lexeme = NULL;
 
-			/* first word is added to result only if KEEPORIG flag is set */
-			if (d-keeporig || !is_first)
+			/* The first word is added only if keeporig=true */
+			if (pos != value || d-keeporig)
 			{
 res[nsyns].lexeme = pstrdup(syn);
 res[nsyns + 1].lexeme = NULL;
@@ -223,9 +263,12 @@
 nsyns++;
 			}
 
-			is_first = false;
-
 			pos = end + 1;
+
+			if(!d-keepsynonyms)
+			{
+break;
+			}
 		}
 
 		pfree(value);
Index: contrib/dict_xsyn/expected/dict_xsyn.out
===
RCS file: /projects/cvsroot/pgsql/contrib/dict_xsyn/expected/dict_xsyn.out,v
retrieving revision 1.1
diff -u -r1.1 dict_xsyn.out
--- contrib/dict_xsyn/expected/dict_xsyn.out	15 Oct 2007 21:36:50 -	1.1
+++ contrib/dict_xsyn/expected/dict_xsyn.out	30 Jul 2009 07:37:54 -
@@ -5,10 +5,76 @@
 SET client_min_messages = warning;
 \set ECHO none
 RESET client_min_messages;
---configuration
-ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false);
+-- default configuration - match first word and return it among with all synonyms
+ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', 

Re: [HACKERS] improvements for dict_xsyn extended synonym dictionary - RRR

2009-07-30 Thread Tom Lane
kar...@sao.ru (Sergey V. Karpov) writes:
 Andres Freund and...@anarazel.de writes:
 Looks nice. The only small gripe I have is that the patch adds trailing 
 whitespaces at a lot of places...

 My fault. Please check the patch version attached - I've tried to fix
 all those.

I did some minor cleanup on this patch:
* make the two parsing loops less confusingly different
* remove unused 'pos' field of Syn
* avoid some unnecessary pallocs
* improve the comments and docs a bit

I think it's ready for committer too, but the committer I have in mind
is Teodor --- he's the ultimate expert on tsearch stuff.  Teodor, have
you got time to look this over and commit it?

regards, tom lane



bin6EDiXbmqbR.bin
Description: dict_xsyn_3.patch.gz

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] improvements for dict_xsyn extended synonym dictionary - RRR

2009-07-29 Thread Andres Freund
Hi Sergey,

Sorry that the second round took almost as long as the first one...

On Monday 27 July 2009 12:01:46 Sergey V. Karpov wrote:
  - Imho mode=MAP should error out if keeporig is false
  - I personally find the the names for the different modes a bit
  nondescriptive. One possibility would be to introduce parameters like:
  - matchorig
  - matchsynonym
  - keeporig
  - keepsynonym
  That sounds way much easier to grasp for me.
 Yes, I agree. In such a way user has the complete (and more
 straightforward) control over the dictionary behaviour.

 Here is the revised patch version, with following options:

  * matchorig controls whether the original word is accepted by the
dictionary. Default is true.

  * keeporig controls whether the original word is included (if true)
in results, or only its synonyms (if false). Default is true.

  * matchsynonyms controls whether any of the synonyms is accepted by
the dictionary (if true). Default is false.

  * keepsynonyms controls whether synonyms are returned by the
dictionary (if true). Default is true.

 Defaults are set to keep default behaviour compatible with original
 version.
Looks nice. The only small gripe I have is that the patch adds trailing 
whitespaces at a lot of places...

Except maybe that I do see no need for changes anymore...

Andres

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers


Re: [HACKERS] improvements for dict_xsyn extended synonym dictionary - RRR

2009-07-29 Thread Robert Haas
On Wed, Jul 29, 2009 at 6:59 PM, Andres Freundand...@anarazel.de wrote:
 Looks nice. The only small gripe I have is that the patch adds trailing
 whitespaces at a lot of places...

 Except maybe that I do see no need for changes anymore...

I have fixed this for Sergey in the attached version using git apply
--whitespace=fix.  (For those who may be using git to develop
patches, I highly recommend git --check to catch these types of issues
before submitting.)

I will mark this Ready for Committer.

...Robert
*** a/contrib/dict_xsyn/dict_xsyn.c
--- b/contrib/dict_xsyn/dict_xsyn.c
***
*** 26,31  typedef struct
--- 26,32 
  	char	   *key;			/* Word */
  	char	   *value;			/* Unparsed list of synonyms, including the
   * word itself */
+ 	int pos;/* Position of key word in original string */
  } Syn;
  
  typedef struct
***
*** 33,39  typedef struct
--- 34,44 
  	int			len;
  	Syn		   *syn;
  
+ 	bool		matchorig;
  	bool		keeporig;
+ 	bool		matchsynonyms;
+ 	bool		keepsynonyms;
+ 
  } DictSyn;
  
  
***
*** 88,93  read_dictionary(DictSyn *d, char *filename)
--- 93,99 
  	{
  		char	   *value;
  		char	   *key;
+ 		char   *pos;
  		char	   *end = NULL;
  
  		if (*line == '\0')
***
*** 96,121  read_dictionary(DictSyn *d, char *filename)
  		value = lowerstr(line);
  		pfree(line);
  
! 		key = find_word(value, end);
! 		if (!key)
! 		{
! 			pfree(value);
! 			continue;
! 		}
  
! 		if (cur == d-len)
  		{
! 			d-len = (d-len  0) ? 2 * d-len : 16;
! 			if (d-syn)
! d-syn = (Syn *) repalloc(d-syn, sizeof(Syn) * d-len);
! 			else
! d-syn = (Syn *) palloc(sizeof(Syn) * d-len);
! 		}
  
! 		d-syn[cur].key = pnstrdup(key, end - key);
! 		d-syn[cur].value = value;
  
! 		cur++;
  	}
  
  	tsearch_readline_end(trst);
--- 102,140 
  		value = lowerstr(line);
  		pfree(line);
  
! 		pos = value;
  
! 		while((key = find_word(pos, end)) != NULL)
  		{
! 			if (cur == d-len)
! 			{
! d-len = (d-len  0) ? 2 * d-len : 16;
! if (d-syn)
! 	d-syn = (Syn *) repalloc(d-syn, sizeof(Syn) * d-len);
! else
! 	d-syn = (Syn *) palloc(sizeof(Syn) * d-len);
! 			}
! 
! 			/* Read first word only if we will match it */
! 			if (pos != value || d-matchorig)
! 			{
! d-syn[cur].key = pnstrdup(key, end - key);
! d-syn[cur].value = pstrdup(value);
! d-syn[cur].pos = key - value;
! 
! cur++;
! 			}
! 
! 			pos = end;
  
! 			/* Don't read synonyms if we do not match them */
! 			if (!d-matchsynonyms)
! 			{
! break;
! 			}
! 		}
  
! 		pfree(value);
  	}
  
  	tsearch_readline_end(trst);
***
*** 133,155  dxsyn_init(PG_FUNCTION_ARGS)
  	List	   *dictoptions = (List *) PG_GETARG_POINTER(0);
  	DictSyn*d;
  	ListCell   *l;
  
  	d = (DictSyn *) palloc0(sizeof(DictSyn));
  	d-len = 0;
  	d-syn = NULL;
  	d-keeporig = true;
  
  	foreach(l, dictoptions)
  	{
  		DefElem*defel = (DefElem *) lfirst(l);
  
! 		if (pg_strcasecmp(defel-defname, KEEPORIG) == 0)
  		{
  			d-keeporig = defGetBoolean(defel);
  		}
  		else if (pg_strcasecmp(defel-defname, RULES) == 0)
  		{
! 			read_dictionary(d, defGetString(defel));
  		}
  		else
  		{
--- 152,191 
  	List	   *dictoptions = (List *) PG_GETARG_POINTER(0);
  	DictSyn*d;
  	ListCell   *l;
+ 	char   *filename = NULL;
  
  	d = (DictSyn *) palloc0(sizeof(DictSyn));
  	d-len = 0;
  	d-syn = NULL;
+ 	d-matchorig = true;
  	d-keeporig = true;
+ 	d-matchsynonyms = false;
+ 	d-keepsynonyms = true;
  
  	foreach(l, dictoptions)
  	{
  		DefElem*defel = (DefElem *) lfirst(l);
  
! 		if (pg_strcasecmp(defel-defname, MATCHORIG) == 0)
! 		{
! 			d-matchorig = defGetBoolean(defel);
! 		}
! 		else if (pg_strcasecmp(defel-defname, KEEPORIG) == 0)
  		{
  			d-keeporig = defGetBoolean(defel);
  		}
+ 		else if (pg_strcasecmp(defel-defname, MATCHSYNONYMS) == 0)
+ 		{
+ 			d-matchsynonyms = defGetBoolean(defel);
+ 		}
+ 		else if (pg_strcasecmp(defel-defname, KEEPSYNONYMS) == 0)
+ 		{
+ 			d-keepsynonyms = defGetBoolean(defel);
+ 		}
  		else if (pg_strcasecmp(defel-defname, RULES) == 0)
  		{
! 			/* we can't read the rules before parsing all options! */
! 			filename = pstrdup(defGetString(defel));
  		}
  		else
  		{
***
*** 160,165  dxsyn_init(PG_FUNCTION_ARGS)
--- 196,207 
  		}
  	}
  
+ 	if(filename)
+ 	{
+ 		read_dictionary(d, filename);
+ 		pfree(filename);
+ 	}
+ 
  	PG_RETURN_POINTER(d);
  }
  
***
*** 198,204  dxsyn_lexize(PG_FUNCTION_ARGS)
  		int			value_length = strlen(value);
  		char	   *pos = value;
  		int			nsyns = 0;
- 		bool		is_first = true;
  
  		res = palloc(0);
  
--- 240,245 
***
*** 214,221  dxsyn_lexize(PG_FUNCTION_ARGS)
  			res = repalloc(res, sizeof(TSLexeme) * (nsyns + 2));
  			res[nsyns].lexeme = NULL;
  
! 			/* first word is added to result only if KEEPORIG flag is set */
! 			if (d-keeporig || !is_first)
  			{
  

Re: [HACKERS] improvements for dict_xsyn extended synonym dictionary - RRR

2009-07-27 Thread Sergey V. Karpov
Andres Freund and...@anarazel.de writes:

Hi Andres,

Thank you for review of my patch.

 Some points:
 - Patch looks generally sound
 - lacks a bit of a motivational statement, even though one can imagine uses

The patch has initially been motivated by the request in pgsql-general
(http://archives.postgresql.org/pgsql-general/2009-02/msg00102.php).

 - Imho mode=MAP should error out if keeporig is false
 - I personally find the the names for the different modes a bit 
 nondescriptive.
   One possibility would be to introduce parameters like:
   - matchorig
   - matchsynonym
   - keeporig
   - keepsynonym
 That sounds way much easier to grasp for me.

Yes, I agree. In such a way user has the complete (and more straightforward)
control over the dictionary behaviour.

Here is the revised patch version, with following options:

 * matchorig controls whether the original word is accepted by the
   dictionary. Default is true.

 * keeporig controls whether the original word is included (if true)
   in results, or only its synonyms (if false). Default is true.

 * matchsynonyms controls whether any of the synonyms is accepted by
   the dictionary (if true). Default is false.

 * keepsynonyms controls whether synonyms are returned by the
   dictionary (if true). Default is true.

Defaults are set to keep default behaviour compatible with original version.

Thanks,
Sergey

Index: contrib/dict_xsyn/dict_xsyn.c
===
RCS file: /projects/cvsroot/pgsql/contrib/dict_xsyn/dict_xsyn.c,v
retrieving revision 1.6
diff -u -r1.6 dict_xsyn.c
--- contrib/dict_xsyn/dict_xsyn.c	1 Jan 2009 17:23:32 -	1.6
+++ contrib/dict_xsyn/dict_xsyn.c	27 Jul 2009 09:51:52 -
@@ -26,6 +26,7 @@
 	char	   *key;			/* Word */
 	char	   *value;			/* Unparsed list of synonyms, including the
  * word itself */
+	int pos;/* Position of key word in original string */
 } Syn;
 
 typedef struct
@@ -33,7 +34,11 @@
 	int			len;
 	Syn		   *syn;
 
+	bool		matchorig;
 	bool		keeporig;
+	bool		matchsynonyms;
+	bool		keepsynonyms;
+	
 } DictSyn;
 
 
@@ -88,6 +93,7 @@
 	{
 		char	   *value;
 		char	   *key;
+		char   *pos;
 		char	   *end = NULL;
 
 		if (*line == '\0')
@@ -96,26 +102,39 @@
 		value = lowerstr(line);
 		pfree(line);
 
-		key = find_word(value, end);
-		if (!key)
-		{
-			pfree(value);
-			continue;
-		}
+		pos = value;
 
-		if (cur == d-len)
+		while((key = find_word(pos, end)) != NULL)
 		{
-			d-len = (d-len  0) ? 2 * d-len : 16;
-			if (d-syn)
-d-syn = (Syn *) repalloc(d-syn, sizeof(Syn) * d-len);
-			else
-d-syn = (Syn *) palloc(sizeof(Syn) * d-len);
-		}
+			if (cur == d-len)
+			{
+d-len = (d-len  0) ? 2 * d-len : 16;
+if (d-syn)
+	d-syn = (Syn *) repalloc(d-syn, sizeof(Syn) * d-len);
+else
+	d-syn = (Syn *) palloc(sizeof(Syn) * d-len);
+			}
 
-		d-syn[cur].key = pnstrdup(key, end - key);
-		d-syn[cur].value = value;
+			/* Read first word only if we will match it */
+			if (pos != value || d-matchorig)
+			{
+d-syn[cur].key = pnstrdup(key, end - key);
+d-syn[cur].value = pstrdup(value);
+d-syn[cur].pos = key - value;
+			
+cur++;
+			}
 
-		cur++;
+			pos = end;
+
+			/* Don't read synonyms if we do not match them */
+			if (!d-matchsynonyms)
+			{
+break;
+			}
+		}
+
+		pfree(value);
 	}
 
 	tsearch_readline_end(trst);
@@ -133,23 +152,40 @@
 	List	   *dictoptions = (List *) PG_GETARG_POINTER(0);
 	DictSyn*d;
 	ListCell   *l;
-
+	char   *filename = NULL;
+	
 	d = (DictSyn *) palloc0(sizeof(DictSyn));
 	d-len = 0;
 	d-syn = NULL;
+	d-matchorig = true;
 	d-keeporig = true;
-
+	d-matchsynonyms = false;
+	d-keepsynonyms = true;
+	
 	foreach(l, dictoptions)
 	{
 		DefElem*defel = (DefElem *) lfirst(l);
 
-		if (pg_strcasecmp(defel-defname, KEEPORIG) == 0)
+		if (pg_strcasecmp(defel-defname, MATCHORIG) == 0)
+		{
+			d-matchorig = defGetBoolean(defel);
+		}
+		else if (pg_strcasecmp(defel-defname, KEEPORIG) == 0)
 		{
 			d-keeporig = defGetBoolean(defel);
 		}
+		else if (pg_strcasecmp(defel-defname, MATCHSYNONYMS) == 0)
+		{
+			d-matchsynonyms = defGetBoolean(defel);
+		}
+		else if (pg_strcasecmp(defel-defname, KEEPSYNONYMS) == 0)
+		{
+			d-keepsynonyms = defGetBoolean(defel);
+		}
 		else if (pg_strcasecmp(defel-defname, RULES) == 0)
 		{
-			read_dictionary(d, defGetString(defel));
+			/* we can't read the rules before parsing all options! */
+			filename = pstrdup(defGetString(defel));
 		}
 		else
 		{
@@ -160,6 +196,12 @@
 		}
 	}
 
+	if(filename)
+	{
+		read_dictionary(d, filename);
+		pfree(filename);
+	}
+	
 	PG_RETURN_POINTER(d);
 }
 
@@ -198,7 +240,6 @@
 		int			value_length = strlen(value);
 		char	   *pos = value;
 		int			nsyns = 0;
-		bool		is_first = true;
 
 		res = palloc(0);
 
@@ -214,8 +255,8 @@
 			res = repalloc(res, sizeof(TSLexeme) * (nsyns + 2));
 			res[nsyns].lexeme = NULL;
 
-			/* first word is added to 

Re: [HACKERS] improvements for dict_xsyn extended synonym dictionary - RRR

2009-07-25 Thread Andres Freund
Hi Sergey,

On Tuesday 14 July 2009 21:35:28 Sergey V. Karpov wrote:
 attached is a simple patch that extends the functionality of dict_xsyn
 extended synonym dictionary (from contrib) by adding the following
 configuration option:

 - mode option controls the current dictionary mode of operation. Can be
 one of:

   - in simple mode it accepts the original word and returns all synonyms
 as ORed lis.

   - when mode is symmetric, the dictionary accepts the original word or
 any of its synonyms, and return all others as ORed list.

   - in map regime it accepts any synonym and returns the original word
 instead of it. Also, it accepts and returns the original word
 itself, even if keeporig is false.
Some points:
- Patch looks generally sound
- lacks a bit of a motivational statement, even though one can imagine uses
- Imho mode=MAP should error out if keeporig is false
- I personally find the the names for the different modes a bit nondescriptive.
  One possibility would be to introduce parameters like:
- matchorig
- matchsynonym
- keeporig
- keepsynonym
That sounds way much easier to grasp for me.

Comments?

Andres

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers