Tom Lane wrote:
> Any portion of a binary value that is considered textual should be
> converted to and from client encoding --- cf textsend/textrecv.
> This should be pretty trivial to fix, just call a different support
> routine.

You do need to adjust length and position fields in the structs as well.
I fixed (rewrote, almost) the send/recv functions, and added a comment
above them describing the on-wire format. The CRC is now recalculated in
tsquery as well per previous discussion.

Patch attached. This is on top of the previous patches I sent. It
includes some additional changes that I had already started with. Most
notably:

- change the alignment requirement of lexemes in TSVector slightly.
Lexeme strings were always padded to 2-byte aligned length to make sure
that if there's a position array (uint16[]) it has the right alignment.
The patch changes that so that the padding is not done when there are no
positions. That makes the storage of tsvectors without positions
slightly more compact.

- added some #include "miscadmin.h" lines I missed earlier when I
added calls to check_stack_depth().


BTW, the encoding of the XML datatype looks pretty funky. xml_recv first
reads the xml string with pq_getmsgtext, which applies a client->server
conversion. Then the xml declaration is parsed, extracting the encoding
attribute. Then the string is converted again from that encoding (or
UTF-8 if none was specified) to server encoding. I don't understand how
it's supposed to work, but ISTM there's one conversion too many.

> BTW, Teodor, are you intending to review/apply Heikki's tsearch fixes,
> or do you want someone else to do it?

I am getting confused with the patches and versions I have lying around
here... I think I'll have to wait for review of the patches I've posted
so far before I continue hacking.

-- 
  Heikki Linnakangas
  EnterpriseDB   http://www.enterprisedb.com
*** ../pgsql.tsearch-2/src/backend/utils/adt/tsginidx.c	2007-09-06 11:19:57.000000000 +0100
--- ./src/backend/utils/adt/tsginidx.c	2007-09-07 09:20:27.000000000 +0100
***************
*** 22,28 ****
  gin_extract_tsvector(PG_FUNCTION_ARGS)
  {
  	TSVector	vector = PG_GETARG_TSVECTOR(0);
! 	uint32	   *nentries = (uint32 *) PG_GETARG_POINTER(1);
  	Datum	   *entries = NULL;
  
  	*nentries = vector->size;
--- 22,28 ----
  gin_extract_tsvector(PG_FUNCTION_ARGS)
  {
  	TSVector	vector = PG_GETARG_TSVECTOR(0);
! 	int32	   *nentries = (int32 *) PG_GETARG_POINTER(1);
  	Datum	   *entries = NULL;
  
  	*nentries = vector->size;
***************
*** 54,60 ****
  gin_extract_query(PG_FUNCTION_ARGS)
  {
  	TSQuery		query = PG_GETARG_TSQUERY(0);
! 	uint32	   *nentries = (uint32 *) PG_GETARG_POINTER(1);
  	StrategyNumber strategy = PG_GETARG_UINT16(2);
  	Datum	   *entries = NULL;
  
--- 54,60 ----
  gin_extract_query(PG_FUNCTION_ARGS)
  {
  	TSQuery		query = PG_GETARG_TSQUERY(0);
! 	int32	   *nentries = (int32 *) PG_GETARG_POINTER(1);
  	StrategyNumber strategy = PG_GETARG_UINT16(2);
  	Datum	   *entries = NULL;
  
*** ../pgsql.tsearch-2/src/backend/utils/adt/tsquery.c	2007-09-05 11:59:09.000000000 +0100
--- ./src/backend/utils/adt/tsquery.c	2007-09-07 09:35:18.000000000 +0100
***************
*** 21,27 ****
  #include "tsearch/ts_utils.h"
  #include "utils/memutils.h"
  #include "utils/pg_crc.h"
- #include "nodes/bitmapset.h"
  
  
  struct TSQueryParserStateData
--- 21,26 ----
***************
*** 384,399 ****
  	}
  }
  
- /*
-  * Fills in the left-fields previously left unfilled. The input
-  * QueryItems must be in polish (prefix) notation. 
-  */
  static void
! findoprnd(QueryItem *ptr, uint32 *pos)
  {
  	/* since this function recurses, it could be driven to stack overflow. */
  	check_stack_depth();
  
  	if (ptr[*pos].type == QI_VAL ||
  		ptr[*pos].type == QI_VALSTOP) /* need to handle VALSTOP here,
  									   * they haven't been cleansed
--- 383,397 ----
  	}
  }
  
  static void
! findoprnd_recurse(QueryItem *ptr, uint32 *pos, int nnodes)
  {
  	/* since this function recurses, it could be driven to stack overflow. */
  	check_stack_depth();
  
+ 	if (*pos >= nnodes)
+ 		elog(ERROR, "malformed tsquery; operand not found");
+ 
  	if (ptr[*pos].type == QI_VAL ||
  		ptr[*pos].type == QI_VALSTOP) /* need to handle VALSTOP here,
  									   * they haven't been cleansed
***************
*** 410,416 ****
  		{
  			ptr[*pos].operator.left = 1;
  			(*pos)++;
! 			findoprnd(ptr, pos);
  		}
  		else
  		{
--- 408,414 ----
  		{
  			ptr[*pos].operator.left = 1;
  			(*pos)++;
! 			findoprnd_recurse(ptr, pos, nnodes);
  		}
  		else
  		{
***************
*** 420,432 ****
  			Assert(curitem->oper == OP_AND || curitem->oper == OP_OR);
  
  			(*pos)++;
! 			findoprnd(ptr, pos);
  			curitem->left = *pos - tmp;
! 			findoprnd(ptr, pos);
  		}
  	}
  }
  
  /*
   * Each value (operand) in the query is be passed to pushval. pushval can
   * transform the simple value to an arbitrarily complex expression using
--- 418,448 ----
  			Assert(curitem->oper == OP_AND || curitem->oper == OP_OR);
  
  			(*pos)++;
! 			findoprnd_recurse(ptr, pos, nnodes);
  			curitem->left = *pos - tmp;
! 			findoprnd_recurse(ptr, pos, nnodes);
  		}
  	}
  }
  
+ 
+ /*
+  * Fills in the left-fields previously left unfilled. The input
+  * QueryItems must be in polish (prefix) notation. 
+  */
+ static void
+ findoprnd(QueryItem *ptr, int size)
+ {
+ 	uint32 pos;
+ 
+ 	pos = 0;
+ 	findoprnd_recurse(ptr, &pos, size);
+ 
+ 	if (pos != size)
+ 		elog(ERROR, "malformed tsquery; extra nodes");
+ }
+ 
+ 
  /*
   * Each value (operand) in the query is be passed to pushval. pushval can
   * transform the simple value to an arbitrarily complex expression using
***************
*** 452,458 ****
  	TSQuery		query;
  	int			commonlen;
  	QueryItem  *ptr;
- 	uint32		pos = 0;
  	ListCell   *cell;
  
  	/* init state */
--- 468,473 ----
***************
*** 522,529 ****
  	pfree(state.op);
  
  	/* Set left operand pointers for every operator. */
! 	pos = 0;
! 	findoprnd(ptr, &pos);
  
  	return query;
  }
--- 537,543 ----
  	pfree(state.op);
  
  	/* Set left operand pointers for every operator. */
! 	findoprnd(ptr, query->size);
  
  	return query;
  }
***************
*** 734,739 ****
--- 748,769 ----
  	PG_RETURN_CSTRING(nrm.buf);
  }
  
+ /*
+  * Binary Input / Output functions. The binary format is as follows:
+  *
+  * uint32	 number of operators/operands in the query
+  * 
+  * Followed by the operators and operands, in prefix notation. For each
+  * operand:
+  *
+  * uint8	type, QI_VAL
+  * uint8	weight
+  * 			operand text in client encoding, null-terminated
+  *
+  * For each operator:
+  * uint8	type, QI_OPR
+  * uint8	operator, one of OP_AND, OP_OR, OP_NOT.
+  */
  Datum
  tsquerysend(PG_FUNCTION_ARGS)
  {
***************
*** 744,750 ****
  
  	pq_begintypsend(&buf);
  
! 	pq_sendint(&buf, query->size, sizeof(int32));
  	for (i = 0; i < query->size; i++)
  	{
  		pq_sendint(&buf, item->type, sizeof(item->type));
--- 774,780 ----
  
  	pq_begintypsend(&buf);
  
! 	pq_sendint(&buf, query->size, sizeof(uint32));
  	for (i = 0; i < query->size; i++)
  	{
  		pq_sendint(&buf, item->type, sizeof(item->type));
***************
*** 752,767 ****
  		switch(item->type)
  		{
  			case QI_VAL:
! 				pq_sendint(&buf, item->operand.weight, sizeof(item->operand.weight));
! 				pq_sendint(&buf, item->operand.valcrc, sizeof(item->operand.valcrc));
! 				pq_sendint(&buf, item->operand.length, sizeof(int16));
  				/* istrue flag is just for temporary use in tsrank.c/Cover,
  				 * so we don't need to transfer that */
  				break;
  			case QI_OPR:
  				pq_sendint(&buf, item->operator.oper, sizeof(item->operator.oper));
- 				if (item->operator.oper != OP_NOT)
- 					pq_sendint(&buf, item->operator.left, sizeof(item->operator.left));
  				break;
  			default:
  				elog(ERROR, "unknown tsquery node type %d", item->type);
--- 782,794 ----
  		switch(item->type)
  		{
  			case QI_VAL:
! 				pq_sendint(&buf, item->operand.weight, sizeof(uint8));
! 				pq_sendstring(&buf, GETOPERAND(query) + item->operand.distance);
  				/* istrue flag is just for temporary use in tsrank.c/Cover,
  				 * so we don't need to transfer that */
  				break;
  			case QI_OPR:
  				pq_sendint(&buf, item->operator.oper, sizeof(item->operator.oper));
  				break;
  			default:
  				elog(ERROR, "unknown tsquery node type %d", item->type);
***************
*** 769,782 ****
  		item++;
  	}
  
- 	item = GETQUERY(query);
- 	for (i = 0; i < query->size; i++)
- 	{
- 		if (item->type == QI_VAL)
- 			pq_sendbytes(&buf, GETOPERAND(query) + item->operand.distance, item->operand.length);
- 		item++;
- 	}
- 
  	PG_FREE_IF_COPY(query, 0);
  
  	PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
--- 796,801 ----
***************
*** 788,924 ****
  	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
  	TSQuery		query;
  	int			i,
- 				size,
  				len;
  	QueryItem  *item;
! 	int			datalen = 0;
  	char	   *ptr;
! 	Bitmapset  *parentset = NULL;
  
  	size = pq_getmsgint(buf, sizeof(uint32));
! 	if (size < 0 || size > (MaxAllocSize / sizeof(QueryItem)))
  		elog(ERROR, "invalid size of tsquery");
  
! 	len = HDRSIZETQ + sizeof(QueryItem) * size;
  
! 	query = (TSQuery) palloc(len);
  	query->size = size;
  	item = GETQUERY(query);
  
  	for (i = 0; i < size; i++)
  	{
  		item->type = (int8) pq_getmsgint(buf, sizeof(int8));
  
! 		switch(item->type)
  		{
! 			case QI_VAL:
! 				item->operand.weight = (int8) pq_getmsgint(buf, sizeof(int8));
! 				item->operand.valcrc = (int32) pq_getmsgint(buf, sizeof(int32));
! 				item->operand.length = pq_getmsgint(buf, sizeof(int16));
! 
! 				/* Check that the weight bitmap is valid */
! 				if (item->operand.weight < 0 || item->operand.weight > 0xF)
! 					elog(ERROR, "invalid weight bitmap");
! 
! 				/* XXX: We don't check that the CRC is valid. Actually, if we
! 				 * bothered to calculate it to verify, there would be no need
! 				 * to transfer it.
! 				 */
! 
! 				/*
! 				 * Check that datalen doesn't grow too large. Without the
! 				 * check, a malicious client could induce a buffer overflow
! 				 * by sending a tsquery whose size exceeds 2GB. datalen
! 				 * would overflow, we would allocate a too small buffer below,
! 				 * and overflow the buffer. Because operand.length is a 20-bit
! 				 * field, adding one such value to datalen must exceed
! 				 * MaxAllocSize before wrapping over the 32-bit datalen field,
! 				 * so this check will protect from it.
! 				 */
! 				if (datalen > MAXSTRLEN)
! 					elog(ERROR, "invalid tsquery; total operand length exceeded");
! 
! 				/* We can calculate distance from datalen, no need to send it
! 				 * across the wire. If we did, we would have to check that
! 				 * it's valid anyway.
! 				 */
! 				item->operand.distance = datalen;
  
! 				datalen += item->operand.length + 1;		/* \0 */
! 
! 				break;
! 			case QI_OPR:
! 				item->operator.oper = (int8) pq_getmsgint(buf, sizeof(int8));
! 				if (item->operator.oper != OP_NOT &&
! 					item->operator.oper != OP_OR &&
! 					item->operator.oper != OP_AND)
! 					elog(ERROR, "unknown operator type %d", (int) item->operator.oper);
! 
! 				/*
! 				 * Check that no previous operator node points to the right
! 				 * operand. That would mean that the operand node
! 				 * has two parents.
! 				 */
! 				if (bms_is_member(i + 1, parentset))
! 					elog(ERROR, "malformed query tree");
! 
! 				parentset = bms_add_member(parentset, i + 1);
! 
! 				if(item->operator.oper != OP_NOT)
! 				{
! 					uint32 left = (uint32) pq_getmsgint(buf, sizeof(uint32));
! 
! 					/*
! 					 * Right operand is implicitly at "this+1". Don't allow
! 					 * left to point to the right operand, or to self.
! 					 */
! 					if (left <= 1 || i + left >= size)
! 						elog(ERROR, "invalid pointer to left operand");
! 
! 					/*
! 					 * Check that no previous operator node points to the left
! 					 * operand.
! 					 */
! 					if (bms_is_member(i + left, parentset))
! 						elog(ERROR, "malformed query tree");
! 
! 					parentset = bms_add_member(parentset, i + left);
! 
! 					item->operator.left = left;
! 				}
! 
! 				if (i == size - 1)
! 					elog(ERROR, "invalid pointer to right operand");
! 				break;
! 			default:
! 				elog(ERROR, "unknown tsquery node type %d", item->type);
  		}
  
  		item++;
  	}
  
! 	/* Now check that each node, except the root, has a parent. We
! 	 * already checked above that no node has more than one parent. */
! 	if (bms_num_members(parentset) != size - 1 && size != 0)
! 		elog(ERROR, "malformed query tree");
! 
  	query = (TSQuery) repalloc(query, len + datalen);
- 
  	item = GETQUERY(query);
  	ptr = GETOPERAND(query);
  	for (i = 0; i < size; i++)
  	{
  		if (item->type == QI_VAL)
  		{
! 			memcpy(ptr,
! 				   pq_getmsgbytes(buf, item->operand.length),
! 				   item->operand.length);
! 			ptr += item->operand.length;
! 			*ptr++ = '\0';
  		}
  		item++;
  	}
  
  	Assert(ptr - GETOPERAND(query) == datalen);
  
  	SET_VARSIZE(query, len + datalen);
--- 807,919 ----
  	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
  	TSQuery		query;
  	int			i,
  				len;
  	QueryItem  *item;
! 	int			datalen;
  	char	   *ptr;
! 	uint32		size;
! 	const char **operands;
  
  	size = pq_getmsgint(buf, sizeof(uint32));
! 	if (size > (MaxAllocSize / sizeof(QueryItem)))
  		elog(ERROR, "invalid size of tsquery");
  
! 	/* Allocate space to temporarily hold operand strings */
! 	operands = palloc(size * sizeof(char *));
  
! 	/* Allocate space for all the QueryItems. */
! 	len = HDRSIZETQ + sizeof(QueryItem) * size;
! 	query = (TSQuery) palloc0(len);
  	query->size = size;
  	item = GETQUERY(query);
  
+ 	datalen = 0;
  	for (i = 0; i < size; i++)
  	{
  		item->type = (int8) pq_getmsgint(buf, sizeof(int8));
  
! 		if (item->type == QI_VAL)
  		{
! 			size_t val_len; /* length after recoding to server encoding */
! 			uint8 weight;
! 			const char *val;
! 			pg_crc32 valcrc;
! 
! 			weight 	 = (uint8) pq_getmsgint(buf, sizeof(uint8));
! 			val = pq_getmsgstring(buf);
! 			val_len = strlen(val);
! 
! 			/* Sanity checks */
! 
! 			if (weight > 0xF)
! 				elog(ERROR, "invalid tsquery; invalid weight bitmap");
! 
! 			if (val_len > MAXSTRLEN)
! 				elog(ERROR, "invalid tsquery; operand too long");
! 				
! 			if (datalen > MAXSTRPOS)
! 				elog(ERROR, "invalid tsquery; total operand length exceeded");
! 
! 			/* Looks valid. */
! 
! 			INIT_CRC32(valcrc);
! 			COMP_CRC32(valcrc, val, val_len);
! 			FIN_CRC32(valcrc);
! 
! 			item->operand.weight = weight;
! 			item->operand.valcrc = (int32) valcrc;
! 			item->operand.length = val_len;
! 			item->operand.distance = datalen;
! 
! 			/* 
! 			 * Operand strings are copied to the final struct after this loop;
! 			 * here we just collect them to an array
! 			 */
! 			operands[i] = val;
! 
! 			datalen += val_len + 1;		/* + 1 for the '\0' terminator */
! 		} 
! 		else if (item->type == QI_OPR)
! 		{
! 			int8 oper;
! 			oper = (int8) pq_getmsgint(buf, sizeof(int8));
! 			if (oper != OP_NOT && oper != OP_OR && oper != OP_AND)
! 				elog(ERROR, "invalid tsquery; unknown operator type %d", (int) oper);
! 			if (i == size - 1)
! 				elog(ERROR, "invalid pointer to right operand");
  
! 			item->operator.oper = oper;
  		}
+ 		else 
+ 			elog(ERROR, "unknown tsquery node type %d", item->type);
  
  		item++;
  	}
  
! 	/* Enlarge buffer to make room for the operand values. */
  	query = (TSQuery) repalloc(query, len + datalen);
  	item = GETQUERY(query);
  	ptr = GETOPERAND(query);
+ 
+ 	/* 
+ 	 * Fill in the left-pointers. Checks that the tree is well-formed
+ 	 * as a side-effect.
+ 	 */
+ 	findoprnd(item, size);
+ 
+ 	/* Copy operands to output struct */
  	for (i = 0; i < size; i++)
  	{
  		if (item->type == QI_VAL)
  		{
! 			memcpy(ptr, operands[i], item->operand.length + 1);
! 			ptr += item->operand.length + 1;
  		}
  		item++;
  	}
  
+ 	pfree(operands);
+ 
  	Assert(ptr - GETOPERAND(query) == datalen);
  
  	SET_VARSIZE(query, len + datalen);
*** ../pgsql.tsearch-2/src/backend/utils/adt/tsquery_cleanup.c	2007-09-05 12:14:43.000000000 +0100
--- ./src/backend/utils/adt/tsquery_cleanup.c	2007-09-07 09:35:48.000000000 +0100
***************
*** 17,22 ****
--- 17,23 ----
  
  #include "tsearch/ts_type.h"
  #include "tsearch/ts_utils.h"
+ #include "miscadmin.h"
  
  typedef struct NODE
  {
diff -r -c -x '*.o' -x '*.Po' -x config.log -x '*.so' -x CVS -x gram.c ../pgsql.tsearch-2/src/backend/utils/adt/tsquery_rewrite.c ./src/backend/utils/adt/tsquery_rewrite.c
*** ../pgsql.tsearch-2/src/backend/utils/adt/tsquery_rewrite.c	2007-09-05 12:18:46.000000000 +0100
--- ./src/backend/utils/adt/tsquery_rewrite.c	2007-09-06 23:25:00.000000000 +0100
***************
*** 17,22 ****
--- 17,23 ----
  #include "executor/spi.h"
  #include "tsearch/ts_type.h"
  #include "tsearch/ts_utils.h"
+ #include "miscadmin.h"
  
  
  static int
*** ../pgsql.tsearch-2/src/backend/utils/adt/tsquery_util.c	2007-09-05 12:21:29.000000000 +0100
--- ./src/backend/utils/adt/tsquery_util.c	2007-09-06 23:25:14.000000000 +0100
***************
*** 16,21 ****
--- 16,22 ----
  
  #include "tsearch/ts_type.h"
  #include "tsearch/ts_utils.h"
+ #include "miscadmin.h"
  
  QTNode *
  QT2QTN(QueryItem * in, char *operand)
*** ../pgsql.tsearch-2/src/backend/utils/adt/tsrank.c	2007-09-05 12:24:27.000000000 +0100
--- ./src/backend/utils/adt/tsrank.c	2007-09-06 23:56:29.000000000 +0100
***************
*** 18,23 ****
--- 18,24 ----
  #include "tsearch/ts_type.h"
  #include "tsearch/ts_utils.h"
  #include "utils/array.h"
+ #include "miscadmin.h"
  
  
  static float weights[] = {0.1, 0.2, 0.4, 1.0};
***************
*** 176,183 ****
  	return res;
  }
  
  static WordEntryPos POSNULL[] = {
! 	0,
  	0
  };
  
--- 177,185 ----
  	return res;
  }
  
+ /* A dummy WordEntryPos array to use when haspos is false */
  static WordEntryPos POSNULL[] = {
! 	1, /* Number of elements that follow */
  	0
  };
  
***************
*** 207,213 ****
  	}
  	pos = (uint16 **) palloc(sizeof(uint16 *) * q->size);
  	memset(pos, 0, sizeof(uint16 *) * q->size);
- 	*(uint16 *) POSNULL = lengthof(POSNULL) - 1;
  	WEP_SETPOS(POSNULL[1], MAXENTRYPOS - 1);
  
  	for (i = 0; i < size; i++)
--- 209,214 ----
***************
*** 265,271 ****
  	QueryOperand **item;
  	int			size = q->size;
  
- 	*(uint16 *) POSNULL = lengthof(POSNULL) - 1;
  	item = SortAndUniqItems(q, &size);
  
  	for (i = 0; i < size; i++)
--- 266,271 ----
***************
*** 593,599 ****
  	DocRepresentation *doc;
  	char	   *operand;
  
- 	*(uint16 *) POSNULL = lengthof(POSNULL) - 1;
  	doc = (DocRepresentation *) palloc(sizeof(DocRepresentation) * len);
  	operand = GETOPERAND(query);
  	reset_istrue_flag(query);
--- 593,598 ----
*** ../pgsql.tsearch-2/src/backend/utils/adt/tsvector.c	2007-09-03 11:05:31.000000000 +0100
--- ./src/backend/utils/adt/tsvector.c	2007-09-07 09:47:46.000000000 +0100
***************
*** 75,92 ****
  }
  
  static int
! compareentry(const void *a, const void *b, void *arg)
  {
  	char	   *BufferStr = (char *) arg;
  
! 	if (((WordEntryIN *) a)->entry.len == ((WordEntryIN *) b)->entry.len)
  	{
! 		return strncmp(&BufferStr[((WordEntryIN *) a)->entry.pos],
! 					   &BufferStr[((WordEntryIN *) b)->entry.pos],
! 					   ((WordEntryIN *) a)->entry.len);
  	}
  
! 	return (((WordEntryIN *) a)->entry.len > ((WordEntryIN *) b)->entry.len) ? 1 : -1;
  }
  
  static int
--- 75,94 ----
  }
  
  static int
! compareentry(const void *va, const void *vb, void *arg)
  {
  	char	   *BufferStr = (char *) arg;
+ 	WordEntryIN *a = (WordEntryIN *) va;
+ 	WordEntryIN *b = (WordEntryIN *) vb;
  
! 	if (a->entry.len == b->entry.len)
  	{
! 		return strncmp(&BufferStr[a->entry.pos],
! 					   &BufferStr[b->entry.pos],
! 					   a->entry.len);
  	}
  
! 	return (a->entry.len > b->entry.len) ? 1 : -1;
  }
  
  static int
***************
*** 104,109 ****
--- 106,114 ----
  			a->poslen = uniquePos(a->pos, a->poslen);
  			*outbuflen = SHORTALIGN(a->entry.len) + (a->poslen + 1) * sizeof(WordEntryPos);
  		}
+ 		else
+ 			*outbuflen = a->entry.len;
+ 
  		return l;
  	}
  	res = a;
***************
*** 118,127 ****
  		{
  			if (res->entry.haspos)
  			{
  				res->poslen = uniquePos(res->pos, res->poslen);
  				*outbuflen += res->poslen * sizeof(WordEntryPos);
  			}
! 			*outbuflen += SHORTALIGN(res->entry.len);
  			res++;
  			memcpy(res, ptr, sizeof(WordEntryIN));
  		}
--- 123,134 ----
  		{
  			if (res->entry.haspos)
  			{
+ 				*outbuflen += SHORTALIGN(res->entry.len);
  				res->poslen = uniquePos(res->pos, res->poslen);
  				*outbuflen += res->poslen * sizeof(WordEntryPos);
  			}
! 			else
! 				*outbuflen += res->entry.len;
  			res++;
  			memcpy(res, ptr, sizeof(WordEntryIN));
  		}
***************
*** 147,158 ****
  		}
  		ptr++;
  	}
  	if (res->entry.haspos)
  	{
  		res->poslen = uniquePos(res->pos, res->poslen);
  		*outbuflen += res->poslen * sizeof(WordEntryPos);
  	}
! 	*outbuflen += SHORTALIGN(res->entry.len);
  
  	return res + 1 - a;
  }
--- 154,171 ----
  		}
  		ptr++;
  	}
+ 
+ 	/* add last item */
+ 
  	if (res->entry.haspos)
  	{
+ 		*outbuflen += SHORTALIGN(res->entry.len);
+ 
  		res->poslen = uniquePos(res->pos, res->poslen);
  		*outbuflen += res->poslen * sizeof(WordEntryPos);
  	}
! 	else
! 		*outbuflen += res->entry.len;
  
  	return res + 1 - a;
  }
***************
*** 367,372 ****
--- 380,397 ----
  	PG_RETURN_CSTRING(outbuf);
  }
  
+ /*
+  * Binary Input / Output functions. The binary format is as follows:
+  *
+  * uint32	number of lexemes
+  * 
+  * for each lexeme:
+  *		lexeme text in client encoding, null-terminated
+  * 		uint16	number of positions
+  * 		for each position:
+  *			uint16 WordEntryPos
+  */
+ 
  Datum
  tsvectorsend(PG_FUNCTION_ARGS)
  {
***************
*** 381,398 ****
  	pq_sendint(&buf, vec->size, sizeof(int32));
  	for (i = 0; i < vec->size; i++)
  	{
! 		/*
! 		 * We are sure that sizeof(WordEntry) == sizeof(int32)
  		 */
! 		pq_sendint(&buf, *(int32 *) weptr, sizeof(int32));
  
! 		pq_sendbytes(&buf, STRPTR(vec) + weptr->pos, weptr->len);
! 		if (weptr->haspos)
  		{
  			WordEntryPos *wepptr = POSDATAPTR(vec, weptr);
  
! 			pq_sendint(&buf, POSDATALEN(vec, weptr), sizeof(WordEntryPos));
! 			for (j = 0; j < POSDATALEN(vec, weptr); j++)
  				pq_sendint(&buf, wepptr[j], sizeof(WordEntryPos));
  		}
  		weptr++;
--- 406,427 ----
  	pq_sendint(&buf, vec->size, sizeof(int32));
  	for (i = 0; i < vec->size; i++)
  	{
! 		uint16 npos;
! 
! 		/* the strings in the TSVector array are not null-terminated, so 
! 		 * we have to send the null-terminator separately
  		 */
! 		pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len);
! 		pq_sendbyte(&buf, '\0');
! 
! 		npos = POSDATALEN(vec, weptr);
! 		pq_sendint(&buf, npos, sizeof(uint16));
  
! 		if(npos > 0)
  		{
  			WordEntryPos *wepptr = POSDATAPTR(vec, weptr);
  
! 			for (j = 0; j < npos; j++)
  				pq_sendint(&buf, wepptr[j], sizeof(WordEntryPos));
  		}
  		weptr++;
***************
*** 407,477 ****
  	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
  	TSVector	vec;
  	int			i;
! 	uint32		size;
! 	WordEntry  *weptr;
! 	int			datalen = 0;
! 	Size		len;
  
! 	size = pq_getmsgint(buf, sizeof(uint32));
! 	if (size < 0 || size > (MaxAllocSize / sizeof(WordEntry)))
  		elog(ERROR, "invalid size of tsvector");
  
! 	len = DATAHDRSIZE + sizeof(WordEntry) * size;
  
! 	len = len * 2; /* times two to make room for lexemes */
  	vec = (TSVector) palloc0(len);
! 	vec->size = size;
  
! 	weptr = ARRPTR(vec);
! 	for (i = 0; i < size; i++)
  	{
! 		int32 tmp;
  
! 		weptr = ARRPTR(vec) + i;
  
  		/*
! 		 * We are sure that sizeof(WordEntry) == sizeof(int32)
  		 */
! 		tmp = pq_getmsgint(buf, sizeof(int32));
! 		*weptr = *(WordEntry *) & tmp;
! 
! 		while (CALCDATASIZE(size, datalen + SHORTALIGN(weptr->len)) >= len)
  		{
  			len *= 2;
  			vec = (TSVector) repalloc(vec, len);
- 			weptr = ARRPTR(vec) + i;
  		}
  
! 		memcpy(STRPTR(vec) + weptr->pos,
! 			   pq_getmsgbytes(buf, weptr->len),
! 			   weptr->len);
! 		datalen += SHORTALIGN(weptr->len);
  
! 		if (i > 0 && WordEntryCMP(weptr, weptr - 1, STRPTR(vec)) <= 0)
  			elog(ERROR, "lexemes are unordered");
  
! 		if (weptr->haspos)
  		{
! 			uint16		j,
! 						npos;
  			WordEntryPos *wepptr;
  
! 			npos = (uint16) pq_getmsgint(buf, sizeof(uint16));
! 			if (npos > MAXNUMPOS)
! 				elog(ERROR, "unexpected number of positions");
! 
! 			while (CALCDATASIZE(size, datalen + (npos + 1) * sizeof(WordEntryPos)) >= len)
  			{
! 				len *= 2;
! 				vec = (TSVector) repalloc(vec, len);
! 				weptr = ARRPTR(vec) + i;
  			}
  
! 			memcpy(_POSDATAPTR(vec, weptr), &npos, sizeof(int16));
! 			wepptr = POSDATAPTR(vec, weptr);
  			for (j = 0; j < npos; j++)
  			{
! 				wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(int16));
  				if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1]))
  					elog(ERROR, "position information is unordered");
  			}
--- 436,527 ----
  	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
  	TSVector	vec;
  	int			i;
! 	int32		nentries;
! 	int			datalen; /* number of bytes used in the variable size area
! 						  * after fixed size TSVector header and WordEntries
! 						  */
! 	Size		hdrlen;
! 	Size		len;  /* allocated size of vec */
  
! 	nentries = pq_getmsgint(buf, sizeof(int32));
! 	if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry)))
  		elog(ERROR, "invalid size of tsvector");
  
! 	hdrlen = DATAHDRSIZE + sizeof(WordEntry) * nentries;
  
! 	len = hdrlen * 2; /* times two to make room for lexemes */
  	vec = (TSVector) palloc0(len);
! 	vec->size = nentries;
  
! 	datalen = 0;
! 	for (i = 0; i < nentries; i++)
  	{
! 		const char *lexeme;
! 		uint16 npos;
! 		size_t lex_len;
! 
! 		lexeme = pq_getmsgstring(buf);
! 		npos = (uint16) pq_getmsgint(buf, sizeof(uint16));
! 
! 		/* sanity checks */
  
! 		lex_len = strlen(lexeme);
! 		if (lex_len < 0 || lex_len > MAXSTRLEN)
! 			elog(ERROR, "invalid tsvector; lexeme too long");
! 
! 		if (datalen > MAXSTRPOS)
! 			elog(ERROR, "invalid tsvector; maximum total lexeme length exceeded"); 
! 
! 		if (npos > MAXNUMPOS)
! 			elog(ERROR, "unexpected number of positions");
  
  		/*
! 		 * Looks valid. Fill the WordEntry struct, and copy lexeme.
! 		 *
! 		 * But make sure the buffer is large enough first.
  		 */
! 		while (hdrlen + SHORTALIGN(datalen + lex_len) +
! 			   (npos + 1) * sizeof(WordEntryPos) >= len)
  		{
  			len *= 2;
  			vec = (TSVector) repalloc(vec, len);
  		}
  
! 		vec->entries[i].haspos = (npos > 0) ? 1 : 0;
! 		vec->entries[i].len = lex_len;
! 		vec->entries[i].pos = datalen;
  
! 		memcpy(STRPTR(vec) + datalen, lexeme, lex_len);
! 
! 		datalen += lex_len;
! 
! 		if (i > 0 && WordEntryCMP(&vec->entries[i], &vec->entries[i - 1], STRPTR(vec)) <= 0)
  			elog(ERROR, "lexemes are unordered");
  
! 		/* Receive positions */
! 
! 		if (npos > 0)
  		{
! 			uint16		j;
  			WordEntryPos *wepptr;
  
! 			/*
! 			 * Pad to 2-byte alignment if necessary. Though we used palloc0
! 			 * for the initial allocation, subsequent repalloc'd memory
! 			 * areas are not initialized to zero.
! 			 */
! 			if (datalen != SHORTALIGN(datalen))
  			{
! 				*(STRPTR(vec) + datalen) = '\0';
! 				datalen = SHORTALIGN(datalen);
  			}
  
! 			memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16));
! 
! 			wepptr = POSDATAPTR(vec, &vec->entries[i]);
  			for (j = 0; j < npos; j++)
  			{
! 				wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos));
  				if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1]))
  					elog(ERROR, "position information is unordered");
  			}
***************
*** 480,486 ****
  		}
  	}
  
! 	SET_VARSIZE(vec, CALCDATASIZE(vec->size, datalen));
  
  	PG_RETURN_TSVECTOR(vec);
  }
--- 530,536 ----
  		}
  	}
  
! 	SET_VARSIZE(vec, hdrlen + datalen);
  
  	PG_RETURN_TSVECTOR(vec);
  }
*** ../pgsql.tsearch-2/src/backend/utils/adt/tsvector_op.c	2007-08-31 21:20:00.000000000 +0100
--- ./src/backend/utils/adt/tsvector_op.c	2007-09-06 23:47:32.000000000 +0100
***************
*** 165,171 ****
  	char	   *cur;
  
  	for (i = 0; i < in->size; i++)
! 		len += SHORTALIGN(arrin[i].len);
  
  	len = CALCDATASIZE(in->size, len);
  	out = (TSVector) palloc0(len);
--- 165,171 ----
  	char	   *cur;
  
  	for (i = 0; i < in->size; i++)
! 		len += arrin[i].len;
  
  	len = CALCDATASIZE(in->size, len);
  	out = (TSVector) palloc0(len);
***************
*** 179,185 ****
  		arrout[i].haspos = 0;
  		arrout[i].len = arrin[i].len;
  		arrout[i].pos = cur - STRPTR(out);
! 		cur += SHORTALIGN(arrout[i].len);
  	}
  
  	PG_FREE_IF_COPY(in, 0);
--- 179,185 ----
  		arrout[i].haspos = 0;
  		arrout[i].len = arrin[i].len;
  		arrout[i].pos = cur - STRPTR(out);
! 		cur += arrout[i].len;
  	}
  
  	PG_FREE_IF_COPY(in, 0);
***************
*** 351,362 ****
  			ptr->len = ptr1->len;
  			memcpy(cur, data1 + ptr1->pos, ptr1->len);
  			ptr->pos = cur - data;
- 			cur += SHORTALIGN(ptr1->len);
  			if (ptr->haspos)
  			{
  				memcpy(cur, _POSDATAPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
  				cur += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
  			}
  			ptr++;
  			ptr1++;
  			i1--;
--- 351,365 ----
  			ptr->len = ptr1->len;
  			memcpy(cur, data1 + ptr1->pos, ptr1->len);
  			ptr->pos = cur - data;
  			if (ptr->haspos)
  			{
+ 				cur += SHORTALIGN(ptr1->len);
  				memcpy(cur, _POSDATAPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
  				cur += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
  			}
+ 			else
+ 				cur += ptr1->len;
+ 				
  			ptr++;
  			ptr1++;
  			i1--;
***************
*** 367,382 ****
  			ptr->len = ptr2->len;
  			memcpy(cur, data2 + ptr2->pos, ptr2->len);
  			ptr->pos = cur - data;
- 			cur += SHORTALIGN(ptr2->len);
  			if (ptr->haspos)
  			{
  				int			addlen = add_pos(in2, ptr2, out, ptr, maxpos);
  
  				if (addlen == 0)
  					ptr->haspos = 0;
  				else
  					cur += addlen * sizeof(WordEntryPos) + sizeof(uint16);
  			}
  			ptr++;
  			ptr2++;
  			i2--;
--- 370,389 ----
  			ptr->len = ptr2->len;
  			memcpy(cur, data2 + ptr2->pos, ptr2->len);
  			ptr->pos = cur - data;
  			if (ptr->haspos)
  			{
  				int			addlen = add_pos(in2, ptr2, out, ptr, maxpos);
  
+ 				cur += SHORTALIGN(ptr2->len);
+ 
  				if (addlen == 0)
  					ptr->haspos = 0;
  				else
  					cur += addlen * sizeof(WordEntryPos) + sizeof(uint16);
  			}
+ 			else
+ 				cur += ptr2->len;
+ 
  			ptr++;
  			ptr2++;
  			i2--;
***************
*** 387,395 ****
  			ptr->len = ptr1->len;
  			memcpy(cur, data1 + ptr1->pos, ptr1->len);
  			ptr->pos = cur - data;
- 			cur += SHORTALIGN(ptr1->len);
  			if (ptr->haspos)
  			{
  				if (ptr1->haspos)
  				{
  					memcpy(cur, _POSDATAPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
--- 394,402 ----
  			ptr->len = ptr1->len;
  			memcpy(cur, data1 + ptr1->pos, ptr1->len);
  			ptr->pos = cur - data;
  			if (ptr->haspos)
  			{
+ 				cur += SHORTALIGN(ptr1->len);
  				if (ptr1->haspos)
  				{
  					memcpy(cur, _POSDATAPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
***************
*** 407,412 ****
--- 414,422 ----
  						cur += addlen * sizeof(WordEntryPos) + sizeof(uint16);
  				}
  			}
+ 			else
+ 				cur += ptr1->len;
+ 
  			ptr++;
  			ptr1++;
  			ptr2++;
***************
*** 421,432 ****
  		ptr->len = ptr1->len;
  		memcpy(cur, data1 + ptr1->pos, ptr1->len);
  		ptr->pos = cur - data;
- 		cur += SHORTALIGN(ptr1->len);
  		if (ptr->haspos)
  		{
  			memcpy(cur, _POSDATAPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
  			cur += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
  		}
  		ptr++;
  		ptr1++;
  		i1--;
--- 431,445 ----
  		ptr->len = ptr1->len;
  		memcpy(cur, data1 + ptr1->pos, ptr1->len);
  		ptr->pos = cur - data;
  		if (ptr->haspos)
  		{
+ 			cur += SHORTALIGN(ptr1->len);
  			memcpy(cur, _POSDATAPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
  			cur += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
  		}
+ 		else
+ 			cur += ptr1->len;
+ 
  		ptr++;
  		ptr1++;
  		i1--;
***************
*** 438,453 ****
  		ptr->len = ptr2->len;
  		memcpy(cur, data2 + ptr2->pos, ptr2->len);
  		ptr->pos = cur - data;
- 		cur += SHORTALIGN(ptr2->len);
  		if (ptr->haspos)
  		{
  			int			addlen = add_pos(in2, ptr2, out, ptr, maxpos);
  
  			if (addlen == 0)
  				ptr->haspos = 0;
  			else
  				cur += addlen * sizeof(WordEntryPos) + sizeof(uint16);
  		}
  		ptr++;
  		ptr2++;
  		i2--;
--- 451,470 ----
  		ptr->len = ptr2->len;
  		memcpy(cur, data2 + ptr2->pos, ptr2->len);
  		ptr->pos = cur - data;
  		if (ptr->haspos)
  		{
  			int			addlen = add_pos(in2, ptr2, out, ptr, maxpos);
  
+ 			cur += SHORTALIGN(ptr2->len);
+ 
  			if (addlen == 0)
  				ptr->haspos = 0;
  			else
  				cur += addlen * sizeof(WordEntryPos) + sizeof(uint16);
  		}
+ 		else
+ 			cur += ptr2->len;
+ 
  		ptr++;
  		ptr2++;
  		i2--;
***************
*** 484,491 ****
  static bool
  checkclass_str(CHKVAL * chkval, WordEntry * val, QueryOperand * item)
  {
! 	WordEntryPos *ptr = (WordEntryPos *) (chkval->values + val->pos + SHORTALIGN(val->len) + sizeof(uint16));
! 	uint16		len = *((uint16 *) (chkval->values + val->pos + SHORTALIGN(val->len)));
  
  	while (len--)
  	{
--- 501,508 ----
  static bool
  checkclass_str(CHKVAL * chkval, WordEntry * val, QueryOperand * item)
  {
! 	WordEntryPos *ptr = (WordEntryPos *) (chkval->values + SHORTALIGN(val->pos + val->len) + sizeof(uint16));
! 	uint16		len = *((uint16 *) (chkval->values + SHORTALIGN(val->pos + val->len)));
  
  	while (len--)
  	{
*** ../pgsql.tsearch-2/src/include/tsearch/ts_type.h	2007-09-05 12:17:02.000000000 +0100
--- ./src/include/tsearch/ts_type.h	2007-09-07 09:20:43.000000000 +0100
***************
*** 62,87 ****
   *							bytes from end of WordEntry array to start of
   *							corresponding lexeme.
   * 4) Lexeme's storage:
!  *	  SHORTALIGNED(lexeme) and position information if it exists
!  *	  Position information: first int2 - is a number of positions and it
!  *	  follows array of WordEntryPos
   */
  
  typedef struct
  {
  	int32		vl_len_;		/* varlena header (do not touch directly!) */
! 	uint32		size;
! 	char		data[1];
  } TSVectorData;
  
  typedef TSVectorData *TSVector;
  
! #define DATAHDRSIZE (VARHDRSZ + sizeof(int4))
! #define CALCDATASIZE(x, lenstr) ( (x) * sizeof(WordEntry) + DATAHDRSIZE + (lenstr) )
! #define ARRPTR(x)	( (WordEntry*) ( (char*)(x) + DATAHDRSIZE ) )
! #define STRPTR(x)	( (char*)(x) + DATAHDRSIZE + ( sizeof(WordEntry) * ((TSVector)(x))->size ) )
! #define STRSIZE(x)	( ((TSVector)(x))->len - DATAHDRSIZE - ( sizeof(WordEntry) * ((TSVector)(x))->size ) )
! #define _POSDATAPTR(x,e)	(STRPTR(x)+((WordEntry*)(e))->pos+SHORTALIGN(((WordEntry*)(e))->len))
  #define POSDATALEN(x,e) ( ( ((WordEntry*)(e))->haspos ) ? (*(uint16*)_POSDATAPTR(x,e)) : 0 )
  #define POSDATAPTR(x,e) ( (WordEntryPos*)( _POSDATAPTR(x,e)+sizeof(uint16) ) )
  
--- 62,94 ----
   *							bytes from end of WordEntry array to start of
   *							corresponding lexeme.
   * 4) Lexeme's storage:
!  *	  lexeme (without null-terminator)
!  *    if haspos is true:
!  *		padding byte, if needed, so that the following uint16 is 2-byte aligned
!  *		uint16		number of positions that follow.
!  *		uint16[]	positions
!  *
!  * The positions must be sorted.
   */
  
  typedef struct
  {
  	int32		vl_len_;		/* varlena header (do not touch directly!) */
! 	int32		size;
! 	WordEntry	entries[1]; /* var size */
! 	/* lexemes follow */
  } TSVectorData;
  
  typedef TSVectorData *TSVector;
  
! #define DATAHDRSIZE (offsetof(TSVectorData, entries))
! #define CALCDATASIZE(x, lenstr) (DATAHDRSIZE + (x) * sizeof(WordEntry) + (lenstr) )
! #define ARRPTR(x)	( (x)->entries )
! 
! /* returns a pointer to the beginning of lexemes, just past the WordEntry array */
! #define STRPTR(x)	( (char *) &(x)->entries[(x)->size] )
! 
! #define _POSDATAPTR(x,e)	(STRPTR(x) + SHORTALIGN((e)->pos + (e)->len))
  #define POSDATALEN(x,e) ( ( ((WordEntry*)(e))->haspos ) ? (*(uint16*)_POSDATAPTR(x,e)) : 0 )
  #define POSDATAPTR(x,e) ( (WordEntryPos*)( _POSDATAPTR(x,e)+sizeof(uint16) ) )
  
***************
*** 166,172 ****
  	 * C: 1<<1
  	 * D: 1<<0
  	 */
! 	int8		weight;
  	int32	valcrc;				/* XXX: pg_crc32 would be a more appropriate data type, but we use comparisons to signed integers in the code. They would need to be changed as well. */
  
  	/* pointer to text value of operand, must correlate with WordEntry */
--- 173,179 ----
  	 * C: 1<<1
  	 * D: 1<<0
  	 */
! 	uint8		weight;
  	int32	valcrc;				/* XXX: pg_crc32 would be a more appropriate data type, but we use comparisons to signed integers in the code. They would need to be changed as well. */
  
  	/* pointer to text value of operand, must correlate with WordEntry */
---------------------------(end of broadcast)---------------------------
TIP 6: explain analyze is your friend

Reply via email to