[HACKERS] Remove 1MB size limit in tsvector

Ildus Kurbangaliev Tue, 01 Aug 2017 07:10:11 -0700

Hello, hackers!

Historically tsvector type can't hold more than 1MB data.
I want to propose a patch that removes that limit.


That limit is created by 'pos' field from WordEntry, which have only
20 bits for storage.

In the proposed patch I removed this field and instead of it I keep
offsets only at each Nth item in WordEntry's array. Now I set N as 4,
because it gave best results in my benchmarks. It can be increased in
the future without affecting already saved data in database. Also
removing the field improves compression of tsvectors.

I simplified the code by creating functions that can be used to
build tsvectors. There were duplicated code fragments in places where
tsvector was built.

Also new patch frees some space in WordEntry that can be used to
save some additional information about saved words.

- 
---
Ildus Kurbangaliev
Postgres Professional: http://www.postgrespro.com
Russian Postgres Company

diff --git a/src/backend/tsearch/Makefile b/src/backend/tsearch/Makefile
index 34fe4c5b3c..9585a25003 100644
--- a/src/backend/tsearch/Makefile
+++ b/src/backend/tsearch/Makefile
@@ -26,7 +26,7 @@ DICTFILES_PATH=$(addprefix dicts/,$(DICTFILES))
 OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \
 	dict_simple.o dict_synonym.o dict_thesaurus.o \
 	dict_ispell.o regis.o spell.o \
-	to_tsany.o ts_selfuncs.o ts_typanalyze.o ts_utils.o
+	to_tsany.o ts_selfuncs.o ts_typanalyze.o ts_utils.o ts_compat.o
 
 include $(top_srcdir)/src/backend/common.mk
 
diff --git a/src/backend/tsearch/to_tsany.c b/src/backend/tsearch/to_tsany.c
index 35d9ab276c..d66b69baea 100644
--- a/src/backend/tsearch/to_tsany.c
+++ b/src/backend/tsearch/to_tsany.c
@@ -156,13 +156,10 @@ TSVector
 make_tsvector(ParsedText *prs)
 {
 	int			i,
-				j,
 				lenstr = 0,
-				totallen;
+				totallen,
+				stroff = 0;
 	TSVector	in;
-	WordEntry  *ptr;
-	char	   *str;
-	int			stroff;
 
 	/* Merge duplicate words */
 	if (prs->curwords > 0)
@@ -171,12 +168,8 @@ make_tsvector(ParsedText *prs)
 	/* Determine space needed */
 	for (i = 0; i < prs->curwords; i++)
 	{
-		lenstr += prs->words[i].len;
-		if (prs->words[i].alen)
-		{
-			lenstr = SHORTALIGN(lenstr);
-			lenstr += sizeof(uint16) + prs->words[i].pos.apos[0] * sizeof(WordEntryPos);
-		}
+		int npos = prs->words[i].alen ? prs->words[i].pos.apos[0] : 0;
+		INCRSIZE(lenstr, i, prs->words[i].len, npos);
 	}
 
 	if (lenstr > MAXSTRPOS)
@@ -187,41 +180,21 @@ make_tsvector(ParsedText *prs)
 	totallen = CALCDATASIZE(prs->curwords, lenstr);
 	in = (TSVector) palloc0(totallen);
 	SET_VARSIZE(in, totallen);
-	in->size = prs->curwords;
+	TS_SETCOUNT(in, prs->curwords);
 
-	ptr = ARRPTR(in);
-	str = STRPTR(in);
-	stroff = 0;
 	for (i = 0; i < prs->curwords; i++)
 	{
-		ptr->len = prs->words[i].len;
-		ptr->pos = stroff;
-		memcpy(str + stroff, prs->words[i].word, prs->words[i].len);
-		stroff += prs->words[i].len;
-		pfree(prs->words[i].word);
+		int npos = 0;
 		if (prs->words[i].alen)
-		{
-			int			k = prs->words[i].pos.apos[0];
-			WordEntryPos *wptr;
+			npos = prs->words[i].pos.apos[0];
 
-			if (k > 0xFFFF)
-				elog(ERROR, "positions array too long");
+		tsvector_addlexeme(in, i, &stroff, prs->words[i].word, prs->words[i].len,
+			prs->words[i].pos.apos + 1, npos);
 
-			ptr->haspos = 1;
-			stroff = SHORTALIGN(stroff);
-			*(uint16 *) (str + stroff) = (uint16) k;
-			wptr = POSDATAPTR(in, ptr);
-			for (j = 0; j < k; j++)
-			{
-				WEP_SETWEIGHT(wptr[j], 0);
-				WEP_SETPOS(wptr[j], prs->words[i].pos.apos[j + 1]);
-			}
-			stroff += sizeof(uint16) + k * sizeof(WordEntryPos);
+		pfree(prs->words[i].word);
+		if (prs->words[i].alen)
 			pfree(prs->words[i].pos.apos);
-		}
-		else
-			ptr->haspos = 0;
-		ptr++;
+
 	}
 
 	if (prs->words)
@@ -251,7 +224,6 @@ to_tsvector_byid(PG_FUNCTION_ARGS)
 	PG_FREE_IF_COPY(in, 1);
 
 	out = make_tsvector(&prs);
-
 	PG_RETURN_TSVECTOR(out);
 }
 
diff --git a/src/backend/tsearch/ts_compat.c b/src/backend/tsearch/ts_compat.c
new file mode 100644
index 0000000000..bb2a62eaf7
--- /dev/null
+++ b/src/backend/tsearch/ts_compat.c
@@ -0,0 +1,83 @@
+#include "postgres.h"
+#include "tsearch/ts_type.h"
+
+/*
+ * Definition of old WordEntry struct in TSVector. Because of limitations
+ * in size (max 1MB for lexemes), the format has changed
+ */
+typedef struct
+{
+	uint32
+				haspos:1,
+				len:11,
+				pos:20;
+} OldWordEntry;
+
+typedef struct
+{
+	uint16		npos;
+	WordEntryPos pos[FLEXIBLE_ARRAY_MEMBER];
+} OldWordEntryPosVector;
+
+#define OLDSTRPTR(x)	( (char *) &(x)->entries[x->size_] )
+#define _OLDPOSVECPTR(x, e)	\
+	((OldWordEntryPosVector *)(STRPTR(x) + SHORTALIGN((e)->pos + (e)->len)))
+#define OLDPOSDATALEN(x,e) ( ( (e)->haspos ) ? (_OLDPOSVECPTR(x,e)->npos) : 0 )
+#define OLDPOSDATAPTR(x,e) (_OLDPOSVECPTR(x,e)->pos)
+
+/*
+ * Converts tsvector with the old structure to current.
+ * @orig - tsvector to convert,
+ * @copy - return copy of tsvector, it has a meaning when tsvector doensn't
+ * need to be converted.
+ */
+TSVector
+tsvector_upgrade(Datum orig, bool copy)
+{
+	int	       i,
+	           dataoff = 0,
+			   datalen = 0,
+			   totallen;
+	TSVector   in,
+			   out;
+
+	in = (TSVector) PG_DETOAST_DATUM(orig);
+
+	/* If already in new format, return as is */
+	if (in->size_ & TS_FLAG_STRETCHED)
+	{
+		TSVector out;
+
+		if (!copy)
+			return in;
+
+		out = (TSVector) palloc(VARSIZE(in));
+		memcpy(out, in, VARSIZE(in));
+		return out;
+	}
+
+	/*
+	 * Calculate required size.
+	 * We don't check any sizes here because old format was limited with 1MB
+	 */
+	for (i = 0; i < in->size_; i++)
+	{
+		OldWordEntry *entry = (OldWordEntry *) (in->entries + i);
+		INCRSIZE(datalen, i, entry->len, OLDPOSDATALEN(in, entry));
+	}
+
+	totallen = CALCDATASIZE(in->size_, datalen);
+	out = (TSVector) palloc0(totallen);
+	SET_VARSIZE(out, totallen);
+	TS_SETCOUNT(out, in->size_);
+
+	for (i = 0; i < in->size_; i++)
+	{
+		OldWordEntry *entry = (OldWordEntry *) (in->entries + i);
+		tsvector_addlexeme(out, i, &dataoff,
+				OLDSTRPTR(in) + entry->pos, entry->len,
+				OLDPOSDATAPTR(in, entry), OLDPOSDATALEN(in, entry));
+	}
+
+	return out;
+}
diff --git a/src/backend/tsearch/ts_typanalyze.c b/src/backend/tsearch/ts_typanalyze.c
index 320c7f1a61..9b2fc4be04 100644
--- a/src/backend/tsearch/ts_typanalyze.c
+++ b/src/backend/tsearch/ts_typanalyze.c
@@ -202,7 +202,8 @@ compute_tsvector_stats(VacAttrStats *stats,
 		TSVector	vector;
 		WordEntry  *curentryptr;
 		char	   *lexemesptr;
-		int			j;
+		int			j,
+					pos;
 
 		vacuum_delay_point();
 
@@ -236,7 +237,9 @@ compute_tsvector_stats(VacAttrStats *stats,
 		 */
 		lexemesptr = STRPTR(vector);
 		curentryptr = ARRPTR(vector);
-		for (j = 0; j < vector->size; j++)
+
+		INITPOS(pos);
+		for (j = 0; j < TS_COUNT(vector); j++)
 		{
 			bool		found;
 
@@ -246,8 +249,8 @@ compute_tsvector_stats(VacAttrStats *stats,
 			 * make a copy of it.  This way we can free the tsvector value
 			 * once we've processed all its lexemes.
 			 */
-			hash_key.lexeme = lexemesptr + curentryptr->pos;
-			hash_key.length = curentryptr->len;
+			hash_key.lexeme = lexemesptr + pos;
+			hash_key.length = ENTRY_LEN(vector, curentryptr);
 
 			/* Lookup current lexeme in hashtable, adding it if new */
 			item = (TrackItem *) hash_search(lexemes_tab,
@@ -280,7 +283,7 @@ compute_tsvector_stats(VacAttrStats *stats,
 			}
 
 			/* Advance to the next WordEntry in the tsvector */
-			curentryptr++;
+			INCRPTR(vector, curentryptr, pos);
 		}
 
 		/* If the vector was toasted, free the detoasted copy. */
diff --git a/src/backend/utils/adt/tsginidx.c b/src/backend/utils/adt/tsginidx.c
index 83a939dfd5..fc39dfcb3f 100644
--- a/src/backend/utils/adt/tsginidx.c
+++ b/src/backend/utils/adt/tsginidx.c
@@ -67,23 +67,26 @@ gin_extract_tsvector(PG_FUNCTION_ARGS)
 	TSVector	vector = PG_GETARG_TSVECTOR(0);
 	int32	   *nentries = (int32 *) PG_GETARG_POINTER(1);
 	Datum	   *entries = NULL;
+	int			tscount = TS_COUNT(vector);
 
-	*nentries = vector->size;
-	if (vector->size > 0)
+	*nentries = tscount;
+	if (tscount > 0)
 	{
 		int			i;
-		WordEntry  *we = ARRPTR(vector);
+		uint32		pos;
 
-		entries = (Datum *) palloc(sizeof(Datum) * vector->size);
+		WordEntry  *we = ARRPTR(vector);
+		entries = (Datum *) palloc(sizeof(Datum) * tscount);
 
-		for (i = 0; i < vector->size; i++)
+		INITPOS(pos);
+		for (i = 0; i < tscount; i++)
 		{
 			text	   *txt;
 
-			txt = cstring_to_text_with_len(STRPTR(vector) + we->pos, we->len);
+			txt = cstring_to_text_with_len(STRPTR(vector) + pos,
+				ENTRY_LEN(vector, we));
 			entries[i] = PointerGetDatum(txt);
-
-			we++;
+			INCRPTR(vector, we, pos);
 		}
 	}
 
diff --git a/src/backend/utils/adt/tsgistidx.c b/src/backend/utils/adt/tsgistidx.c
index 7ce2699b5c..18d3de3725 100644
--- a/src/backend/utils/adt/tsgistidx.c
+++ b/src/backend/utils/adt/tsgistidx.c
@@ -192,28 +192,33 @@ gtsvector_compress(PG_FUNCTION_ARGS)
 		int32	   *arr;
 		WordEntry  *ptr = ARRPTR(val);
 		char	   *words = STRPTR(val);
+		const int	tscount = TS_COUNT(val);
+		uint32		pos;
 
-		len = CALCGTSIZE(ARRKEY, val->size);
+		len = CALCGTSIZE(ARRKEY, tscount);
 		res = (SignTSVector *) palloc(len);
 		SET_VARSIZE(res, len);
 		res->flag = ARRKEY;
 		arr = GETARR(res);
-		len = val->size;
+		len = tscount;
+
+		INITPOS(pos);
 		while (len--)
 		{
 			pg_crc32	c;
 
 			INIT_LEGACY_CRC32(c);
-			COMP_LEGACY_CRC32(c, words + ptr->pos, ptr->len);
+			COMP_LEGACY_CRC32(c, words + pos, ENTRY_LEN(val, ptr));
 			FIN_LEGACY_CRC32(c);
 
 			*arr = *(int32 *) &c;
 			arr++;
-			ptr++;
+
+			INCRPTR(val, ptr, pos);
 		}
 
-		len = uniqueint(GETARR(res), val->size);
-		if (len != val->size)
+		len = uniqueint(GETARR(res), tscount);
+		if (len != tscount)
 		{
 			/*
 			 * there is a collision of hash-function; len is always less than
diff --git a/src/backend/utils/adt/tsrank.c b/src/backend/utils/adt/tsrank.c
index 4577bcc0b8..26252ca353 100644
--- a/src/backend/utils/adt/tsrank.c
+++ b/src/backend/utils/adt/tsrank.c
@@ -53,43 +53,38 @@ word_distance(int32 w)
 static int
 cnt_length(TSVector t)
 {
-	WordEntry  *ptr = ARRPTR(t),
-			   *end = (WordEntry *) STRPTR(t);
-	int			len = 0;
+	int		i,
+			len = 0;
 
-	while (ptr < end)
+	for (i = 0; i < TS_COUNT(t); i++)
 	{
-		int			clen = POSDATALEN(t, ptr);
-
-		if (clen == 0)
-			len += 1;
-		else
-			len += clen;
-
-		ptr++;
+		WordEntry *entry = UNWRAP_ENTRY(t, ARRPTR(t) + i);
+		Assert(!entry->hasoff);
+		len += (entry->npos == 0) ? 1 : entry->npos;
 	}
 
 	return len;
 }
 
 
-#define WordECompareQueryItem(e,q,p,i,m) \
-	tsCompareString((q) + (i)->distance, (i)->length,	\
-					(e) + (p)->pos, (p)->len, (m))
-
-
 /*
  * Returns a pointer to a WordEntry's array corresponding to 'item' from
  * tsvector 't'. 'q' is the TSQuery containing 'item'.
  * Returns NULL if not found.
  */
-static WordEntry *
+static int
 find_wordentry(TSVector t, TSQuery q, QueryOperand *item, int32 *nitem)
 {
-	WordEntry  *StopLow = ARRPTR(t);
-	WordEntry  *StopHigh = (WordEntry *) STRPTR(t);
-	WordEntry  *StopMiddle = StopHigh;
-	int			difference;
+#define WordECompareQueryItem(s,l,q,i,m) \
+	tsCompareString((q) + (i)->distance, (i)->length,	\
+					s, l, (m))
+
+	int StopLow = 0;
+	int StopHigh = TS_COUNT(t);
+	int StopMiddle = StopHigh;
+	int	difference;
+	char *lexeme;
+	WordEntry *we;
 
 	*nitem = 0;
 
@@ -97,7 +92,12 @@ find_wordentry(TSVector t, TSQuery q, QueryOperand *item, int32 *nitem)
 	while (StopLow < StopHigh)
 	{
 		StopMiddle = StopLow + (StopHigh - StopLow) / 2;
-		difference = WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item, false);
+		lexeme = tsvector_getlexeme(t, StopMiddle, &we);
+
+		Assert(!we->hasoff);
+		difference = WordECompareQueryItem(lexeme, we->len,
+			GETOPERAND(q), item, false);
+
 		if (difference == 0)
 		{
 			StopHigh = StopMiddle;
@@ -117,18 +117,22 @@ find_wordentry(TSVector t, TSQuery q, QueryOperand *item, int32 *nitem)
 
 		*nitem = 0;
 
-		while (StopMiddle < (WordEntry *) STRPTR(t) &&
-			   WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item, true) == 0)
+		while (StopMiddle < TS_COUNT(t))
 		{
+			lexeme = tsvector_getlexeme(t, StopMiddle, &we);
+
+			Assert(!we->hasoff);
+			if (WordECompareQueryItem(lexeme, we->len, GETOPERAND(q), item, true) != 0)
+				break;
+
 			(*nitem)++;
 			StopMiddle++;
 		}
 	}
 
-	return (*nitem > 0) ? StopHigh : NULL;
+	return (*nitem > 0) ? StopHigh : -1;
 }
 
-
 /*
  * sort QueryOperands by (length, word)
  */
@@ -200,15 +204,13 @@ SortAndUniqItems(TSQuery q, int *size)
 static float
 calc_rank_and(const float *w, TSVector t, TSQuery q)
 {
-	WordEntryPosVector **pos;
-	WordEntryPosVector1 posnull;
-	WordEntryPosVector *POSNULL;
+	WordEntryPos **pos;
+	uint16 *npos;
+	WordEntryPos posnull[1] = {0};
 	int			i,
 				k,
 				l,
 				p;
-	WordEntry  *entry,
-			   *firstentry;
 	WordEntryPos *post,
 			   *ct;
 	int32		dimt,
@@ -225,41 +227,55 @@ calc_rank_and(const float *w, TSVector t, TSQuery q)
 		pfree(item);
 		return calc_rank_or(w, t, q);
 	}
-	pos = (WordEntryPosVector **) palloc0(sizeof(WordEntryPosVector *) * q->size);
+	pos = (WordEntryPos **) palloc0(sizeof(WordEntryPos *) * q->size);
+	npos = (uint16 *) palloc0(sizeof(uint16) * q->size);
 
-	/* A dummy WordEntryPos array to use when haspos is false */
-	posnull.npos = 1;
-	posnull.pos[0] = 0;
-	WEP_SETPOS(posnull.pos[0], MAXENTRYPOS - 1);
-	POSNULL = (WordEntryPosVector *) &posnull;
+	/* posnull is a dummy WordEntryPos array to use when npos == 0 */
+	WEP_SETPOS(posnull[0], MAXENTRYPOS - 1);
 
 	for (i = 0; i < size; i++)
 	{
-		firstentry = entry = find_wordentry(t, q, item[i], &nitem);
-		if (!entry)
+		int   idx = find_wordentry(t, q, item[i], &nitem),
+			  firstidx;
+
+		if (idx == -1)
 			continue;
 
-		while (entry - firstentry < nitem)
+		firstidx = idx;
+
+		while (idx - firstidx < nitem)
 		{
-			if (entry->haspos)
-				pos[i] = _POSVECPTR(t, entry);
+			WordEntry *entry;
+
+			char *lexeme = tsvector_getlexeme(t, idx, &entry);
+
+			Assert(!entry->hasoff);
+			if (entry->npos)
+			{
+				pos[i] = POSDATAPTR(lexeme, entry->len);
+				npos[i] = entry->npos;
+			}
 			else
-				pos[i] = POSNULL;
+			{
+				pos[i] = posnull;
+				npos[i] = 1;
+			}
+
+			post = pos[i];
+			dimt = npos[i];
 
-			dimt = pos[i]->npos;
-			post = pos[i]->pos;
 			for (k = 0; k < i; k++)
 			{
 				if (!pos[k])
 					continue;
-				lenct = pos[k]->npos;
-				ct = pos[k]->pos;
+				lenct = npos[k];
+				ct = pos[k];
 				for (l = 0; l < dimt; l++)
 				{
 					for (p = 0; p < lenct; p++)
 					{
 						dist = Abs((int) WEP_GETPOS(post[l]) - (int) WEP_GETPOS(ct[p]));
-						if (dist || (dist == 0 && (pos[i] == POSNULL || pos[k] == POSNULL)))
+						if (dist || (dist == 0 && (pos[i] == posnull || pos[k] == posnull)))
 						{
 							float		curw;
 
@@ -272,10 +288,11 @@ calc_rank_and(const float *w, TSVector t, TSQuery q)
 				}
 			}
 
-			entry++;
+			idx++;
 		}
 	}
 	pfree(pos);
+	pfree(npos);
 	pfree(item);
 	return res;
 }
@@ -283,9 +300,8 @@ calc_rank_and(const float *w, TSVector t, TSQuery q)
 static float
 calc_rank_or(const float *w, TSVector t, TSQuery q)
 {
-	WordEntry  *entry,
-			   *firstentry;
-	WordEntryPosVector1 posnull;
+	/* A dummy WordEntryPos array to use when lexeme hasn't positions */
+	WordEntryPos posnull[1] = {0};
 	WordEntryPos *post;
 	int32		dimt,
 				j,
@@ -295,33 +311,36 @@ calc_rank_or(const float *w, TSVector t, TSQuery q)
 	QueryOperand **item;
 	int			size = q->size;
 
-	/* A dummy WordEntryPos array to use when haspos is false */
-	posnull.npos = 1;
-	posnull.pos[0] = 0;
-
 	item = SortAndUniqItems(q, &size);
 
 	for (i = 0; i < size; i++)
 	{
+		int         idx  , firstidx;
 		float		resj,
 					wjm;
-		int32		jm;
+		int32       jm;
 
-		firstentry = entry = find_wordentry(t, q, item[i], &nitem);
-		if (!entry)
+		idx = find_wordentry(t, q, item[i], &nitem);
+		if (idx == -1)
 			continue;
 
-		while (entry - firstentry < nitem)
+		firstidx = idx;
+
+		while (idx - firstidx < nitem)
 		{
-			if (entry->haspos)
+			WordEntry *entry;
+			char *lexeme = tsvector_getlexeme(t, idx, &entry);
+
+			Assert(!entry->hasoff);
+			if (entry->npos)
 			{
-				dimt = POSDATALEN(t, entry);
-				post = POSDATAPTR(t, entry);
+				dimt = entry->npos;
+				post = POSDATAPTR(lexeme, entry->len);
 			}
 			else
 			{
-				dimt = posnull.npos;
-				post = posnull.pos;
+				dimt = 1;
+				post = posnull;
 			}
 
 			resj = 0.0;
@@ -345,7 +364,7 @@ calc_rank_or(const float *w, TSVector t, TSQuery q)
 */
 			res = res + (wjm + resj - wjm / ((jm + 1) * (jm + 1))) / 1.64493406685;
 
-			entry++;
+			idx++;
 		}
 	}
 	if (size > 0)
@@ -361,7 +380,7 @@ calc_rank(const float *w, TSVector t, TSQuery q, int32 method)
 	float		res = 0.0;
 	int			len;
 
-	if (!t->size || !q->size)
+	if (!TS_COUNT(t) || !q->size)
 		return 0.0;
 
 	/* XXX: What about NOT? */
@@ -373,7 +392,7 @@ calc_rank(const float *w, TSVector t, TSQuery q, int32 method)
 	if (res < 0)
 		res = 1e-20f;
 
-	if ((method & RANK_NORM_LOGLENGTH) && t->size > 0)
+	if ((method & RANK_NORM_LOGLENGTH) && TS_COUNT(t) > 0)
 		res /= log((double) (cnt_length(t) + 1)) / log(2.0);
 
 	if (method & RANK_NORM_LENGTH)
@@ -385,11 +404,11 @@ calc_rank(const float *w, TSVector t, TSQuery q, int32 method)
 
 	/* RANK_NORM_EXTDIST not applicable */
 
-	if ((method & RANK_NORM_UNIQ) && t->size > 0)
-		res /= (float) (t->size);
+	if ((method & RANK_NORM_UNIQ) && TS_COUNT(t) > 0)
+		res /= (float) (TS_COUNT(t));
 
-	if ((method & RANK_NORM_LOGUNIQ) && t->size > 0)
-		res /= log((double) (t->size + 1)) / log(2.0);
+	if ((method & RANK_NORM_LOGUNIQ) && TS_COUNT(t) > 0)
+		res /= log((double) (TS_COUNT(t) + 1)) / log(2.0);
 
 	if (method & RANK_NORM_RDIVRPLUS1)
 		res /= (res + 1);
@@ -504,13 +523,13 @@ typedef struct
 		struct
 		{						/* compiled doc representation */
 			QueryItem **items;
-			int16		nitem;
+			int32		nitem;
 		}			query;
 		struct
 		{						/* struct is used for preparing doc
 								 * representation */
 			QueryItem  *item;
-			WordEntry  *entry;
+			int32		idx;
 		}			map;
 	}			data;
 	WordEntryPos pos;
@@ -526,10 +545,10 @@ compareDocR(const void *va, const void *vb)
 	{
 		if (WEP_GETWEIGHT(a->pos) == WEP_GETWEIGHT(b->pos))
 		{
-			if (a->data.map.entry == b->data.map.entry)
+			if (a->data.map.idx == b->data.map.idx)
 				return 0;
 
-			return (a->data.map.entry > b->data.map.entry) ? 1 : -1;
+			return (a->data.map.idx > b->data.map.idx) ? 1 : -1;
 		}
 
 		return (WEP_GETWEIGHT(a->pos) > WEP_GETWEIGHT(b->pos)) ? 1 : -1;
@@ -724,9 +743,6 @@ static DocRepresentation *
 get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen)
 {
 	QueryItem  *item = GETQUERY(qr->query);
-	WordEntry  *entry,
-			   *firstentry;
-	WordEntryPos *post;
 	int32		dimt,			/* number of 'post' items */
 				j,
 				i,
@@ -743,29 +759,38 @@ get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen)
 	 */
 	for (i = 0; i < qr->query->size; i++)
 	{
-		QueryOperand *curoperand;
+		int				idx,
+						firstidx;
+		QueryOperand   *curoperand;
+		WordEntryPos   *post;
 
 		if (item[i].type != QI_VAL)
 			continue;
 
 		curoperand = &item[i].qoperand;
 
-		firstentry = entry = find_wordentry(txt, qr->query, curoperand, &nitem);
-		if (!entry)
+		idx = find_wordentry(txt, qr->query, curoperand, &nitem);
+		if (idx < 0)
 			continue;
 
+		firstidx = idx;
+
 		/* iterations over entries in tsvector */
-		while (entry - firstentry < nitem)
+		while (idx - firstidx < nitem)
 		{
-			if (entry->haspos)
+			WordEntry	*entry;
+			char		*lex = tsvector_getlexeme(txt, idx, &entry);
+
+			Assert(!entry->hasoff);
+			if (entry->npos)
 			{
-				dimt = POSDATALEN(txt, entry);
-				post = POSDATAPTR(txt, entry);
+				dimt = entry->npos;
+				post = POSDATAPTR(lex, entry->len);
 			}
 			else
 			{
 				/* ignore words without positions */
-				entry++;
+				idx++;
 				continue;
 			}
 
@@ -782,13 +807,12 @@ get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen)
 					curoperand->weight & (1 << WEP_GETWEIGHT(post[j])))
 				{
 					doc[cur].pos = post[j];
-					doc[cur].data.map.entry = entry;
+					doc[cur].data.map.idx = idx;
 					doc[cur].data.map.item = (QueryItem *) curoperand;
 					cur++;
 				}
 			}
-
-			entry++;
+			idx++;
 		}
 	}
 
@@ -814,7 +838,7 @@ get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen)
 		while (rptr - doc < cur)
 		{
 			if (rptr->pos == (rptr - 1)->pos &&
-				rptr->data.map.entry == (rptr - 1)->data.map.entry)
+				rptr->data.map.idx == (rptr - 1)->data.map.idx)
 			{
 				storage.data.query.items[storage.data.query.nitem] = rptr->data.map.item;
 				storage.data.query.nitem++;
@@ -917,7 +941,7 @@ calc_rank_cd(const float4 *arrdata, TSVector txt, TSQuery query, int method)
 		NExtent++;
 	}
 
-	if ((method & RANK_NORM_LOGLENGTH) && txt->size > 0)
+	if ((method & RANK_NORM_LOGLENGTH) && TS_COUNT(txt) > 0)
 		Wdoc /= log((double) (cnt_length(txt) + 1));
 
 	if (method & RANK_NORM_LENGTH)
@@ -930,11 +954,11 @@ calc_rank_cd(const float4 *arrdata, TSVector txt, TSQuery query, int method)
 	if ((method & RANK_NORM_EXTDIST) && NExtent > 0 && SumDist > 0)
 		Wdoc /= ((double) NExtent) / SumDist;
 
-	if ((method & RANK_NORM_UNIQ) && txt->size > 0)
-		Wdoc /= (double) (txt->size);
+	if ((method & RANK_NORM_UNIQ) && TS_COUNT(txt) > 0)
+		Wdoc /= (double) (TS_COUNT(txt));
 
-	if ((method & RANK_NORM_LOGUNIQ) && txt->size > 0)
-		Wdoc /= log((double) (txt->size + 1)) / log(2.0);
+	if ((method & RANK_NORM_LOGUNIQ) && TS_COUNT(txt) > 0)
+		Wdoc /= log((double) (TS_COUNT(txt) + 1)) / log(2.0);
 
 	if (method & RANK_NORM_RDIVRPLUS1)
 		Wdoc /= (Wdoc + 1);
diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c
index 6f66c1f58c..57f1de8d14 100644
--- a/src/backend/utils/adt/tsvector.c
+++ b/src/backend/utils/adt/tsvector.c
@@ -22,9 +22,9 @@
 
 typedef struct
 {
-	WordEntry	entry;			/* must be first! */
-	WordEntryPos *pos;
-	int			poslen;			/* number of elements in pos */
+	WordEntry		entry;			/* must be first! */
+	size_t			offset;			/* offset of lexeme in some buffer */
+	WordEntryPos   *pos;
 } WordEntryIN;
 
 
@@ -79,14 +79,30 @@ uniquePos(WordEntryPos *a, int l)
 
 /* Compare two WordEntryIN values for qsort */
 static int
-compareentry(const void *va, const void *vb, void *arg)
+compareentry_in(const void *va, const void *vb, void *arg)
 {
 	const WordEntryIN *a = (const WordEntryIN *) va;
 	const WordEntryIN *b = (const WordEntryIN *) vb;
 	char	   *BufferStr = (char *) arg;
 
-	return tsCompareString(&BufferStr[a->entry.pos], a->entry.len,
-						   &BufferStr[b->entry.pos], b->entry.len,
+	return tsCompareString(&BufferStr[a->offset], a->entry.len,
+						   &BufferStr[b->offset], b->entry.len,
+						   false);
+}
+
+/* Compare two WordEntry values for qsort */
+static int
+compareentry(const void *va, const void *vb, void *arg)
+{
+	const WordEntry *a = (const WordEntry *) va;
+	const WordEntry *b = (const WordEntry *) vb;
+	TSVector	   tsv = (TSVector) arg;
+
+	uint32 offset1 = tsvector_getoffset(tsv, a - ARRPTR(tsv), NULL),
+		   offset2 = tsvector_getoffset(tsv, b - ARRPTR(tsv), NULL);
+
+	return tsCompareString(STRPTR(tsv) + offset1, ENTRY_LEN(tsv, a),
+						   STRPTR(tsv) + offset2, ENTRY_LEN(tsv, b),
 						   false);
 }
 
@@ -97,14 +113,15 @@ compareentry(const void *va, const void *vb, void *arg)
 static int
 uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen)
 {
-	int			buflen;
+	int			buflen,
+				i = 0;
 	WordEntryIN *ptr,
 			   *res;
 
 	Assert(l >= 1);
 
 	if (l > 1)
-		qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry,
+		qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry_in,
 				  (void *) buf);
 
 	buflen = 0;
@@ -112,67 +129,76 @@ uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen)
 	ptr = a + 1;
 	while (ptr - a < l)
 	{
+		Assert(!ptr->entry.hasoff);
+
 		if (!(ptr->entry.len == res->entry.len &&
-			  strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos],
-					  res->entry.len) == 0))
+			  strncmp(&buf[ptr->offset], &buf[res->offset], res->entry.len) == 0))
 		{
 			/* done accumulating data into *res, count space needed */
+			buflen = SHORTALIGN(buflen);
+			if (i++ % TS_OFFSET_STRIDE == 0)
+			{
+				buflen = INTALIGN(buflen);
+				buflen += sizeof(WordEntry);
+			}
+
 			buflen += res->entry.len;
-			if (res->entry.haspos)
+			if (res->entry.npos)
 			{
-				res->poslen = uniquePos(res->pos, res->poslen);
+				res->entry.npos = uniquePos(res->pos, res->entry.npos);
 				buflen = SHORTALIGN(buflen);
-				buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
+				buflen += res->entry.npos * sizeof(WordEntryPos);
 			}
 			res++;
 			if (res != ptr)
-				memcpy(res, ptr, sizeof(WordEntryIN));
+				*res = *ptr;
 		}
-		else if (ptr->entry.haspos)
+		else if (ptr->entry.npos)
 		{
-			if (res->entry.haspos)
+			if (res->entry.npos)
 			{
 				/* append ptr's positions to res's positions */
-				int			newlen = ptr->poslen + res->poslen;
+				int newlen = ptr->entry.npos + res->entry.npos;
 
 				res->pos = (WordEntryPos *)
 					repalloc(res->pos, newlen * sizeof(WordEntryPos));
-				memcpy(&res->pos[res->poslen], ptr->pos,
-					   ptr->poslen * sizeof(WordEntryPos));
-				res->poslen = newlen;
+				memcpy(&res->pos[res->entry.npos], ptr->pos,
+					   ptr->entry.npos * sizeof(WordEntryPos));
+				res->entry.npos = newlen;
 				pfree(ptr->pos);
 			}
 			else
 			{
 				/* just give ptr's positions to pos */
-				res->entry.haspos = 1;
+				res->entry.npos = ptr->entry.npos;
 				res->pos = ptr->pos;
-				res->poslen = ptr->poslen;
 			}
 		}
 		ptr++;
 	}
 
 	/* count space needed for last item */
+	if (i % TS_OFFSET_STRIDE == 0)
+	{
+		buflen = INTALIGN(buflen);
+		buflen += sizeof(WordEntry);
+	}
+	else
+		buflen = SHORTALIGN(buflen);
+
 	buflen += res->entry.len;
-	if (res->entry.haspos)
+
+	if (res->entry.npos)
 	{
-		res->poslen = uniquePos(res->pos, res->poslen);
+		res->entry.npos = uniquePos(res->pos, res->entry.npos);
 		buflen = SHORTALIGN(buflen);
-		buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
+		buflen += res->entry.npos * sizeof(WordEntryPos);
 	}
 
 	*outbuflen = buflen;
-	return res + 1 - a;
+	return res + 1 -a;
 }
 
-static int
-WordEntryCMP(WordEntry *a, WordEntry *b, char *buf)
-{
-	return compareentry(a, b, buf);
-}
-
-
 Datum
 tsvectorin(PG_FUNCTION_ARGS)
 {
@@ -181,7 +207,6 @@ tsvectorin(PG_FUNCTION_ARGS)
 	WordEntryIN *arr;
 	int			totallen;
 	int			arrlen;			/* allocated size of arr */
-	WordEntry  *inarr;
 	int			len = 0;
 	TSVector	in;
 	int			i;
@@ -189,7 +214,6 @@ tsvectorin(PG_FUNCTION_ARGS)
 	int			toklen;
 	WordEntryPos *pos;
 	int			poslen;
-	char	   *strbuf;
 	int			stroff;
 
 	/*
@@ -238,23 +262,13 @@ tsvectorin(PG_FUNCTION_ARGS)
 			tmpbuf = (char *) repalloc((void *) tmpbuf, buflen);
 			cur = tmpbuf + dist;
 		}
+		arr[len].entry.hasoff = 0;
 		arr[len].entry.len = toklen;
-		arr[len].entry.pos = cur - tmpbuf;
+		arr[len].offset = cur - tmpbuf;
+		arr[len].entry.npos = poslen;
+		arr[len].pos = (poslen != 0)? pos : NULL;
 		memcpy((void *) cur, (void *) token, toklen);
 		cur += toklen;
-
-		if (poslen != 0)
-		{
-			arr[len].entry.haspos = 1;
-			arr[len].pos = pos;
-			arr[len].poslen = poslen;
-		}
-		else
-		{
-			arr[len].entry.haspos = 0;
-			arr[len].pos = NULL;
-			arr[len].poslen = 0;
-		}
 		len++;
 	}
 
@@ -273,36 +287,18 @@ tsvectorin(PG_FUNCTION_ARGS)
 	totallen = CALCDATASIZE(len, buflen);
 	in = (TSVector) palloc0(totallen);
 	SET_VARSIZE(in, totallen);
-	in->size = len;
-	inarr = ARRPTR(in);
-	strbuf = STRPTR(in);
+	TS_SETCOUNT(in, len);
 	stroff = 0;
 	for (i = 0; i < len; i++)
 	{
-		memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len);
-		arr[i].entry.pos = stroff;
-		stroff += arr[i].entry.len;
-		if (arr[i].entry.haspos)
-		{
-			if (arr[i].poslen > 0xFFFF)
-				elog(ERROR, "positions array too long");
-
-			/* Copy number of positions */
-			stroff = SHORTALIGN(stroff);
-			*(uint16 *) (strbuf + stroff) = (uint16) arr[i].poslen;
-			stroff += sizeof(uint16);
-
-			/* Copy positions */
-			memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos));
-			stroff += arr[i].poslen * sizeof(WordEntryPos);
+		tsvector_addlexeme(in, i, &stroff, &tmpbuf[arr[i].offset],
+				arr[i].entry.len, arr[i].pos, arr[i].entry.npos);
 
+		if (arr[i].entry.npos)
 			pfree(arr[i].pos);
-		}
-		inarr[i] = arr[i].entry;
 	}
 
-	Assert((strbuf + stroff - (char *) in) == totallen);
-
+	Assert((STRPTR(in) + stroff - (char *) in) == totallen);
 	PG_RETURN_TSVECTOR(in);
 }
 
@@ -313,28 +309,36 @@ tsvectorout(PG_FUNCTION_ARGS)
 	char	   *outbuf;
 	int32		i,
 				lenbuf = 0,
-				pp;
+				pp,
+				tscount = TS_COUNT(out);
+	uint32		pos;
 	WordEntry  *ptr = ARRPTR(out);
 	char	   *curbegin,
 			   *curin,
 			   *curout;
 
-	lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
-	for (i = 0; i < out->size; i++)
+	lenbuf = tscount * 2 /* '' */ + tscount - 1 /* space */ + 2 /* \0 */ ;
+	for (i = 0; i < tscount; i++)
 	{
-		lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ;
-		if (ptr[i].haspos)
-			lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i]));
+		int npos = ENTRY_NPOS(out, ptr + i);
+		lenbuf += ENTRY_LEN(out, ptr + i) * 2 * pg_database_encoding_max_length() /* for escape */ ;
+		if (npos)
+			lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * npos;
 	}
 
 	curout = outbuf = (char *) palloc(lenbuf);
-	for (i = 0; i < out->size; i++)
+
+	INITPOS(pos);
+	for (i = 0; i < tscount; i++)
 	{
-		curbegin = curin = STRPTR(out) + ptr->pos;
+		int		lex_len = ENTRY_LEN(out, ptr),
+				npos = ENTRY_NPOS(out, ptr);
+
+		curbegin = curin = STRPTR(out) + pos;
 		if (i != 0)
 			*curout++ = ' ';
 		*curout++ = '\'';
-		while (curin - curbegin < ptr->len)
+		while (curin - curbegin < lex_len)
 		{
 			int			len = pg_mblen(curin);
 
@@ -348,12 +352,12 @@ tsvectorout(PG_FUNCTION_ARGS)
 		}
 
 		*curout++ = '\'';
-		if ((pp = POSDATALEN(out, ptr)) != 0)
+		if ((pp = npos) != 0)
 		{
 			WordEntryPos *wptr;
 
 			*curout++ = ':';
-			wptr = POSDATAPTR(out, ptr);
+			wptr = POSDATAPTR(curbegin, lex_len);
 			while (pp)
 			{
 				curout += sprintf(curout, "%d", WEP_GETPOS(*wptr));
@@ -379,7 +383,8 @@ tsvectorout(PG_FUNCTION_ARGS)
 				wptr++;
 			}
 		}
-		ptr++;
+
+		INCRPTR(out, ptr, pos);
 	}
 
 	*curout = '\0';
@@ -406,35 +411,38 @@ tsvectorsend(PG_FUNCTION_ARGS)
 	StringInfoData buf;
 	int			i,
 				j;
+	uint32		pos;
 	WordEntry  *weptr = ARRPTR(vec);
 
 	pq_begintypsend(&buf);
+	pq_sendint(&buf, TS_COUNT(vec), sizeof(int32));
 
-	pq_sendint(&buf, vec->size, sizeof(int32));
-	for (i = 0; i < vec->size; i++)
+	INITPOS(pos);
+	for (i = 0; i < TS_COUNT(vec); i++)
 	{
-		uint16		npos;
+		char   *lexeme	= STRPTR(vec) + pos;
+		int		npos	= ENTRY_NPOS(vec, weptr),
+				lex_len = ENTRY_LEN(vec, weptr);
 
 		/*
 		 * the strings in the TSVector array are not null-terminated, so we
 		 * have to send the null-terminator separately
 		 */
-		pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len);
+		pq_sendtext(&buf, lexeme, lex_len);
 		pq_sendbyte(&buf, '\0');
-
-		npos = POSDATALEN(vec, weptr);
 		pq_sendint(&buf, npos, sizeof(uint16));
 
 		if (npos > 0)
 		{
-			WordEntryPos *wepptr = POSDATAPTR(vec, weptr);
+			WordEntryPos *wepptr = POSDATAPTR(lexeme, lex_len);
 
 			for (j = 0; j < npos; j++)
 				pq_sendint(&buf, wepptr[j], sizeof(WordEntryPos));
 		}
-		weptr++;
+		INCRPTR(vec, weptr, pos);
 	}
 
+	PG_FREE_IF_COPY(vec, 0);
 	PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
 }
 
@@ -443,14 +451,16 @@ tsvectorrecv(PG_FUNCTION_ARGS)
 {
 	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
 	TSVector	vec;
-	int			i;
-	int32		nentries;
-	int			datalen;		/* number of bytes used in the variable size
+	int			i,
+				datalen;		/* number of bytes used in the variable size
 								 * area after fixed size TSVector header and
 								 * WordEntries */
+	int32		nentries;
 	Size		hdrlen;
 	Size		len;			/* allocated size of vec */
 	bool		needSort = false;
+	char	   *prev_lexeme = NULL;
+	int			prev_lex_len;
 
 	nentries = pq_getmsgint(buf, sizeof(int32));
 	if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry)))
@@ -460,16 +470,17 @@ tsvectorrecv(PG_FUNCTION_ARGS)
 
 	len = hdrlen * 2;			/* times two to make room for lexemes */
 	vec = (TSVector) palloc0(len);
-	vec->size = nentries;
+	TS_SETCOUNT(vec, nentries);
 
 	datalen = 0;
 	for (i = 0; i < nentries; i++)
 	{
-		const char *lexeme;
+		char	   *lexeme,
+				   *lexeme_out;
 		uint16		npos;
-		size_t		lex_len;
+		int			lex_len;
 
-		lexeme = pq_getmsgstring(buf);
+		lexeme = (char *) pq_getmsgstring(buf);
 		npos = (uint16) pq_getmsgint(buf, sizeof(uint16));
 
 		/* sanity checks */
@@ -489,62 +500,42 @@ tsvectorrecv(PG_FUNCTION_ARGS)
 		 *
 		 * But make sure the buffer is large enough first.
 		 */
-		while (hdrlen + SHORTALIGN(datalen + lex_len) +
-			   (npos + 1) * sizeof(WordEntryPos) >= len)
+		while (hdrlen + SHORTALIGN(datalen + lex_len) + sizeof(WordEntry) +
+			   npos * sizeof(WordEntryPos) >= len)
 		{
 			len *= 2;
 			vec = (TSVector) repalloc(vec, len);
 		}
 
-		vec->entries[i].haspos = (npos > 0) ? 1 : 0;
-		vec->entries[i].len = lex_len;
-		vec->entries[i].pos = datalen;
-
-		memcpy(STRPTR(vec) + datalen, lexeme, lex_len);
-
-		datalen += lex_len;
-
-		if (i > 0 && WordEntryCMP(&vec->entries[i],
-								  &vec->entries[i - 1],
-								  STRPTR(vec)) <= 0)
+		if (prev_lexeme && tsCompareString(lexeme, lex_len,
+						prev_lexeme, prev_lex_len, false) <= 0)
 			needSort = true;
 
-		/* Receive positions */
+		lexeme_out = tsvector_addlexeme(vec, i, &datalen, lexeme,
+				lex_len, NULL, npos);
 		if (npos > 0)
 		{
-			uint16		j;
-			WordEntryPos *wepptr;
-
-			/*
-			 * Pad to 2-byte alignment if necessary. Though we used palloc0
-			 * for the initial allocation, subsequent repalloc'd memory areas
-			 * are not initialized to zero.
-			 */
-			if (datalen != SHORTALIGN(datalen))
-			{
-				*(STRPTR(vec) + datalen) = '\0';
-				datalen = SHORTALIGN(datalen);
-			}
+			WordEntryPos   *wepptr;
+			int				j;
 
-			memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16));
-
-			wepptr = POSDATAPTR(vec, &vec->entries[i]);
+			wepptr = POSDATAPTR(lexeme_out, lex_len);
 			for (j = 0; j < npos; j++)
 			{
 				wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos));
 				if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1]))
 					elog(ERROR, "position information is misordered");
 			}
-
-			datalen += (npos + 1) * sizeof(WordEntry);
 		}
+
+		prev_lexeme = lexeme;
+		prev_lex_len = lex_len;
 	}
 
 	SET_VARSIZE(vec, hdrlen + datalen);
 
 	if (needSort)
-		qsort_arg((void *) ARRPTR(vec), vec->size, sizeof(WordEntry),
-				  compareentry, (void *) STRPTR(vec));
+		qsort_arg((void *) ARRPTR(vec), TS_COUNT(vec), sizeof(WordEntry),
+				  compareentry, (void *) vec);
 
 	PG_RETURN_TSVECTOR(vec);
 }
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c
index 822520299e..9f53aae357 100644
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@@ -33,10 +33,10 @@
 
 typedef struct
 {
-	WordEntry  *arrb;
-	WordEntry  *arre;
-	char	   *values;
-	char	   *operand;
+	TSVector			vec;
+	int					bidx;
+	int					eidx;
+	char			   *operand;
 } CHKVAL;
 
 
@@ -71,7 +71,7 @@ static Datum tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column);
 static int	tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len);
 
 /*
- * Order: haspos, len, word, for all positions (pos, weight)
+ * Order: npos, len, word, for all positions (pos, weight)
  */
 static int
 silly_cmp_tsvector(const TSVector a, const TSVector b)
@@ -80,9 +80,9 @@ silly_cmp_tsvector(const TSVector a, const TSVector b)
 		return -1;
 	else if (VARSIZE(a) > VARSIZE(b))
 		return 1;
-	else if (a->size < b->size)
+	else if (TS_COUNT(a) < TS_COUNT(b))
 		return -1;
-	else if (a->size > b->size)
+	else if (TS_COUNT(a) > TS_COUNT(b))
 		return 1;
 	else
 	{
@@ -90,28 +90,40 @@ silly_cmp_tsvector(const TSVector a, const TSVector b)
 		WordEntry  *bptr = ARRPTR(b);
 		int			i = 0;
 		int			res;
+		uint32		pos1,
+					pos2;
 
+		INITPOS(pos1);
+		INITPOS(pos2);
 
-		for (i = 0; i < a->size; i++)
+		for (i = 0; i < TS_COUNT(a); i++)
 		{
-			if (aptr->haspos != bptr->haspos)
-			{
-				return (aptr->haspos > bptr->haspos) ? -1 : 1;
-			}
-			else if ((res = tsCompareString(STRPTR(a) + aptr->pos, aptr->len, STRPTR(b) + bptr->pos, bptr->len, false)) != 0)
+			char   *lex1 = STRPTR(a) + pos1,
+				   *lex2 = STRPTR(b) + pos2;
+			int		npos1 = ENTRY_NPOS(a, aptr),
+					npos2 = ENTRY_NPOS(b, bptr);
+			int		len1 = ENTRY_LEN(a, aptr),
+					len2 = ENTRY_LEN(b, bptr);
+
+			if ((npos1 == 0 || npos2 == 0) && npos1 != npos2)
+				return npos1 > npos2? -1 : 1;
+			else if ((res = tsCompareString(lex1, len1, lex2, len2, false)) != 0)
 			{
 				return res;
 			}
-			else if (aptr->haspos)
+			else if (npos1 > 0)
 			{
-				WordEntryPos *ap = POSDATAPTR(a, aptr);
-				WordEntryPos *bp = POSDATAPTR(b, bptr);
+				WordEntryPos *ap,
+							 *bp;
 				int			j;
 
-				if (POSDATALEN(a, aptr) != POSDATALEN(b, bptr))
-					return (POSDATALEN(a, aptr) > POSDATALEN(b, bptr)) ? -1 : 1;
+				ap = POSDATAPTR(lex1, len1);
+				bp = POSDATAPTR(lex2, len2);
+
+				if (npos1 != npos2)
+					return (npos1 > npos2) ? -1 : 1;
 
-				for (j = 0; j < POSDATALEN(a, aptr); j++)
+				for (j = 0; j < npos1; j++)
 				{
 					if (WEP_GETPOS(*ap) != WEP_GETPOS(*bp))
 					{
@@ -125,8 +137,8 @@ silly_cmp_tsvector(const TSVector a, const TSVector b)
 				}
 			}
 
-			aptr++;
-			bptr++;
+			INCRPTR(a, aptr, pos1);
+			INCRPTR(b, bptr, pos2);
 		}
 	}
 
@@ -161,27 +173,29 @@ tsvector_strip(PG_FUNCTION_ARGS)
 	TSVector	in = PG_GETARG_TSVECTOR(0);
 	TSVector	out;
 	int			i,
+				count,
+				posout = 0,
+				pos,
 				len = 0;
-	WordEntry  *arrin = ARRPTR(in),
-			   *arrout;
-	char	   *cur;
+	WordEntry  *entryin = ARRPTR(in);
 
-	for (i = 0; i < in->size; i++)
-		len += arrin[i].len;
+	count = TS_COUNT(in);
+	for (i = 0; i < count; i++)
+		INCRSIZE(len, i, ENTRY_LEN(in, ARRPTR(in) + i), 0);
 
-	len = CALCDATASIZE(in->size, len);
+	len = CALCDATASIZE(count, len);
 	out = (TSVector) palloc0(len);
 	SET_VARSIZE(out, len);
-	out->size = in->size;
-	arrout = ARRPTR(out);
-	cur = STRPTR(out);
-	for (i = 0; i < in->size; i++)
+	TS_SETCOUNT(out, count);
+
+	INITPOS(pos);
+	for (i = 0; i < count; i++)
 	{
-		memcpy(cur, STRPTR(in) + arrin[i].pos, arrin[i].len);
-		arrout[i].haspos = 0;
-		arrout[i].len = arrin[i].len;
-		arrout[i].pos = cur - STRPTR(out);
-		cur += arrout[i].len;
+		tsvector_addlexeme(out, i, &posout,
+				STRPTR(in) + pos, ENTRY_LEN(in, entryin),
+				NULL, 0);
+
+		INCRPTR(in, entryin, pos);
 	}
 
 	PG_FREE_IF_COPY(in, 0);
@@ -192,7 +206,7 @@ Datum
 tsvector_length(PG_FUNCTION_ARGS)
 {
 	TSVector	in = PG_GETARG_TSVECTOR(0);
-	int32		ret = in->size;
+	int32		ret = TS_COUNT(in);
 
 	PG_FREE_IF_COPY(in, 0);
 	PG_RETURN_INT32(ret);
@@ -204,11 +218,10 @@ tsvector_setweight(PG_FUNCTION_ARGS)
 	TSVector	in = PG_GETARG_TSVECTOR(0);
 	char		cw = PG_GETARG_CHAR(1);
 	TSVector	out;
-	int			i,
-				j;
-	WordEntry  *entry;
-	WordEntryPos *p;
+	int			i;
+	WordEntry  *weptr;
 	int			w = 0;
+	uint32		pos;
 
 	switch (cw)
 	{
@@ -235,20 +248,21 @@ tsvector_setweight(PG_FUNCTION_ARGS)
 
 	out = (TSVector) palloc(VARSIZE(in));
 	memcpy(out, in, VARSIZE(in));
-	entry = ARRPTR(out);
-	i = out->size;
-	while (i--)
+	weptr = ARRPTR(out);
+
+	INITPOS(pos);
+	for (i = 0; i < TS_COUNT(out); i++)
 	{
-		if ((j = POSDATALEN(out, entry)) != 0)
+		int j,
+			npos = ENTRY_NPOS(out, weptr);
+
+		if (npos)
 		{
-			p = POSDATAPTR(out, entry);
-			while (j--)
-			{
-				WEP_SETWEIGHT(*p, w);
-				p++;
-			}
+			WordEntryPos *p = POSDATAPTR(STRPTR(out) + pos, ENTRY_LEN(out, weptr));
+			for (j = 0; j < npos; j++)
+				WEP_SETWEIGHT(p[j], w);
 		}
-		entry++;
+		INCRPTR(out, weptr, pos);
 	}
 
 	PG_FREE_IF_COPY(in, 0);
@@ -269,10 +283,8 @@ tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
 
 	TSVector	tsout;
 	int			i,
-				j,
 				nlexemes,
 				weight;
-	WordEntry  *entry;
 	Datum	   *dlexemes;
 	bool	   *nulls;
 
@@ -301,8 +313,6 @@ tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
 
 	tsout = (TSVector) palloc(VARSIZE(tsin));
 	memcpy(tsout, tsin, VARSIZE(tsin));
-	entry = ARRPTR(tsout);
-
 	deconstruct_array(lexemes, TEXTOID, -1, false, 'i',
 					  &dlexemes, &nulls, &nlexemes);
 
@@ -315,7 +325,8 @@ tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
 	{
 		char	   *lex;
 		int			lex_len,
-					lex_pos;
+					lex_idx,
+					npos;
 
 		if (nulls[i])
 			ereport(ERROR,
@@ -324,17 +335,19 @@ tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
 
 		lex = VARDATA(dlexemes[i]);
 		lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ;
-		lex_pos = tsvector_bsearch(tsout, lex, lex_len);
+		lex_idx = tsvector_bsearch(tsin, lex, lex_len);
+		npos = ENTRY_NPOS(tsin, ARRPTR(tsout) + lex_idx);
 
-		if (lex_pos >= 0 && (j = POSDATALEN(tsout, entry + lex_pos)) != 0)
+		if (lex_idx >= 0 && npos > 0)
 		{
-			WordEntryPos *p = POSDATAPTR(tsout, entry + lex_pos);
+			int			j;
+			WordEntry  *we;
+			char	   *lexeme = tsvector_getlexeme(tsout, lex_idx, &we);
 
-			while (j--)
-			{
-				WEP_SETWEIGHT(*p, weight);
-				p++;
-			}
+			WordEntryPos   *p = POSDATAPTR(lexeme, we->len);
+
+			for (j = 0; j < npos; j++)
+				WEP_SETWEIGHT(p[j], weight);
 		}
 	}
 
@@ -354,34 +367,27 @@ tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
  * Return the number added (might be less than expected due to overflow)
  */
 static int32
-add_pos(TSVector src, WordEntry *srcptr,
-		TSVector dest, WordEntry *destptr,
+add_pos(char *src, WordEntry *srcptr,
+		WordEntryPos *dest, int from,
 		int32 maxpos)
 {
-	uint16	   *clen = &_POSVECPTR(dest, destptr)->npos;
+	uint16	    clen = from;
 	int			i;
-	uint16		slen = POSDATALEN(src, srcptr),
-				startlen;
-	WordEntryPos *spos = POSDATAPTR(src, srcptr),
-			   *dpos = POSDATAPTR(dest, destptr);
+	uint16		slen = srcptr->npos;
+	WordEntryPos *spos = POSDATAPTR(src, srcptr->len);
 
-	if (!destptr->haspos)
-		*clen = 0;
-
-	startlen = *clen;
+	Assert(!srcptr->hasoff);
 	for (i = 0;
-		 i < slen && *clen < MAXNUMPOS &&
-		 (*clen == 0 || WEP_GETPOS(dpos[*clen - 1]) != MAXENTRYPOS - 1);
+		 i < slen && clen < MAXNUMPOS &&
+		 (clen == 0 || WEP_GETPOS(dest[clen - 1]) != MAXENTRYPOS - 1);
 		 i++)
 	{
-		WEP_SETWEIGHT(dpos[*clen], WEP_GETWEIGHT(spos[i]));
-		WEP_SETPOS(dpos[*clen], LIMITPOS(WEP_GETPOS(spos[i]) + maxpos));
-		(*clen)++;
+		WEP_SETWEIGHT(dest[clen], WEP_GETWEIGHT(spos[i]));
+		WEP_SETPOS(dest[clen], LIMITPOS(WEP_GETPOS(spos[i]) + maxpos));
+		clen++;
 	}
 
-	if (*clen != startlen)
-		destptr->haspos = 1;
-	return *clen - startlen;
+	return clen - from;
 }
 
 /*
@@ -392,20 +398,20 @@ add_pos(TSVector src, WordEntry *srcptr,
 static int
 tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len)
 {
-	WordEntry  *arrin = ARRPTR(tsv);
 	int			StopLow = 0,
-				StopHigh = tsv->size,
+				StopHigh = TS_COUNT(tsv),
 				StopMiddle,
 				cmp;
 
 	while (StopLow < StopHigh)
 	{
-		StopMiddle = (StopLow + StopHigh) / 2;
+		WordEntry  *entry = NULL;
+		char	   *str;
 
+		StopMiddle = (StopLow + StopHigh) / 2;
+		str = tsvector_getlexeme(tsv, StopMiddle, &entry);
 		cmp = tsCompareString(lexeme, lexeme_len,
-							  STRPTR(tsv) + arrin[StopMiddle].pos,
-							  arrin[StopMiddle].len,
-							  false);
+							  str, entry->len, false);
 
 		if (cmp < 0)
 			StopHigh = StopMiddle;
@@ -460,14 +466,12 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
 						   int indices_count)
 {
 	TSVector	tsout;
-	WordEntry  *arrin = ARRPTR(tsv),
-			   *arrout;
-	char	   *data = STRPTR(tsv),
-			   *dataout;
-	int			i,				/* index in arrin */
-				j,				/* index in arrout */
+	WordEntry  *ptr = ARRPTR(tsv);
+	int			i,				/* index in input tsvector */
+				j,				/* index in output tsvector */
 				k,				/* index in indices_to_delete */
-				curoff;			/* index in dataout area */
+				curoff = 0,		/* index in data area of output */
+				pos;
 
 	/*
 	 * Sort the filter array to simplify membership checks below.  Also, get
@@ -495,16 +499,18 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
 	tsout = (TSVector) palloc0(VARSIZE(tsv));
 
 	/* This count must be correct because STRPTR(tsout) relies on it. */
-	tsout->size = tsv->size - indices_count;
+	TS_SETCOUNT(tsout, TS_COUNT(tsv) - indices_count);
 
 	/*
 	 * Copy tsv to tsout, skipping lexemes listed in indices_to_delete.
 	 */
-	arrout = ARRPTR(tsout);
-	dataout = STRPTR(tsout);
-	curoff = 0;
-	for (i = j = k = 0; i < tsv->size; i++)
+
+	INITPOS(pos);
+	for (i = j = k = 0; i < TS_COUNT(tsv); i++)
 	{
+		char	*lex = STRPTR(tsv) + pos;
+		int		 lex_len = ENTRY_LEN(tsv, ptr);
+
 		/*
 		 * If current i is present in indices_to_delete, skip this lexeme.
 		 * Since indices_to_delete is already sorted, we only need to check
@@ -513,28 +519,14 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
 		if (k < indices_count && i == indices_to_delete[k])
 		{
 			k++;
-			continue;
+			goto next;
 		}
 
-		/* Copy lexeme and its positions and weights */
-		memcpy(dataout + curoff, data + arrin[i].pos, arrin[i].len);
-		arrout[j].haspos = arrin[i].haspos;
-		arrout[j].len = arrin[i].len;
-		arrout[j].pos = curoff;
-		curoff += arrin[i].len;
-		if (arrin[i].haspos)
-		{
-			int			len = POSDATALEN(tsv, arrin + i) * sizeof(WordEntryPos)
-			+ sizeof(uint16);
-
-			curoff = SHORTALIGN(curoff);
-			memcpy(dataout + curoff,
-				   STRPTR(tsv) + SHORTALIGN(arrin[i].pos + arrin[i].len),
-				   len);
-			curoff += len;
-		}
+		tsvector_addlexeme(tsout, j++, &curoff, lex, lex_len,
+				POSDATAPTR(lex, lex_len), ENTRY_NPOS(tsv, ptr));
 
-		j++;
+next:
+		INCRPTR(tsv, ptr, pos);
 	}
 
 	/*
@@ -543,8 +535,7 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
 	 * estimation of tsout's size is wrong.
 	 */
 	Assert(k == indices_count);
-
-	SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, curoff));
+	SET_VARSIZE(tsout, CALCDATASIZE(TS_COUNT(tsout), curoff));
 	return tsout;
 }
 
@@ -635,8 +626,9 @@ tsvector_delete_arr(PG_FUNCTION_ARGS)
 Datum
 tsvector_unnest(PG_FUNCTION_ARGS)
 {
-	FuncCallContext *funcctx;
-	TSVector	tsin;
+	FuncCallContext	   *funcctx;
+	TSVector			tsin;
+	uint32				pos;
 
 	if (SRF_IS_FIRSTCALL())
 	{
@@ -655,31 +647,33 @@ tsvector_unnest(PG_FUNCTION_ARGS)
 						   TEXTARRAYOID, -1, 0);
 		funcctx->tuple_desc = BlessTupleDesc(tupdesc);
 
-		funcctx->user_fctx = PG_GETARG_TSVECTOR_COPY(0);
+		INITPOS(pos);
+		funcctx->user_fctx = list_make2(PG_GETARG_TSVECTOR(0), makeInteger(pos));
 
 		MemoryContextSwitchTo(oldcontext);
 	}
 
 	funcctx = SRF_PERCALL_SETUP();
-	tsin = (TSVector) funcctx->user_fctx;
+	tsin = (TSVector) linitial(funcctx->user_fctx);
+	pos = intVal(lsecond(funcctx->user_fctx));
 
-	if (funcctx->call_cntr < tsin->size)
+	if (funcctx->call_cntr < TS_COUNT(tsin))
 	{
-		WordEntry  *arrin = ARRPTR(tsin);
+		WordEntry  *entry = ARRPTR(tsin) + funcctx->call_cntr;
 		char	   *data = STRPTR(tsin);
 		HeapTuple	tuple;
 		int			j,
-					i = funcctx->call_cntr;
+					npos = ENTRY_NPOS(tsin, entry),
+					lex_len = ENTRY_LEN(tsin, entry);
 		bool		nulls[] = {false, false, false};
 		Datum		values[3];
 
 		values[0] = PointerGetDatum(
-									cstring_to_text_with_len(data + arrin[i].pos, arrin[i].len)
-			);
+			cstring_to_text_with_len(data + pos, lex_len));
 
-		if (arrin[i].haspos)
+		if (npos)
 		{
-			WordEntryPosVector *posv;
+			WordEntryPos *apos = POSDATAPTR(data + pos, lex_len);
 			Datum	   *positions;
 			Datum	   *weights;
 			char		weight;
@@ -689,28 +683,28 @@ tsvector_unnest(PG_FUNCTION_ARGS)
 			 * uint16 (2 bits for weight, 14 for position). Here we extract
 			 * that in two separate arrays.
 			 */
-			posv = _POSVECPTR(tsin, arrin + i);
-			positions = palloc(posv->npos * sizeof(Datum));
-			weights = palloc(posv->npos * sizeof(Datum));
-			for (j = 0; j < posv->npos; j++)
+			positions = palloc(npos * sizeof(Datum));
+			weights = palloc(npos * sizeof(Datum));
+			for (j = 0; j < npos; j++)
 			{
-				positions[j] = Int16GetDatum(WEP_GETPOS(posv->pos[j]));
-				weight = 'D' - WEP_GETWEIGHT(posv->pos[j]);
+				positions[j] = Int16GetDatum(WEP_GETPOS(apos[j]));
+				weight = 'D' - WEP_GETWEIGHT(apos[j]);
 				weights[j] = PointerGetDatum(
 											 cstring_to_text_with_len(&weight, 1)
 					);
 			}
 
 			values[1] = PointerGetDatum(
-										construct_array(positions, posv->npos, INT2OID, 2, true, 's'));
+					construct_array(positions, npos, INT2OID, 2, true, 's'));
 			values[2] = PointerGetDatum(
-										construct_array(weights, posv->npos, TEXTOID, -1, false, 'i'));
+					construct_array(weights, npos, TEXTOID, -1, false, 'i'));
 		}
 		else
 		{
 			nulls[1] = nulls[2] = true;
 		}
 
+		INCRPTR(tsin, entry, intVal(lsecond(funcctx->user_fctx)));
 		tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
 		SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
 	}
@@ -728,21 +722,23 @@ Datum
 tsvector_to_array(PG_FUNCTION_ARGS)
 {
 	TSVector	tsin = PG_GETARG_TSVECTOR(0);
-	WordEntry  *arrin = ARRPTR(tsin);
+	WordEntry  *entry = ARRPTR(tsin);
 	Datum	   *elements;
 	int			i;
 	ArrayType  *array;
+	long		pos;
 
-	elements = palloc(tsin->size * sizeof(Datum));
+	elements = palloc(TS_COUNT(tsin) * sizeof(Datum));
 
-	for (i = 0; i < tsin->size; i++)
+	INITPOS(pos);
+	for (i = 0; i < TS_COUNT(tsin); i++)
 	{
 		elements[i] = PointerGetDatum(
-									  cstring_to_text_with_len(STRPTR(tsin) + arrin[i].pos, arrin[i].len)
-			);
+		  cstring_to_text_with_len(STRPTR(tsin) + pos, ENTRY_LEN(tsin, entry)));
+		INCRPTR(tsin, entry, pos);
 	}
 
-	array = construct_array(elements, tsin->size, TEXTOID, -1, false, 'i');
+	array = construct_array(elements, TS_COUNT(tsin), TEXTOID, -1, false, 'i');
 
 	pfree(elements);
 	PG_FREE_IF_COPY(tsin, 0);
@@ -750,6 +746,124 @@ tsvector_to_array(PG_FUNCTION_ARGS)
 }
 
 /*
+ * Returns offset by given index in TSVector,
+ * this function used when we need random access
+ */
+int
+tsvector_getoffset(TSVector vec, int idx, WordEntry **we)
+{
+	int			offset = 0;
+	WordEntry  *entry;
+
+	entry = ARRPTR(vec) + idx;
+	if (we)
+		*we = entry;
+
+	while (!entry->hasoff)
+	{
+		entry--;
+		if (!entry->hasoff)
+			offset += SHORTALIGN(entry->len) + entry->npos * sizeof(WordEntryPos);
+	}
+
+	Assert(entry >= ARRPTR(vec));
+
+	if (idx % TS_OFFSET_STRIDE)
+	{
+		/* if idx is by offset */
+		WordEntry *offset_entry = (WordEntry *) (STRPTR(vec) + entry->offset);
+
+		offset += entry->offset + sizeof(WordEntry);
+		offset += SHORTALIGN(offset_entry->len) + offset_entry->npos * sizeof(WordEntryPos);
+	}
+	else
+	{
+		Assert(entry == ARRPTR(vec) + idx);
+
+		if (we)
+			*we = (WordEntry *) (STRPTR(vec) + entry->offset);
+		offset = entry->offset + sizeof(WordEntry);
+	}
+
+	return offset;
+}
+
+/*
+ * Add lexeme and its positions to tsvector and move dataoff (offset where
+ * data should be added) to new position.
+ * Returns pointer to lexeme start
+ */
+char *
+tsvector_addlexeme(TSVector tsv, int idx, int *dataoff,
+		char *lexeme, int lexeme_len, WordEntryPos *pos, int npos)
+{
+	int			stroff;
+	WordEntry  *entry;
+	char	   *result;
+
+	/* when idx is 0, dataoff should be 0 too, and otherwise */
+	Assert(!((idx == 0) ^ (*dataoff == 0)));
+
+	stroff = *dataoff;
+	entry = ARRPTR(tsv) + idx;
+
+	if (idx % TS_OFFSET_STRIDE == 0)
+	{
+		/* WordEntry with offset */
+		WordEntry offentry;
+
+		stroff = INTALIGN(stroff);
+		entry->hasoff = 1;
+		entry->offset = stroff;
+
+		/* fill WordEntry for offset */
+		offentry.hasoff = 0;
+		offentry.len = lexeme_len;
+		offentry.npos = npos;
+		memcpy(STRPTR(tsv) + stroff, &offentry, sizeof(WordEntry));
+		stroff += sizeof(WordEntry);
+	}
+	else
+	{
+		stroff = SHORTALIGN(stroff);							\
+		entry->hasoff = 0;
+		entry->len = lexeme_len;
+		entry->npos = npos;
+	}
+
+	memcpy(STRPTR(tsv) + stroff, lexeme, lexeme_len);
+	result = STRPTR(tsv) + stroff;
+	stroff += lexeme_len;
+
+	if (npos)
+	{
+		if (npos > 0xFFFF)
+			elog(ERROR, "positions array too long");
+
+		/*
+		 * Pad to 2-byte alignment if necessary. We don't know how memory was
+		 * allocated, so in case of aligning we need to make sure that unused
+		 * is zero.
+		 */
+		if (stroff != SHORTALIGN(stroff))
+		{
+			*(STRPTR(tsv) + stroff) = '\0';
+			stroff = SHORTALIGN(stroff);
+		}
+
+		/* Copy positions */
+		if (pos)
+			memcpy(STRPTR(tsv) + stroff, pos, npos * sizeof(WordEntryPos));
+
+		stroff += npos * sizeof(WordEntryPos);
+	}
+
+	*dataoff = stroff;
+
+	return result;
+}
+
+/*
  * Build tsvector from array of lexemes.
  */
 Datum
@@ -758,14 +872,13 @@ array_to_tsvector(PG_FUNCTION_ARGS)
 	ArrayType  *v = PG_GETARG_ARRAYTYPE_P(0);
 	TSVector	tsout;
 	Datum	   *dlexemes;
-	WordEntry  *arrout;
 	bool	   *nulls;
 	int			nitems,
 				i,
 				j,
 				tslen,
+				cur = 0,
 				datalen = 0;
-	char	   *cur;
 
 	deconstruct_array(v, TEXTOID, -1, false, 'i', &dlexemes, &nulls, &nitems);
 
@@ -793,26 +906,23 @@ array_to_tsvector(PG_FUNCTION_ARGS)
 
 	/* Calculate space needed for surviving lexemes. */
 	for (i = 0; i < nitems; i++)
-		datalen += VARSIZE(dlexemes[i]) - VARHDRSZ;
+	{
+		int	lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ;
+		INCRSIZE(datalen, i, lex_len, 0);
+	}
 	tslen = CALCDATASIZE(nitems, datalen);
 
 	/* Allocate and fill tsvector. */
 	tsout = (TSVector) palloc0(tslen);
 	SET_VARSIZE(tsout, tslen);
-	tsout->size = nitems;
+	TS_SETCOUNT(tsout, nitems);
 
-	arrout = ARRPTR(tsout);
-	cur = STRPTR(tsout);
 	for (i = 0; i < nitems; i++)
 	{
 		char	   *lex = VARDATA(dlexemes[i]);
 		int			lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ;
 
-		memcpy(cur, lex, lex_len);
-		arrout[i].haspos = 0;
-		arrout[i].len = lex_len;
-		arrout[i].pos = cur - STRPTR(tsout);
-		cur += lex_len;
+		tsvector_addlexeme(tsout, i, &cur, lex, lex_len, NULL, 0);
 	}
 
 	PG_FREE_IF_COPY(v, 0);
@@ -828,17 +938,16 @@ tsvector_filter(PG_FUNCTION_ARGS)
 	TSVector	tsin = PG_GETARG_TSVECTOR(0),
 				tsout;
 	ArrayType  *weights = PG_GETARG_ARRAYTYPE_P(1);
-	WordEntry  *arrin = ARRPTR(tsin),
-			   *arrout;
-	char	   *datain = STRPTR(tsin),
-			   *dataout;
+	char	   *dataout;
 	Datum	   *dweights;
 	bool	   *nulls;
 	int			nweights;
 	int			i,
-				j;
-	int			cur_pos = 0;
+				j,
+				dataoff = 0,
+				pos;
 	char		mask = 0;
+	WordEntry  *ptr = ARRPTR(tsin);
 
 	deconstruct_array(weights, CHAROID, 1, true, 'c',
 					  &dweights, &nulls, &nweights);
@@ -879,109 +988,112 @@ tsvector_filter(PG_FUNCTION_ARGS)
 	}
 
 	tsout = (TSVector) palloc0(VARSIZE(tsin));
-	tsout->size = tsin->size;
-	arrout = ARRPTR(tsout);
+	TS_SETCOUNT(tsout, TS_COUNT(tsin));
 	dataout = STRPTR(tsout);
 
-	for (i = j = 0; i < tsin->size; i++)
+	INITPOS(pos);
+	for (i = j = 0; i < TS_COUNT(tsin); i++)
 	{
-		WordEntryPosVector *posvin,
-				   *posvout;
-		int			npos = 0;
-		int			k;
-
-		if (!arrin[i].haspos)
-			continue;
-
-		posvin = _POSVECPTR(tsin, arrin + i);
-		posvout = (WordEntryPosVector *)
-			(dataout + SHORTALIGN(cur_pos + arrin[i].len));
-
-		for (k = 0; k < posvin->npos; k++)
+		WordEntryPos   *posin,
+					   *posout;
+		int				k,
+						npos = 0,
+						lex_len = ENTRY_LEN(tsin, ptr);
+		char		   *lex = STRPTR(tsin) + pos,
+					   *lexout;
+
+		posin = POSDATAPTR(lex, lex_len);
+		for (k = 0; k < ENTRY_NPOS(tsin, ptr); k++)
 		{
-			if (mask & (1 << WEP_GETWEIGHT(posvin->pos[k])))
-				posvout->pos[npos++] = posvin->pos[k];
+			if (mask & (1 << WEP_GETWEIGHT(posin[k])))
+				npos++;
 		}
 
-		/* if no satisfactory positions found, skip lexeme */
 		if (!npos)
-			continue;
+			goto next;
 
-		arrout[j].haspos = true;
-		arrout[j].len = arrin[i].len;
-		arrout[j].pos = cur_pos;
+		lexout = tsvector_addlexeme(tsout, j++, &dataoff, lex, lex_len,
+				NULL, npos);
+		posout =  POSDATAPTR(lexout, lex_len);
+		npos = 0;
+		for (k = 0; k < ENTRY_NPOS(tsin, ptr); k++)
+		{
+			if (mask & (1 << WEP_GETWEIGHT(posin[k])))
+				posout[npos++] = posin[k];
+		}
 
-		memcpy(dataout + cur_pos, datain + arrin[i].pos, arrin[i].len);
-		posvout->npos = npos;
-		cur_pos += SHORTALIGN(arrin[i].len);
-		cur_pos += POSDATALEN(tsout, arrout + j) * sizeof(WordEntryPos) +
-			sizeof(uint16);
-		j++;
+next:
+		INCRPTR(tsin, ptr, pos);
 	}
 
-	tsout->size = j;
+	TS_SETCOUNT(tsout, j);
 	if (dataout != STRPTR(tsout))
-		memmove(STRPTR(tsout), dataout, cur_pos);
+		memmove(STRPTR(tsout), dataout, dataoff);
 
-	SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, cur_pos));
+	SET_VARSIZE(tsout, CALCDATASIZE(TS_COUNT(tsout), dataoff));
 
 	PG_FREE_IF_COPY(tsin, 0);
 	PG_RETURN_POINTER(tsout);
 }
 
+/* Get max position in in1; we'll need this to offset in2's positions */
+static int
+get_maxpos(TSVector tsv)
+{
+	int				i,
+					j,
+					maxpos = 0;
+	WordEntry	   *ptr = ARRPTR(tsv);
+	uint32			pos;
+	WordEntryPos   *apos;
+
+	INITPOS(pos);
+	for (i = 0; i < TS_COUNT(tsv); i++)
+	{
+		apos = POSDATAPTR(STRPTR(tsv) + pos, ENTRY_LEN(tsv, ptr));
+		for (j = 0; j < ENTRY_NPOS(tsv, ptr); j++)
+		{
+			if (WEP_GETPOS(apos[j]) > maxpos)
+				maxpos = WEP_GETPOS(apos[j]);
+		}
+
+		INCRPTR(tsv, ptr, pos);
+	}
+
+	return maxpos;
+}
+
 Datum
 tsvector_concat(PG_FUNCTION_ARGS)
 {
-	TSVector	in1 = PG_GETARG_TSVECTOR(0);
-	TSVector	in2 = PG_GETARG_TSVECTOR(1);
-	TSVector	out;
-	WordEntry  *ptr;
-	WordEntry  *ptr1,
+	TSVector	in1 = PG_GETARG_TSVECTOR(0),
+				in2 = PG_GETARG_TSVECTOR(1),
+				out;
+	WordEntry  *ptr,
+			   *ptr1,
 			   *ptr2;
-	WordEntryPos *p;
 	int			maxpos = 0,
 				i,
-				j,
 				i1,
 				i2,
-				dataoff,
 				output_bytes,
-				output_size;
-	char	   *data,
-			   *data1,
-			   *data2;
-
-	/* Get max position in in1; we'll need this to offset in2's positions */
-	ptr = ARRPTR(in1);
-	i = in1->size;
-	while (i--)
-	{
-		if ((j = POSDATALEN(in1, ptr)) != 0)
-		{
-			p = POSDATAPTR(in1, ptr);
-			while (j--)
-			{
-				if (WEP_GETPOS(*p) > maxpos)
-					maxpos = WEP_GETPOS(*p);
-				p++;
-			}
-		}
-		ptr++;
-	}
+				pos1,
+				pos2,
+				dataoff;
+	char	   *data;
 
 	ptr1 = ARRPTR(in1);
 	ptr2 = ARRPTR(in2);
-	data1 = STRPTR(in1);
-	data2 = STRPTR(in2);
-	i1 = in1->size;
-	i2 = in2->size;
+	i1 = TS_COUNT(in1);
+	i2 = TS_COUNT(in2);
 
 	/*
 	 * Conservative estimate of space needed.  We might need all the data in
-	 * both inputs, and conceivably add a pad byte before position data for
-	 * each item where there was none before.
+	 * both inputs, and conceivably add a pad bytes before lexeme and position
+	 * data, and pad bytes before WordEntry for offset entry.
 	 */
-	output_bytes = VARSIZE(in1) + VARSIZE(in2) + i1 + i2;
+	output_bytes = VARSIZE(in1) + VARSIZE(in2) + i1 * 2 + i2 * 2;
+	output_bytes += 4 * (i1 + i2) / TS_OFFSET_STRIDE;
 
 	out = (TSVector) palloc0(output_bytes);
 	SET_VARSIZE(out, output_bytes);
@@ -990,91 +1102,110 @@ tsvector_concat(PG_FUNCTION_ARGS)
 	 * We must make out->size valid so that STRPTR(out) is sensible.  We'll
 	 * collapse out any unused space at the end.
 	 */
-	out->size = in1->size + in2->size;
+	TS_SETCOUNT(out, i1 + i2);
 
-	ptr = ARRPTR(out);
+	ptr = NULL;
 	data = STRPTR(out);
+	i = 0;
 	dataoff = 0;
+
+	INITPOS(pos1);
+	INITPOS(pos2);
+
+	/*
+	 * we will need max position from first tsvector to add it positions of
+	 * second tsvector
+	 */
+	maxpos = get_maxpos(in1);
+
 	while (i1 && i2)
 	{
-		int			cmp = compareEntry(data1, ptr1, data2, ptr2);
+		char   *lex = STRPTR(in1) + pos1,
+			   *lex2 = STRPTR(in2) + pos2;
+
+		int		lex_len = ENTRY_LEN(in1, ptr1),
+				lex2_len = ENTRY_LEN(in2, ptr2);
+
+		int		cmp = tsCompareString(lex, lex_len,	lex2, lex2_len,	false);
 
 		if (cmp < 0)
 		{						/* in1 first */
-			ptr->haspos = ptr1->haspos;
-			ptr->len = ptr1->len;
-			memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len);
-			ptr->pos = dataoff;
-			dataoff += ptr1->len;
-			if (ptr->haspos)
-			{
-				dataoff = SHORTALIGN(dataoff);
-				memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
-				dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
-			}
+			tsvector_addlexeme(out, i, &dataoff,
+					lex, lex_len,
+					POSDATAPTR(lex, lex_len), ENTRY_NPOS(in1, ptr1));
 
-			ptr++;
-			ptr1++;
+			INCRPTR(in1, ptr1, pos1);
 			i1--;
+			i++;
 		}
 		else if (cmp > 0)
 		{						/* in2 first */
-			ptr->haspos = ptr2->haspos;
-			ptr->len = ptr2->len;
-			memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len);
-			ptr->pos = dataoff;
-			dataoff += ptr2->len;
-			if (ptr->haspos)
+			char		*new_lex;
+			WordEntry	*we = UNWRAP_ENTRY(in2, ptr2);
+
+			new_lex = tsvector_addlexeme(out, i, &dataoff, lex2, lex2_len, NULL, 0);
+			if (we->npos > 0)
 			{
-				int			addlen = add_pos(in2, ptr2, out, ptr, maxpos);
+				int				addlen;
+				WordEntryPos   *apos = POSDATAPTR(new_lex, lex2_len);
 
-				if (addlen == 0)
-					ptr->haspos = 0;
-				else
+				addlen = add_pos(lex2, we, apos, 0, maxpos);
+				if (addlen > 0)
 				{
+					ptr = UNWRAP_ENTRY(out, ARRPTR(out) + i);
+					ptr->npos = addlen;
 					dataoff = SHORTALIGN(dataoff);
-					dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16);
+					dataoff += ptr->npos * sizeof(WordEntryPos);
 				}
 			}
 
-			ptr++;
-			ptr2++;
+			INCRPTR(in2, ptr2, pos2);
+			i++;
 			i2--;
 		}
 		else
 		{
-			ptr->haspos = ptr1->haspos | ptr2->haspos;
-			ptr->len = ptr1->len;
-			memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len);
-			ptr->pos = dataoff;
-			dataoff += ptr1->len;
-			if (ptr->haspos)
+			char		   *new_lex;
+			int				npos1 = ENTRY_NPOS(in1, ptr1),
+							npos2 = ENTRY_NPOS(in2, ptr2);
+			WordEntryPos   *apos;
+
+			new_lex = tsvector_addlexeme(out, i, &dataoff, lex, lex_len, NULL, 0);
+			apos = POSDATAPTR(new_lex, lex_len);
+
+			if (npos1 || npos2)
 			{
-				if (ptr1->haspos)
-				{
-					dataoff = SHORTALIGN(dataoff);
-					memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
-					dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
-					if (ptr2->haspos)
-						dataoff += add_pos(in2, ptr2, out, ptr, maxpos) * sizeof(WordEntryPos);
-				}
-				else			/* must have ptr2->haspos */
+				int			addlen;
+				char	   *lex2 = STRPTR(in2) + pos2;
+
+				ptr = UNWRAP_ENTRY(out, ARRPTR(out) + i);
+				if (npos1)
 				{
-					int			addlen = add_pos(in2, ptr2, out, ptr, maxpos);
+					/* add positions from left tsvector */
+					addlen = add_pos(lex, UNWRAP_ENTRY(in1, ptr1), apos, 0, 0);
+					ptr->npos = addlen;
 
-					if (addlen == 0)
-						ptr->haspos = 0;
-					else
+					if (npos2)
 					{
-						dataoff = SHORTALIGN(dataoff);
-						dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16);
+						/* add positions from right right tsvector */
+						addlen = add_pos(lex2, UNWRAP_ENTRY(in2, ptr2), apos, addlen, maxpos);
+						ptr->npos += addlen;
 					}
 				}
+				else	/* npos in second should be > 0 */
+				{
+					/* add positions from right tsvector */
+					addlen = add_pos(lex2, UNWRAP_ENTRY(in2, ptr2), apos, 0, maxpos);
+					ptr->npos = addlen;
+				}
+
+				dataoff = SHORTALIGN(dataoff);
+				dataoff += ptr->npos * sizeof(WordEntryPos);
 			}
 
-			ptr++;
-			ptr1++;
-			ptr2++;
+			INCRPTR(in1, ptr1, pos1);
+			INCRPTR(in2, ptr2, pos2);
+			i++;
 			i1--;
 			i2--;
 		}
@@ -1082,45 +1213,43 @@ tsvector_concat(PG_FUNCTION_ARGS)
 
 	while (i1)
 	{
-		ptr->haspos = ptr1->haspos;
-		ptr->len = ptr1->len;
-		memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len);
-		ptr->pos = dataoff;
-		dataoff += ptr1->len;
-		if (ptr->haspos)
-		{
-			dataoff = SHORTALIGN(dataoff);
-			memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
-			dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
-		}
+		char   *lex = STRPTR(in1) + pos1;
+		int		lex_len = ENTRY_LEN(in1, ptr1);
 
-		ptr++;
-		ptr1++;
+		tsvector_addlexeme(out, i, &dataoff,
+				lex, lex_len,
+				POSDATAPTR(lex, lex_len), ENTRY_NPOS(in1, ptr1));
+
+		INCRPTR(in1, ptr1, pos1);
+		i++;
 		i1--;
 	}
 
 	while (i2)
 	{
-		ptr->haspos = ptr2->haspos;
-		ptr->len = ptr2->len;
-		memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len);
-		ptr->pos = dataoff;
-		dataoff += ptr2->len;
-		if (ptr->haspos)
+		char   *lex = STRPTR(in2) + pos2,
+			   *new_lex;
+		int		lex_len = ENTRY_LEN(in2, ptr2),
+				npos = ENTRY_NPOS(in2, ptr2);
+
+		new_lex = tsvector_addlexeme(out, i, &dataoff, lex, lex_len, NULL, 0);
+		if (npos > 0)
 		{
-			int			addlen = add_pos(in2, ptr2, out, ptr, maxpos);
+			int				addlen;
+			WordEntryPos   *apos = POSDATAPTR(new_lex, lex_len);
 
-			if (addlen == 0)
-				ptr->haspos = 0;
-			else
+			addlen = add_pos(lex, UNWRAP_ENTRY(in2, ptr2), apos, 0, maxpos);
+			if (addlen > 0)
 			{
+				WordEntry	*ptr = UNWRAP_ENTRY(out, ARRPTR(out) + i);
+				ptr->npos = addlen;
 				dataoff = SHORTALIGN(dataoff);
-				dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16);
+				dataoff += npos * sizeof(WordEntryPos);
 			}
 		}
 
-		ptr++;
-		ptr2++;
+		INCRPTR(in2, ptr2, pos2);
+		i++;
 		i2--;
 	}
 
@@ -1137,12 +1266,10 @@ tsvector_concat(PG_FUNCTION_ARGS)
 	 * Adjust sizes (asserting that we didn't overrun the original estimates)
 	 * and collapse out any unused array entries.
 	 */
-	output_size = ptr - ARRPTR(out);
-	Assert(output_size <= out->size);
-	out->size = output_size;
+	TS_SETCOUNT(out, i);
 	if (data != STRPTR(out))
 		memmove(STRPTR(out), data, dataoff);
-	output_bytes = CALCDATASIZE(out->size, dataoff);
+	output_bytes = CALCDATASIZE(TS_COUNT(out), dataoff);
 	Assert(output_bytes <= VARSIZE(out));
 	SET_VARSIZE(out, output_bytes);
 
@@ -1194,35 +1321,26 @@ tsCompareString(char *a, int lena, char *b, int lenb, bool prefix)
  * Check weight info or/and fill 'data' with the required positions
  */
 static bool
-checkclass_str(CHKVAL *chkval, WordEntry *entry, QueryOperand *val,
+checkclass_str(WordEntryPos *pv, int npos, QueryOperand *val,
 			   ExecPhraseData *data)
 {
 	bool		result = false;
 
-	if (entry->haspos && (val->weight || data))
+	if (npos && (val->weight || data))
 	{
-		WordEntryPosVector *posvec;
-
-		/*
-		 * We can't use the _POSVECPTR macro here because the pointer to the
-		 * tsvector's lexeme storage is already contained in chkval->values.
-		 */
-		posvec = (WordEntryPosVector *)
-			(chkval->values + SHORTALIGN(entry->pos + entry->len));
-
 		if (val->weight && data)
 		{
-			WordEntryPos *posvec_iter = posvec->pos;
+			WordEntryPos *posvec_iter = pv;
 			WordEntryPos *dptr;
 
 			/*
 			 * Filter position information by weights
 			 */
-			dptr = data->pos = palloc(sizeof(WordEntryPos) * posvec->npos);
+			dptr = data->pos = palloc(sizeof(WordEntryPos) * npos);
 			data->allocated = true;
 
 			/* Is there a position with a matching weight? */
-			while (posvec_iter < posvec->pos + posvec->npos)
+			while (posvec_iter < (pv + npos))
 			{
 				/* If true, append this position to the data->pos */
 				if (val->weight & (1 << WEP_GETWEIGHT(*posvec_iter)))
@@ -1241,10 +1359,10 @@ checkclass_str(CHKVAL *chkval, WordEntry *entry, QueryOperand *val,
 		}
 		else if (val->weight)
 		{
-			WordEntryPos *posvec_iter = posvec->pos;
+			WordEntryPos *posvec_iter = pv;
 
 			/* Is there a position with a matching weight? */
-			while (posvec_iter < posvec->pos + posvec->npos)
+			while (posvec_iter < (pv + npos))
 			{
 				if (val->weight & (1 << WEP_GETWEIGHT(*posvec_iter)))
 				{
@@ -1257,8 +1375,8 @@ checkclass_str(CHKVAL *chkval, WordEntry *entry, QueryOperand *val,
 		}
 		else					/* data != NULL */
 		{
-			data->npos = posvec->npos;
-			data->pos = posvec->pos;
+			data->npos = npos;
+			data->pos = pv;
 			data->allocated = false;
 			result = true;
 		}
@@ -1311,26 +1429,32 @@ static bool
 checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data)
 {
 	CHKVAL	   *chkval = (CHKVAL *) checkval;
-	WordEntry  *StopLow = chkval->arrb;
-	WordEntry  *StopHigh = chkval->arre;
-	WordEntry  *StopMiddle = StopHigh;
+	int			StopLow = chkval->bidx;
+	int			StopHigh = chkval->eidx;
+	int			StopMiddle = StopHigh;
 	int			difference = -1;
 	bool		res = false;
+	char	   *lexeme;
+	WordEntry  *entry;
 
 	/* Loop invariant: StopLow <= val < StopHigh */
 	while (StopLow < StopHigh)
 	{
 		StopMiddle = StopLow + (StopHigh - StopLow) / 2;
+		lexeme = tsvector_getlexeme(chkval->vec, StopMiddle, &entry);
+
+		Assert(!entry->hasoff);
 		difference = tsCompareString(chkval->operand + val->distance,
 									 val->length,
-									 chkval->values + StopMiddle->pos,
-									 StopMiddle->len,
+									 lexeme,
+									 entry->len,
 									 false);
 
 		if (difference == 0)
 		{
 			/* Check weight info & fill 'data' with positions */
-			res = checkclass_str(chkval, StopMiddle, val, data);
+			res = checkclass_str(POSDATAPTR(lexeme, entry->len),
+								 entry->npos, val, data);
 			break;
 		}
 		else if (difference > 0)
@@ -1352,19 +1476,31 @@ checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data)
 		if (StopLow >= StopHigh)
 			StopMiddle = StopHigh;
 
-		while ((!res || data) && StopMiddle < chkval->arre &&
-			   tsCompareString(chkval->operand + val->distance,
-							   val->length,
-							   chkval->values + StopMiddle->pos,
-							   StopMiddle->len,
-							   true) == 0)
+		while ((!res || data) && StopMiddle < chkval->eidx)
 		{
+			char		   *lexeme;
+			int				cmp;
+			WordEntryPos   *pv;
+
+			lexeme = tsvector_getlexeme(chkval->vec, StopMiddle, &entry);
+
+			Assert(!entry->hasoff);
+			pv = POSDATAPTR(lexeme, entry->len);
+			cmp = tsCompareString(chkval->operand + val->distance,
+								  val->length,
+								  lexeme,
+								  entry->len,
+								  true);
+
+			if (cmp != 0)
+				break;
+
 			if (data)
 			{
 				/*
 				 * We need to join position information
 				 */
-				res = checkclass_str(chkval, StopMiddle, val, data);
+				res = checkclass_str(pv, entry->npos, val, data);
 
 				if (res)
 				{
@@ -1388,7 +1524,7 @@ checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data)
 			}
 			else
 			{
-				res = checkclass_str(chkval, StopMiddle, val, NULL);
+				res = checkclass_str(pv, entry->npos, val, NULL);
 			}
 
 			StopMiddle++;
@@ -1935,9 +2071,9 @@ ts_match_vq(PG_FUNCTION_ARGS)
 		PG_RETURN_BOOL(false);
 	}
 
-	chkval.arrb = ARRPTR(val);
-	chkval.arre = chkval.arrb + val->size;
-	chkval.values = STRPTR(val);
+	chkval.bidx = 0;
+	chkval.eidx = TS_COUNT(val);
+	chkval.vec = val;
 	chkval.operand = GETOPERAND(query);
 	result = TS_execute(GETQUERY(query),
 						&chkval,
@@ -2001,12 +2137,15 @@ ts_match_tq(PG_FUNCTION_ARGS)
  * that have a weight equal to one of the weights in 'weight' bitmask.
  */
 static int
-check_weight(TSVector txt, WordEntry *wptr, int8 weight)
+check_weight(char *lexeme, WordEntry *wptr, int8 weight)
 {
-	int			len = POSDATALEN(txt, wptr);
-	int			num = 0;
-	WordEntryPos *ptr = POSDATAPTR(txt, wptr);
+	int				len;
+	int				num = 0;
+	WordEntryPos   *ptr;
 
+	Assert(!wptr->hasoff);
+	len = wptr->len;
+	ptr = POSDATAPTR(lexeme, len);
 	while (len--)
 	{
 		if (weight & (1 << WEP_GETWEIGHT(*ptr)))
@@ -2017,31 +2156,34 @@ check_weight(TSVector txt, WordEntry *wptr, int8 weight)
 }
 
 #define compareStatWord(a,e,t)							\
-	tsCompareString((a)->lexeme, (a)->lenlexeme,		\
-					STRPTR(t) + (e)->pos, (e)->len,		\
-					false)
+	(tsCompareString((a)->lexeme, (a)->lenlexeme,		\
+					t, (e)->len, false))
 
 static void
 insertStatEntry(MemoryContext persistentContext, TSVectorStat *stat, TSVector txt, uint32 off)
 {
-	WordEntry  *we = ARRPTR(txt) + off;
+	WordEntry  *we;
 	StatEntry  *node = stat->root,
 			   *pnode = NULL;
 	int			n,
 				res = 0;
 	uint32		depth = 1;
+	char	   *lexeme;
+
+	lexeme = tsvector_getlexeme(txt, off, &we);
 
+	Assert(!we->hasoff);
 	if (stat->weight == 0)
-		n = (we->haspos) ? POSDATALEN(txt, we) : 1;
+		n = (we->npos) ? we->npos : 1;
 	else
-		n = (we->haspos) ? check_weight(txt, we, stat->weight) : 0;
+		n = (we->npos) ? check_weight(lexeme, we, stat->weight) : 0;
 
 	if (n == 0)
 		return;					/* nothing to insert */
 
 	while (node)
 	{
-		res = compareStatWord(node, we, txt);
+		res = compareStatWord(node, we, lexeme);
 
 		if (res == 0)
 		{
@@ -2065,7 +2207,7 @@ insertStatEntry(MemoryContext persistentContext, TSVectorStat *stat, TSVector tx
 		node->ndoc = 1;
 		node->nentry = n;
 		node->lenlexeme = we->len;
-		memcpy(node->lexeme, STRPTR(txt) + we->pos, node->lenlexeme);
+		memcpy(node->lexeme, lexeme, node->lenlexeme);
 
 		if (pnode == NULL)
 		{
@@ -2092,13 +2234,14 @@ chooseNextStatEntry(MemoryContext persistentContext, TSVectorStat *stat, TSVecto
 					uint32 low, uint32 high, uint32 offset)
 {
 	uint32		pos;
-	uint32		middle = (low + high) >> 1;
+	uint32		middle = (low + high) >> 1,
+				count = TS_COUNT(txt);
 
 	pos = (low + middle) >> 1;
-	if (low != middle && pos >= offset && pos - offset < txt->size)
+	if (low != middle && pos >= offset && pos - offset < count)
 		insertStatEntry(persistentContext, stat, txt, pos - offset);
 	pos = (high + middle + 1) >> 1;
-	if (middle + 1 != high && pos >= offset && pos - offset < txt->size)
+	if (middle + 1 != high && pos >= offset && pos - offset < count)
 		insertStatEntry(persistentContext, stat, txt, pos - offset);
 
 	if (low != middle)
@@ -2125,7 +2268,8 @@ ts_accum(MemoryContext persistentContext, TSVectorStat *stat, Datum data)
 	TSVector	txt = DatumGetTSVector(data);
 	uint32		i,
 				nbit = 0,
-				offset;
+				offset,
+				count = TS_COUNT(txt);
 
 	if (stat == NULL)
 	{							/* Init in first */
@@ -2134,19 +2278,19 @@ ts_accum(MemoryContext persistentContext, TSVectorStat *stat, Datum data)
 	}
 
 	/* simple check of correctness */
-	if (txt == NULL || txt->size == 0)
+	if (txt == NULL || count == 0)
 	{
 		if (txt && txt != (TSVector) DatumGetPointer(data))
 			pfree(txt);
 		return stat;
 	}
 
-	i = txt->size - 1;
+	i = count - 1;
 	for (; i > 0; i >>= 1)
 		nbit++;
 
 	nbit = 1 << nbit;
-	offset = (nbit - txt->size) / 2;
+	offset = (nbit - count) / 2;
 
 	insertStatEntry(persistentContext, stat, txt, (nbit >> 1) - offset);
 	chooseNextStatEntry(persistentContext, stat, txt, 0, nbit, offset);
@@ -2579,15 +2723,28 @@ tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column)
 	}
 
 	/* make tsvector value */
-	datum = TSVectorGetDatum(make_tsvector(&prs));
-	isnull = false;
-
-	/* and insert it into tuple */
-	rettuple = heap_modify_tuple_by_cols(rettuple, rel->rd_att,
-										 1, &tsvector_attr_num,
-										 &datum, &isnull);
-
-	pfree(DatumGetPointer(datum));
+	if (prs.curwords)
+	{
+		datum = PointerGetDatum(make_tsvector(&prs));
+		isnull = false;
+		rettuple = heap_modify_tuple_by_cols(rettuple, rel->rd_att,
+											 1, &tsvector_attr_num,
+											 &datum, &isnull);
+		pfree(DatumGetPointer(datum));
+	}
+	else
+	{
+		TSVector	out = palloc(CALCDATASIZE(0, 0));
+
+		SET_VARSIZE(out, CALCDATASIZE(0, 0));
+		TS_SETCOUNT(out, 0);
+		datum = PointerGetDatum(out);
+		isnull = false;
+		rettuple = heap_modify_tuple_by_cols(rettuple, rel->rd_att,
+											 1, &tsvector_attr_num,
+											 &datum, &isnull);
+		pfree(prs.words);
+	}
 
 	return PointerGetDatum(rettuple);
 }
diff --git a/src/include/tsearch/ts_type.h b/src/include/tsearch/ts_type.h
index 30d7c4bccd..47aa498432 100644
--- a/src/include/tsearch/ts_type.h
+++ b/src/include/tsearch/ts_type.h
@@ -24,30 +24,38 @@
  * 2) int32		size - number of lexemes (WordEntry array entries)
  * 3) Array of WordEntry - one per lexeme; must be sorted according to
  *				tsCompareString() (ie, memcmp of lexeme strings).
- *				WordEntry->pos gives the number of bytes from end of WordEntry
- *				array to start of lexeme's string, which is of length len.
+ *	  WordEntry have two types: offset or metadata (length of lexeme and number
+ *	  of positions). If it has offset then metadata will be by this offset.
  * 4) Per-lexeme data storage:
- *	  lexeme string (not null-terminated)
- *	  if haspos is true:
+ *    [4-byte aligned WordEntry] (if its WordEntry has offset)
+ *	  2-byte aligned lexeme string (not null-terminated)
+ *	  if it has positions:
  *		padding byte if necessary to make the position data 2-byte aligned
- *		uint16			number of positions that follow
  *		WordEntryPos[]	positions
  *
  * The positions for each lexeme must be sorted.
  *
- * Note, tsvectorsend/recv believe that sizeof(WordEntry) == 4
+ * Note, tsvector functions believe that sizeof(WordEntry) == 4
  */
 
-typedef struct
+#define TS_OFFSET_STRIDE 4
+
+typedef union
 {
-	uint32
-				haspos:1,
-				len:11,			/* MAX 2Kb */
-				pos:20;			/* MAX 1Mb */
+	struct {
+		uint32 hasoff: 1,
+			   offset: 31;
+	};
+	struct {
+		uint32 hasoff_: 1,
+			   len:11,
+			   npos: 16,
+			   _unused: 4;
+	};
 } WordEntry;
 
 #define MAXSTRLEN ( (1<<11) - 1)
-#define MAXSTRPOS ( (1<<20) - 1)
+#define MAXSTRPOS ( (1<<30) - 1)
 
 extern int	compareWordEntryPos(const void *a, const void *b);
 
@@ -62,19 +70,6 @@ extern int	compareWordEntryPos(const void *a, const void *b);
 
 typedef uint16 WordEntryPos;
 
-typedef struct
-{
-	uint16		npos;
-	WordEntryPos pos[FLEXIBLE_ARRAY_MEMBER];
-} WordEntryPosVector;
-
-/* WordEntryPosVector with exactly 1 entry */
-typedef struct
-{
-	uint16		npos;
-	WordEntryPos pos[1];
-} WordEntryPosVector1;
-
 
 #define WEP_GETWEIGHT(x)	( (x) >> 14 )
 #define WEP_GETPOS(x)		( (x) & 0x3fff )
@@ -90,13 +85,17 @@ typedef struct
 typedef struct
 {
 	int32		vl_len_;		/* varlena header (do not touch directly!) */
-	int32		size;
+	int32		size_;			/* flags and lexemes count */
 	WordEntry	entries[FLEXIBLE_ARRAY_MEMBER];
 	/* lexemes follow the entries[] array */
 } TSVectorData;
 
 typedef TSVectorData *TSVector;
 
+#define TS_FLAG_STRETCHED 0x80000000
+#define TS_COUNT(t) ((t)->size_ & 0x0FFFFFFF)
+#define TS_SETCOUNT(t,c) ((t)->size_ = (c) | TS_FLAG_STRETCHED)
+
 #define DATAHDRSIZE (offsetof(TSVectorData, entries))
 #define CALCDATASIZE(nentries, lenstr) (DATAHDRSIZE + (nentries) * sizeof(WordEntry) + (lenstr) )
 
@@ -104,24 +103,65 @@ typedef TSVectorData *TSVector;
 #define ARRPTR(x)	( (x)->entries )
 
 /* pointer to start of a tsvector's lexeme storage */
-#define STRPTR(x)	( (char *) &(x)->entries[(x)->size] )
+#define STRPTR(x)	( (char *) &(x)->entries[TS_COUNT(x)] )
 
-#define _POSVECPTR(x, e)	((WordEntryPosVector *)(STRPTR(x) + SHORTALIGN((e)->pos + (e)->len)))
-#define POSDATALEN(x,e) ( ( (e)->haspos ) ? (_POSVECPTR(x,e)->npos) : 0 )
-#define POSDATAPTR(x,e) (_POSVECPTR(x,e)->pos)
+/* for WordEntry with offset return its WordEntry with other properties */
+#define UNWRAP_ENTRY(x,we) \
+	((we)->hasoff? (WordEntry *)(STRPTR(x) + (we)->offset): (we))
+
+/*
+ * helpers used when we're not sure that WordEntry
+ * contains ether offset or len
+ */
+#define ENTRY_NPOS(x,we) (UNWRAP_ENTRY(x,we)->npos)
+#define ENTRY_LEN(x,we) (UNWRAP_ENTRY(x,we)->len)
+
+/* pointer to start of positions */
+#define POSDATAPTR(lex, len) ((WordEntryPos *) (lex + SHORTALIGN(len)))
+
+/* set default offset in tsvector data */
+#define INITPOS(p) ((p) = sizeof(WordEntry))
+
+/* increment entry and offset by given WordEntry */
+#define INCRPTR(x,w,p) \
+do { \
+	WordEntry *y = (w);									\
+	if ((w)->hasoff)									\
+	{													\
+		y = (WordEntry *) (STRPTR(x) + (w)->offset);	\
+		(p) = (w)->offset + sizeof(WordEntry);			\
+	}													\
+	(w)++;												\
+	Assert(!y->hasoff);									\
+	(p) += SHORTALIGN(y->len) + y->npos * sizeof(WordEntryPos); \
+	if ((w) - ARRPTR(x) < TS_COUNT(x) && w->hasoff)		\
+		(p) = INTALIGN(p) + sizeof(WordEntry);			\
+} while (0);
+
+/* used to calculate tsvector size in in tsvector constructors */
+#define INCRSIZE(s,i,l,n) /* size,index,len,npos */		\
+do {													\
+	if ((i) % TS_OFFSET_STRIDE == 0)					\
+		(s) = INTALIGN(s) + sizeof(WordEntry);			\
+	else												\
+		(s) = SHORTALIGN(s);							\
+	(s) += (l);											\
+	(s) = (n)? SHORTALIGN(s) + (n) * sizeof(WordEntryPos) : (s);	\
+} while (0);
 
 /*
  * fmgr interface macros
  */
 
-#define DatumGetTSVector(X)			((TSVector) PG_DETOAST_DATUM(X))
-#define DatumGetTSVectorCopy(X)		((TSVector) PG_DETOAST_DATUM_COPY(X))
+TSVector tsvector_upgrade(Datum orig, bool copy);
+
+#define DatumGetTSVector(X)			tsvector_upgrade((X), false)
+#define DatumGetTSVectorCopy(X)		tsvector_upgrade((X), true)
 #define TSVectorGetDatum(X)			PointerGetDatum(X)
 #define PG_GETARG_TSVECTOR(n)		DatumGetTSVector(PG_GETARG_DATUM(n))
 #define PG_GETARG_TSVECTOR_COPY(n)	DatumGetTSVectorCopy(PG_GETARG_DATUM(n))
 #define PG_RETURN_TSVECTOR(x)		return TSVectorGetDatum(x)
 
-
 /*
  * TSQuery
  *
@@ -239,4 +279,22 @@ typedef TSQueryData *TSQuery;
 #define PG_GETARG_TSQUERY_COPY(n)	DatumGetTSQueryCopy(PG_GETARG_DATUM(n))
 #define PG_RETURN_TSQUERY(x)		return TSQueryGetDatum(x)
 
+int tsvector_getoffset(TSVector vec, int idx, WordEntry **we);
+char *tsvector_addlexeme(TSVector tsv, int idx, int *dataoff,
+		char *lexeme, int lexeme_len, WordEntryPos *pos, int npos);
+
+/* Returns lexeme and its entry by given index from TSVector */
+inline static char *
+tsvector_getlexeme(TSVector vec, int idx, WordEntry **we)
+{
+	Assert(idx >=0 && idx < TS_COUNT(vec));
+
+	/*
+	 * we do not allow we == NULL because returned lexeme is not \0 ended,
+	 * and always should be used with we->len
+	 */
+	Assert(we != NULL);
+	return STRPTR(vec) + tsvector_getoffset(vec, idx, we);
+}
+
 #endif							/* _PG_TSTYPE_H_ */

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

[HACKERS] Remove 1MB size limit in tsvector

Reply via email to