On Thu, 10 Aug 2017 11:46:55 -0400 Tom Lane <t...@sss.pgh.pa.us> wrote:
> Alexander Korotkov <a.korot...@postgrespro.ru> writes:
> > ...
> > You have random mix of tabs and spaces here.
>
> It's worth running pgindent over your code before submitting. It
> should be pretty easy to set that up nowadays, see
> src/tools/pgindent/README. (If you find any portability problems
> while trying to install pgindent, please let me know.)

Attached is a new version of the patch. It mostly contains cosmetic changes. I rebased it onto current master, ran pgindent, and fixed formatting errors.

-- 
---
Ildus Kurbangaliev
Postgres Professional: http://www.postgrespro.com
Russian Postgres Company
diff --git a/src/backend/tsearch/Makefile b/src/backend/tsearch/Makefile index 34fe4c5b3c..9585a25003 100644 --- a/src/backend/tsearch/Makefile +++ b/src/backend/tsearch/Makefile @@ -26,7 +26,7 @@ DICTFILES_PATH=$(addprefix dicts/,$(DICTFILES)) OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \ dict_simple.o dict_synonym.o dict_thesaurus.o \ dict_ispell.o regis.o spell.o \ - to_tsany.o ts_selfuncs.o ts_typanalyze.o ts_utils.o + to_tsany.o ts_selfuncs.o ts_typanalyze.o ts_utils.o ts_compat.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/tsearch/to_tsany.c b/src/backend/tsearch/to_tsany.c index 35d9ab276c..aa87fd8a04 100644 --- a/src/backend/tsearch/to_tsany.c +++ b/src/backend/tsearch/to_tsany.c @@ -156,13 +156,10 @@ TSVector make_tsvector(ParsedText *prs) { int i, - j, lenstr = 0, - totallen; + totallen, + stroff = 0; TSVector in; - WordEntry *ptr; - char *str; - int stroff; /* Merge duplicate words */ if (prs->curwords > 0) @@ -171,12 +168,9 @@ make_tsvector(ParsedText *prs) /* Determine space needed */ for (i = 0; i < prs->curwords; i++) { - lenstr += prs->words[i].len; - if (prs->words[i].alen) - { - lenstr = SHORTALIGN(lenstr); - lenstr += sizeof(uint16) + prs->words[i].pos.apos[0] * sizeof(WordEntryPos); - } + int npos = prs->words[i].alen ? 
prs->words[i].pos.apos[0] : 0; + + INCRSIZE(lenstr, i, prs->words[i].len, npos); } if (lenstr > MAXSTRPOS) @@ -187,41 +181,21 @@ make_tsvector(ParsedText *prs) totallen = CALCDATASIZE(prs->curwords, lenstr); in = (TSVector) palloc0(totallen); SET_VARSIZE(in, totallen); - in->size = prs->curwords; + TS_SETCOUNT(in, prs->curwords); - ptr = ARRPTR(in); - str = STRPTR(in); - stroff = 0; for (i = 0; i < prs->curwords; i++) { - ptr->len = prs->words[i].len; - ptr->pos = stroff; - memcpy(str + stroff, prs->words[i].word, prs->words[i].len); - stroff += prs->words[i].len; - pfree(prs->words[i].word); + int npos = 0; + if (prs->words[i].alen) - { - int k = prs->words[i].pos.apos[0]; - WordEntryPos *wptr; + npos = prs->words[i].pos.apos[0]; - if (k > 0xFFFF) - elog(ERROR, "positions array too long"); + tsvector_addlexeme(in, i, &stroff, prs->words[i].word, prs->words[i].len, + prs->words[i].pos.apos + 1, npos); - ptr->haspos = 1; - stroff = SHORTALIGN(stroff); - *(uint16 *) (str + stroff) = (uint16) k; - wptr = POSDATAPTR(in, ptr); - for (j = 0; j < k; j++) - { - WEP_SETWEIGHT(wptr[j], 0); - WEP_SETPOS(wptr[j], prs->words[i].pos.apos[j + 1]); - } - stroff += sizeof(uint16) + k * sizeof(WordEntryPos); + pfree(prs->words[i].word); + if (prs->words[i].alen) pfree(prs->words[i].pos.apos); - } - else - ptr->haspos = 0; - ptr++; } if (prs->words) @@ -251,7 +225,6 @@ to_tsvector_byid(PG_FUNCTION_ARGS) PG_FREE_IF_COPY(in, 1); out = make_tsvector(&prs); - PG_RETURN_TSVECTOR(out); } diff --git a/src/backend/tsearch/ts_compat.c b/src/backend/tsearch/ts_compat.c new file mode 100644 index 0000000000..bc45109241 --- /dev/null +++ b/src/backend/tsearch/ts_compat.c @@ -0,0 +1,84 @@ +#include "postgres.h" +#include "tsearch/ts_type.h" + +/* + * Definition of old WordEntry struct in TSVector. 
Because of limitations + * in size (max 1MB for lexemes), the format has changed + */ +typedef struct +{ + uint32 + haspos:1, + len:11, + pos:20; +} OldWordEntry; + +typedef struct +{ + uint16 npos; + WordEntryPos pos[FLEXIBLE_ARRAY_MEMBER]; +} OldWordEntryPosVector; + +#define OLDSTRPTR(x) ( (char *) &(x)->entries[x->size_] ) +#define _OLDPOSVECPTR(x, e) \ + ((OldWordEntryPosVector *)(STRPTR(x) + SHORTALIGN((e)->pos + (e)->len))) +#define OLDPOSDATALEN(x,e) ( ( (e)->haspos ) ? (_OLDPOSVECPTR(x,e)->npos) : 0 ) +#define OLDPOSDATAPTR(x,e) (_OLDPOSVECPTR(x,e)->pos) + +/* + * Converts a tsvector with the old structure to the current one. + * Can return a copy of the tsvector, but that only matters when the tsvector + * doesn't need to be converted. + */ +TSVector +tsvector_upgrade(Datum orig, bool copy) +{ + int i, + dataoff = 0, + datalen = 0, + totallen; + TSVector in, + out; + + in = (TSVector) PG_DETOAST_DATUM(orig); + + /* If already in new format, return as is */ + if (in->size_ & TS_FLAG_STRETCHED) + { + TSVector out; + + if (!copy) + return in; + + out = (TSVector) palloc(VARSIZE(in)); + memcpy(out, in, VARSIZE(in)); + return out; + } + + /* + * Calculate required size. 
We don't check any sizes here because old + * format was limited with 1MB + */ + for (i = 0; i < in->size_; i++) + { + OldWordEntry *entry = (OldWordEntry *) (in->entries + i); + + INCRSIZE(datalen, i, entry->len, OLDPOSDATALEN(in, entry)); + } + + totallen = CALCDATASIZE(in->size_, datalen); + out = (TSVector) palloc0(totallen); + SET_VARSIZE(out, totallen); + TS_SETCOUNT(out, in->size_); + + for (i = 0; i < in->size_; i++) + { + OldWordEntry *entry = (OldWordEntry *) (in->entries + i); + + tsvector_addlexeme(out, i, &dataoff, + OLDSTRPTR(in) + entry->pos, entry->len, + OLDPOSDATAPTR(in, entry), OLDPOSDATALEN(in, entry)); + } + + return out; +} diff --git a/src/backend/tsearch/ts_typanalyze.c b/src/backend/tsearch/ts_typanalyze.c index 320c7f1a61..9b2fc4be04 100644 --- a/src/backend/tsearch/ts_typanalyze.c +++ b/src/backend/tsearch/ts_typanalyze.c @@ -202,7 +202,8 @@ compute_tsvector_stats(VacAttrStats *stats, TSVector vector; WordEntry *curentryptr; char *lexemesptr; - int j; + int j, + pos; vacuum_delay_point(); @@ -236,7 +237,9 @@ compute_tsvector_stats(VacAttrStats *stats, */ lexemesptr = STRPTR(vector); curentryptr = ARRPTR(vector); - for (j = 0; j < vector->size; j++) + + INITPOS(pos); + for (j = 0; j < TS_COUNT(vector); j++) { bool found; @@ -246,8 +249,8 @@ compute_tsvector_stats(VacAttrStats *stats, * make a copy of it. This way we can free the tsvector value * once we've processed all its lexemes. */ - hash_key.lexeme = lexemesptr + curentryptr->pos; - hash_key.length = curentryptr->len; + hash_key.lexeme = lexemesptr + pos; + hash_key.length = ENTRY_LEN(vector, curentryptr); /* Lookup current lexeme in hashtable, adding it if new */ item = (TrackItem *) hash_search(lexemes_tab, @@ -280,7 +283,7 @@ compute_tsvector_stats(VacAttrStats *stats, } /* Advance to the next WordEntry in the tsvector */ - curentryptr++; + INCRPTR(vector, curentryptr, pos); } /* If the vector was toasted, free the detoasted copy. 
*/ diff --git a/src/backend/utils/adt/tsginidx.c b/src/backend/utils/adt/tsginidx.c index 83a939dfd5..75a4364b94 100644 --- a/src/backend/utils/adt/tsginidx.c +++ b/src/backend/utils/adt/tsginidx.c @@ -67,23 +67,27 @@ gin_extract_tsvector(PG_FUNCTION_ARGS) TSVector vector = PG_GETARG_TSVECTOR(0); int32 *nentries = (int32 *) PG_GETARG_POINTER(1); Datum *entries = NULL; + int tscount = TS_COUNT(vector); - *nentries = vector->size; - if (vector->size > 0) + *nentries = tscount; + if (tscount > 0) { int i; + uint32 pos; + WordEntry *we = ARRPTR(vector); - entries = (Datum *) palloc(sizeof(Datum) * vector->size); + entries = (Datum *) palloc(sizeof(Datum) * tscount); - for (i = 0; i < vector->size; i++) + INITPOS(pos); + for (i = 0; i < tscount; i++) { text *txt; - txt = cstring_to_text_with_len(STRPTR(vector) + we->pos, we->len); + txt = cstring_to_text_with_len(STRPTR(vector) + pos, + ENTRY_LEN(vector, we)); entries[i] = PointerGetDatum(txt); - - we++; + INCRPTR(vector, we, pos); } } diff --git a/src/backend/utils/adt/tsgistidx.c b/src/backend/utils/adt/tsgistidx.c index 7ce2699b5c..18d3de3725 100644 --- a/src/backend/utils/adt/tsgistidx.c +++ b/src/backend/utils/adt/tsgistidx.c @@ -192,28 +192,33 @@ gtsvector_compress(PG_FUNCTION_ARGS) int32 *arr; WordEntry *ptr = ARRPTR(val); char *words = STRPTR(val); + const int tscount = TS_COUNT(val); + uint32 pos; - len = CALCGTSIZE(ARRKEY, val->size); + len = CALCGTSIZE(ARRKEY, tscount); res = (SignTSVector *) palloc(len); SET_VARSIZE(res, len); res->flag = ARRKEY; arr = GETARR(res); - len = val->size; + len = tscount; + + INITPOS(pos); while (len--) { pg_crc32 c; INIT_LEGACY_CRC32(c); - COMP_LEGACY_CRC32(c, words + ptr->pos, ptr->len); + COMP_LEGACY_CRC32(c, words + pos, ENTRY_LEN(val, ptr)); FIN_LEGACY_CRC32(c); *arr = *(int32 *) &c; arr++; - ptr++; + + INCRPTR(val, ptr, pos); } - len = uniqueint(GETARR(res), val->size); - if (len != val->size) + len = uniqueint(GETARR(res), tscount); + if (len != tscount) { /* * there is a 
collision of hash-function; len is always less than diff --git a/src/backend/utils/adt/tsrank.c b/src/backend/utils/adt/tsrank.c index 4577bcc0b8..cb859d9b47 100644 --- a/src/backend/utils/adt/tsrank.c +++ b/src/backend/utils/adt/tsrank.c @@ -53,43 +53,39 @@ word_distance(int32 w) static int cnt_length(TSVector t) { - WordEntry *ptr = ARRPTR(t), - *end = (WordEntry *) STRPTR(t); - int len = 0; + int i, + len = 0; - while (ptr < end) + for (i = 0; i < TS_COUNT(t); i++) { - int clen = POSDATALEN(t, ptr); - - if (clen == 0) - len += 1; - else - len += clen; + WordEntry *entry = UNWRAP_ENTRY(t, ARRPTR(t) + i); - ptr++; + Assert(!entry->hasoff); + len += (entry->npos == 0) ? 1 : entry->npos; } return len; } -#define WordECompareQueryItem(e,q,p,i,m) \ - tsCompareString((q) + (i)->distance, (i)->length, \ - (e) + (p)->pos, (p)->len, (m)) - - /* * Returns a pointer to a WordEntry's array corresponding to 'item' from * tsvector 't'. 'q' is the TSQuery containing 'item'. * Returns NULL if not found. 
*/ -static WordEntry * +static int find_wordentry(TSVector t, TSQuery q, QueryOperand *item, int32 *nitem) { - WordEntry *StopLow = ARRPTR(t); - WordEntry *StopHigh = (WordEntry *) STRPTR(t); - WordEntry *StopMiddle = StopHigh; +#define WordECompareQueryItem(s,l,q,i,m) \ + tsCompareString((q) + (i)->distance, (i)->length, \ + s, l, (m)) + + int StopLow = 0; + int StopHigh = TS_COUNT(t); + int StopMiddle = StopHigh; int difference; + char *lexeme; + WordEntry *we; *nitem = 0; @@ -97,7 +93,12 @@ find_wordentry(TSVector t, TSQuery q, QueryOperand *item, int32 *nitem) while (StopLow < StopHigh) { StopMiddle = StopLow + (StopHigh - StopLow) / 2; - difference = WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item, false); + lexeme = tsvector_getlexeme(t, StopMiddle, &we); + + Assert(!we->hasoff); + difference = WordECompareQueryItem(lexeme, we->len, + GETOPERAND(q), item, false); + if (difference == 0) { StopHigh = StopMiddle; @@ -117,18 +118,22 @@ find_wordentry(TSVector t, TSQuery q, QueryOperand *item, int32 *nitem) *nitem = 0; - while (StopMiddle < (WordEntry *) STRPTR(t) && - WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item, true) == 0) + while (StopMiddle < TS_COUNT(t)) { + lexeme = tsvector_getlexeme(t, StopMiddle, &we); + + Assert(!we->hasoff); + if (WordECompareQueryItem(lexeme, we->len, GETOPERAND(q), item, true) != 0) + break; + (*nitem)++; StopMiddle++; } } - return (*nitem > 0) ? StopHigh : NULL; + return (*nitem > 0) ? 
StopHigh : -1; } - /* * sort QueryOperands by (length, word) */ @@ -200,15 +205,13 @@ SortAndUniqItems(TSQuery q, int *size) static float calc_rank_and(const float *w, TSVector t, TSQuery q) { - WordEntryPosVector **pos; - WordEntryPosVector1 posnull; - WordEntryPosVector *POSNULL; + WordEntryPos **pos; + uint16 *npos; + WordEntryPos posnull[1] = {0}; int i, k, l, p; - WordEntry *entry, - *firstentry; WordEntryPos *post, *ct; int32 dimt, @@ -225,41 +228,55 @@ calc_rank_and(const float *w, TSVector t, TSQuery q) pfree(item); return calc_rank_or(w, t, q); } - pos = (WordEntryPosVector **) palloc0(sizeof(WordEntryPosVector *) * q->size); + pos = (WordEntryPos **) palloc0(sizeof(WordEntryPos *) * q->size); + npos = (uint16 *) palloc0(sizeof(uint16) * q->size); - /* A dummy WordEntryPos array to use when haspos is false */ - posnull.npos = 1; - posnull.pos[0] = 0; - WEP_SETPOS(posnull.pos[0], MAXENTRYPOS - 1); - POSNULL = (WordEntryPosVector *) &posnull; + /* posnull is a dummy WordEntryPos array to use when npos == 0 */ + WEP_SETPOS(posnull[0], MAXENTRYPOS - 1); for (i = 0; i < size; i++) { - firstentry = entry = find_wordentry(t, q, item[i], &nitem); - if (!entry) + int idx = find_wordentry(t, q, item[i], &nitem), + firstidx; + + if (idx == -1) continue; - while (entry - firstentry < nitem) + firstidx = idx; + + while (idx - firstidx < nitem) { - if (entry->haspos) - pos[i] = _POSVECPTR(t, entry); + WordEntry *entry; + + char *lexeme = tsvector_getlexeme(t, idx, &entry); + + Assert(!entry->hasoff); + if (entry->npos) + { + pos[i] = POSDATAPTR(lexeme, entry->len); + npos[i] = entry->npos; + } else - pos[i] = POSNULL; + { + pos[i] = posnull; + npos[i] = 1; + } + + post = pos[i]; + dimt = npos[i]; - dimt = pos[i]->npos; - post = pos[i]->pos; for (k = 0; k < i; k++) { if (!pos[k]) continue; - lenct = pos[k]->npos; - ct = pos[k]->pos; + lenct = npos[k]; + ct = pos[k]; for (l = 0; l < dimt; l++) { for (p = 0; p < lenct; p++) { dist = Abs((int) WEP_GETPOS(post[l]) - (int) 
WEP_GETPOS(ct[p])); - if (dist || (dist == 0 && (pos[i] == POSNULL || pos[k] == POSNULL))) + if (dist || (dist == 0 && (pos[i] == posnull || pos[k] == posnull))) { float curw; @@ -272,10 +289,11 @@ calc_rank_and(const float *w, TSVector t, TSQuery q) } } - entry++; + idx++; } } pfree(pos); + pfree(npos); pfree(item); return res; } @@ -283,9 +301,8 @@ calc_rank_and(const float *w, TSVector t, TSQuery q) static float calc_rank_or(const float *w, TSVector t, TSQuery q) { - WordEntry *entry, - *firstentry; - WordEntryPosVector1 posnull; + /* A dummy WordEntryPos array to use when lexeme hasn't positions */ + WordEntryPos posnull[1] = {0}; WordEntryPos *post; int32 dimt, j, @@ -295,33 +312,37 @@ calc_rank_or(const float *w, TSVector t, TSQuery q) QueryOperand **item; int size = q->size; - /* A dummy WordEntryPos array to use when haspos is false */ - posnull.npos = 1; - posnull.pos[0] = 0; - item = SortAndUniqItems(q, &size); for (i = 0; i < size; i++) { + int idx, + firstidx; float resj, wjm; int32 jm; - firstentry = entry = find_wordentry(t, q, item[i], &nitem); - if (!entry) + idx = find_wordentry(t, q, item[i], &nitem); + if (idx == -1) continue; - while (entry - firstentry < nitem) + firstidx = idx; + + while (idx - firstidx < nitem) { - if (entry->haspos) + WordEntry *entry; + char *lexeme = tsvector_getlexeme(t, idx, &entry); + + Assert(!entry->hasoff); + if (entry->npos) { - dimt = POSDATALEN(t, entry); - post = POSDATAPTR(t, entry); + dimt = entry->npos; + post = POSDATAPTR(lexeme, entry->len); } else { - dimt = posnull.npos; - post = posnull.pos; + dimt = 1; + post = posnull; } resj = 0.0; @@ -345,7 +366,7 @@ calc_rank_or(const float *w, TSVector t, TSQuery q) */ res = res + (wjm + resj - wjm / ((jm + 1) * (jm + 1))) / 1.64493406685; - entry++; + idx++; } } if (size > 0) @@ -361,7 +382,7 @@ calc_rank(const float *w, TSVector t, TSQuery q, int32 method) float res = 0.0; int len; - if (!t->size || !q->size) + if (!TS_COUNT(t) || !q->size) return 0.0; /* XXX: 
What about NOT? */ @@ -373,7 +394,7 @@ calc_rank(const float *w, TSVector t, TSQuery q, int32 method) if (res < 0) res = 1e-20f; - if ((method & RANK_NORM_LOGLENGTH) && t->size > 0) + if ((method & RANK_NORM_LOGLENGTH) && TS_COUNT(t) > 0) res /= log((double) (cnt_length(t) + 1)) / log(2.0); if (method & RANK_NORM_LENGTH) @@ -385,11 +406,11 @@ calc_rank(const float *w, TSVector t, TSQuery q, int32 method) /* RANK_NORM_EXTDIST not applicable */ - if ((method & RANK_NORM_UNIQ) && t->size > 0) - res /= (float) (t->size); + if ((method & RANK_NORM_UNIQ) && TS_COUNT(t) > 0) + res /= (float) (TS_COUNT(t)); - if ((method & RANK_NORM_LOGUNIQ) && t->size > 0) - res /= log((double) (t->size + 1)) / log(2.0); + if ((method & RANK_NORM_LOGUNIQ) && TS_COUNT(t) > 0) + res /= log((double) (TS_COUNT(t) + 1)) / log(2.0); if (method & RANK_NORM_RDIVRPLUS1) res /= (res + 1); @@ -504,13 +525,13 @@ typedef struct struct { /* compiled doc representation */ QueryItem **items; - int16 nitem; + int32 nitem; } query; struct { /* struct is used for preparing doc * representation */ QueryItem *item; - WordEntry *entry; + int32 idx; } map; } data; WordEntryPos pos; @@ -526,10 +547,10 @@ compareDocR(const void *va, const void *vb) { if (WEP_GETWEIGHT(a->pos) == WEP_GETWEIGHT(b->pos)) { - if (a->data.map.entry == b->data.map.entry) + if (a->data.map.idx == b->data.map.idx) return 0; - return (a->data.map.entry > b->data.map.entry) ? 1 : -1; + return (a->data.map.idx > b->data.map.idx) ? 1 : -1; } return (WEP_GETWEIGHT(a->pos) > WEP_GETWEIGHT(b->pos)) ? 
1 : -1; @@ -724,9 +745,6 @@ static DocRepresentation * get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen) { QueryItem *item = GETQUERY(qr->query); - WordEntry *entry, - *firstentry; - WordEntryPos *post; int32 dimt, /* number of 'post' items */ j, i, @@ -743,29 +761,38 @@ get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen) */ for (i = 0; i < qr->query->size; i++) { + int idx, + firstidx; QueryOperand *curoperand; + WordEntryPos *post; if (item[i].type != QI_VAL) continue; curoperand = &item[i].qoperand; - firstentry = entry = find_wordentry(txt, qr->query, curoperand, &nitem); - if (!entry) + idx = find_wordentry(txt, qr->query, curoperand, &nitem); + if (idx < 0) continue; + firstidx = idx; + /* iterations over entries in tsvector */ - while (entry - firstentry < nitem) + while (idx - firstidx < nitem) { - if (entry->haspos) + WordEntry *entry; + char *lex = tsvector_getlexeme(txt, idx, &entry); + + Assert(!entry->hasoff); + if (entry->npos) { - dimt = POSDATALEN(txt, entry); - post = POSDATAPTR(txt, entry); + dimt = entry->npos; + post = POSDATAPTR(lex, entry->len); } else { /* ignore words without positions */ - entry++; + idx++; continue; } @@ -782,13 +809,12 @@ get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen) curoperand->weight & (1 << WEP_GETWEIGHT(post[j]))) { doc[cur].pos = post[j]; - doc[cur].data.map.entry = entry; + doc[cur].data.map.idx = idx; doc[cur].data.map.item = (QueryItem *) curoperand; cur++; } } - - entry++; + idx++; } } @@ -814,7 +840,7 @@ get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen) while (rptr - doc < cur) { if (rptr->pos == (rptr - 1)->pos && - rptr->data.map.entry == (rptr - 1)->data.map.entry) + rptr->data.map.idx == (rptr - 1)->data.map.idx) { storage.data.query.items[storage.data.query.nitem] = rptr->data.map.item; storage.data.query.nitem++; @@ -917,7 +943,7 @@ calc_rank_cd(const float4 *arrdata, TSVector txt, TSQuery query, int method) NExtent++; } - if ((method & 
RANK_NORM_LOGLENGTH) && txt->size > 0) + if ((method & RANK_NORM_LOGLENGTH) && TS_COUNT(txt) > 0) Wdoc /= log((double) (cnt_length(txt) + 1)); if (method & RANK_NORM_LENGTH) @@ -930,11 +956,11 @@ calc_rank_cd(const float4 *arrdata, TSVector txt, TSQuery query, int method) if ((method & RANK_NORM_EXTDIST) && NExtent > 0 && SumDist > 0) Wdoc /= ((double) NExtent) / SumDist; - if ((method & RANK_NORM_UNIQ) && txt->size > 0) - Wdoc /= (double) (txt->size); + if ((method & RANK_NORM_UNIQ) && TS_COUNT(txt) > 0) + Wdoc /= (double) (TS_COUNT(txt)); - if ((method & RANK_NORM_LOGUNIQ) && txt->size > 0) - Wdoc /= log((double) (txt->size + 1)) / log(2.0); + if ((method & RANK_NORM_LOGUNIQ) && TS_COUNT(txt) > 0) + Wdoc /= log((double) (TS_COUNT(txt) + 1)) / log(2.0); if (method & RANK_NORM_RDIVRPLUS1) Wdoc /= (Wdoc + 1); diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c index 6f66c1f58c..de34df0c3d 100644 --- a/src/backend/utils/adt/tsvector.c +++ b/src/backend/utils/adt/tsvector.c @@ -23,8 +23,8 @@ typedef struct { WordEntry entry; /* must be first! 
*/ + size_t offset; /* offset of lexeme in some buffer */ WordEntryPos *pos; - int poslen; /* number of elements in pos */ } WordEntryIN; @@ -79,14 +79,30 @@ uniquePos(WordEntryPos *a, int l) /* Compare two WordEntryIN values for qsort */ static int -compareentry(const void *va, const void *vb, void *arg) +compareentry_in(const void *va, const void *vb, void *arg) { const WordEntryIN *a = (const WordEntryIN *) va; const WordEntryIN *b = (const WordEntryIN *) vb; char *BufferStr = (char *) arg; - return tsCompareString(&BufferStr[a->entry.pos], a->entry.len, - &BufferStr[b->entry.pos], b->entry.len, + return tsCompareString(&BufferStr[a->offset], a->entry.len, + &BufferStr[b->offset], b->entry.len, + false); +} + +/* Compare two WordEntry values for qsort */ +static int +compareentry(const void *va, const void *vb, void *arg) +{ + const WordEntry *a = (const WordEntry *) va; + const WordEntry *b = (const WordEntry *) vb; + TSVector tsv = (TSVector) arg; + + uint32 offset1 = tsvector_getoffset(tsv, a - ARRPTR(tsv), NULL), + offset2 = tsvector_getoffset(tsv, b - ARRPTR(tsv), NULL); + + return tsCompareString(STRPTR(tsv) + offset1, ENTRY_LEN(tsv, a), + STRPTR(tsv) + offset2, ENTRY_LEN(tsv, b), false); } @@ -97,14 +113,15 @@ compareentry(const void *va, const void *vb, void *arg) static int uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen) { - int buflen; + int buflen, + i = 0; WordEntryIN *ptr, *res; Assert(l >= 1); if (l > 1) - qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry, + qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry_in, (void *) buf); buflen = 0; @@ -112,67 +129,76 @@ uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen) ptr = a + 1; while (ptr - a < l) { + Assert(!ptr->entry.hasoff); + if (!(ptr->entry.len == res->entry.len && - strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos], - res->entry.len) == 0)) + strncmp(&buf[ptr->offset], &buf[res->offset], res->entry.len) == 0)) { /* done accumulating data into *res, 
count space needed */ + buflen = SHORTALIGN(buflen); + if (i++ % TS_OFFSET_STRIDE == 0) + { + buflen = INTALIGN(buflen); + buflen += sizeof(WordEntry); + } + buflen += res->entry.len; - if (res->entry.haspos) + if (res->entry.npos) { - res->poslen = uniquePos(res->pos, res->poslen); + res->entry.npos = uniquePos(res->pos, res->entry.npos); buflen = SHORTALIGN(buflen); - buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16); + buflen += res->entry.npos * sizeof(WordEntryPos); } res++; if (res != ptr) - memcpy(res, ptr, sizeof(WordEntryIN)); + *res = *ptr; } - else if (ptr->entry.haspos) + else if (ptr->entry.npos) { - if (res->entry.haspos) + if (res->entry.npos) { /* append ptr's positions to res's positions */ - int newlen = ptr->poslen + res->poslen; + int newlen = ptr->entry.npos + res->entry.npos; res->pos = (WordEntryPos *) repalloc(res->pos, newlen * sizeof(WordEntryPos)); - memcpy(&res->pos[res->poslen], ptr->pos, - ptr->poslen * sizeof(WordEntryPos)); - res->poslen = newlen; + memcpy(&res->pos[res->entry.npos], ptr->pos, + ptr->entry.npos * sizeof(WordEntryPos)); + res->entry.npos = newlen; pfree(ptr->pos); } else { /* just give ptr's positions to pos */ - res->entry.haspos = 1; + res->entry.npos = ptr->entry.npos; res->pos = ptr->pos; - res->poslen = ptr->poslen; } } ptr++; } /* count space needed for last item */ + if (i % TS_OFFSET_STRIDE == 0) + { + buflen = INTALIGN(buflen); + buflen += sizeof(WordEntry); + } + else + buflen = SHORTALIGN(buflen); + buflen += res->entry.len; - if (res->entry.haspos) + + if (res->entry.npos) { - res->poslen = uniquePos(res->pos, res->poslen); + res->entry.npos = uniquePos(res->pos, res->entry.npos); buflen = SHORTALIGN(buflen); - buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16); + buflen += res->entry.npos * sizeof(WordEntryPos); } *outbuflen = buflen; return res + 1 - a; } -static int -WordEntryCMP(WordEntry *a, WordEntry *b, char *buf) -{ - return compareentry(a, b, buf); -} - - Datum 
tsvectorin(PG_FUNCTION_ARGS) { @@ -181,7 +207,6 @@ tsvectorin(PG_FUNCTION_ARGS) WordEntryIN *arr; int totallen; int arrlen; /* allocated size of arr */ - WordEntry *inarr; int len = 0; TSVector in; int i; @@ -189,7 +214,6 @@ tsvectorin(PG_FUNCTION_ARGS) int toklen; WordEntryPos *pos; int poslen; - char *strbuf; int stroff; /* @@ -238,23 +262,13 @@ tsvectorin(PG_FUNCTION_ARGS) tmpbuf = (char *) repalloc((void *) tmpbuf, buflen); cur = tmpbuf + dist; } + arr[len].entry.hasoff = 0; arr[len].entry.len = toklen; - arr[len].entry.pos = cur - tmpbuf; + arr[len].offset = cur - tmpbuf; + arr[len].entry.npos = poslen; + arr[len].pos = (poslen != 0) ? pos : NULL; memcpy((void *) cur, (void *) token, toklen); cur += toklen; - - if (poslen != 0) - { - arr[len].entry.haspos = 1; - arr[len].pos = pos; - arr[len].poslen = poslen; - } - else - { - arr[len].entry.haspos = 0; - arr[len].pos = NULL; - arr[len].poslen = 0; - } len++; } @@ -273,36 +287,18 @@ tsvectorin(PG_FUNCTION_ARGS) totallen = CALCDATASIZE(len, buflen); in = (TSVector) palloc0(totallen); SET_VARSIZE(in, totallen); - in->size = len; - inarr = ARRPTR(in); - strbuf = STRPTR(in); + TS_SETCOUNT(in, len); stroff = 0; for (i = 0; i < len; i++) { - memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len); - arr[i].entry.pos = stroff; - stroff += arr[i].entry.len; - if (arr[i].entry.haspos) - { - if (arr[i].poslen > 0xFFFF) - elog(ERROR, "positions array too long"); - - /* Copy number of positions */ - stroff = SHORTALIGN(stroff); - *(uint16 *) (strbuf + stroff) = (uint16) arr[i].poslen; - stroff += sizeof(uint16); - - /* Copy positions */ - memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos)); - stroff += arr[i].poslen * sizeof(WordEntryPos); + tsvector_addlexeme(in, i, &stroff, &tmpbuf[arr[i].offset], + arr[i].entry.len, arr[i].pos, arr[i].entry.npos); + if (arr[i].entry.npos) pfree(arr[i].pos); - } - inarr[i] = arr[i].entry; } - Assert((strbuf + stroff - (char *) in) == totallen); - + 
Assert((STRPTR(in) + stroff - (char *) in) == totallen); PG_RETURN_TSVECTOR(in); } @@ -313,28 +309,37 @@ tsvectorout(PG_FUNCTION_ARGS) char *outbuf; int32 i, lenbuf = 0, - pp; + pp, + tscount = TS_COUNT(out); + uint32 pos; WordEntry *ptr = ARRPTR(out); char *curbegin, *curin, *curout; - lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ; - for (i = 0; i < out->size; i++) + lenbuf = tscount * 2 /* '' */ + tscount - 1 /* space */ + 2 /* \0 */ ; + for (i = 0; i < tscount; i++) { - lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ; - if (ptr[i].haspos) - lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i])); + int npos = ENTRY_NPOS(out, ptr + i); + + lenbuf += ENTRY_LEN(out, ptr + i) * 2 * pg_database_encoding_max_length() /* for escape */ ; + if (npos) + lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * npos; } curout = outbuf = (char *) palloc(lenbuf); - for (i = 0; i < out->size; i++) + + INITPOS(pos); + for (i = 0; i < tscount; i++) { - curbegin = curin = STRPTR(out) + ptr->pos; + int lex_len = ENTRY_LEN(out, ptr), + npos = ENTRY_NPOS(out, ptr); + + curbegin = curin = STRPTR(out) + pos; if (i != 0) *curout++ = ' '; *curout++ = '\''; - while (curin - curbegin < ptr->len) + while (curin - curbegin < lex_len) { int len = pg_mblen(curin); @@ -348,12 +353,12 @@ tsvectorout(PG_FUNCTION_ARGS) } *curout++ = '\''; - if ((pp = POSDATALEN(out, ptr)) != 0) + if ((pp = npos) != 0) { WordEntryPos *wptr; *curout++ = ':'; - wptr = POSDATAPTR(out, ptr); + wptr = POSDATAPTR(curbegin, lex_len); while (pp) { curout += sprintf(curout, "%d", WEP_GETPOS(*wptr)); @@ -379,7 +384,8 @@ tsvectorout(PG_FUNCTION_ARGS) wptr++; } } - ptr++; + + INCRPTR(out, ptr, pos); } *curout = '\0'; @@ -406,35 +412,38 @@ tsvectorsend(PG_FUNCTION_ARGS) StringInfoData buf; int i, j; + uint32 pos; WordEntry *weptr = ARRPTR(vec); pq_begintypsend(&buf); + pq_sendint(&buf, TS_COUNT(vec), sizeof(int32)); - pq_sendint(&buf, vec->size, 
sizeof(int32)); - for (i = 0; i < vec->size; i++) + INITPOS(pos); + for (i = 0; i < TS_COUNT(vec); i++) { - uint16 npos; + char *lexeme = STRPTR(vec) + pos; + int npos = ENTRY_NPOS(vec, weptr), + lex_len = ENTRY_LEN(vec, weptr); /* * the strings in the TSVector array are not null-terminated, so we * have to send the null-terminator separately */ - pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len); + pq_sendtext(&buf, lexeme, lex_len); pq_sendbyte(&buf, '\0'); - - npos = POSDATALEN(vec, weptr); pq_sendint(&buf, npos, sizeof(uint16)); if (npos > 0) { - WordEntryPos *wepptr = POSDATAPTR(vec, weptr); + WordEntryPos *wepptr = POSDATAPTR(lexeme, lex_len); for (j = 0; j < npos; j++) pq_sendint(&buf, wepptr[j], sizeof(WordEntryPos)); } - weptr++; + INCRPTR(vec, weptr, pos); } + PG_FREE_IF_COPY(vec, 0); PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); } @@ -443,14 +452,16 @@ tsvectorrecv(PG_FUNCTION_ARGS) { StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); TSVector vec; - int i; - int32 nentries; - int datalen; /* number of bytes used in the variable size + int i, + datalen; /* number of bytes used in the variable size * area after fixed size TSVector header and * WordEntries */ + int32 nentries; Size hdrlen; Size len; /* allocated size of vec */ bool needSort = false; + char *prev_lexeme = NULL; + int prev_lex_len; nentries = pq_getmsgint(buf, sizeof(int32)); if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry))) @@ -460,16 +471,17 @@ tsvectorrecv(PG_FUNCTION_ARGS) len = hdrlen * 2; /* times two to make room for lexemes */ vec = (TSVector) palloc0(len); - vec->size = nentries; + TS_SETCOUNT(vec, nentries); datalen = 0; for (i = 0; i < nentries; i++) { - const char *lexeme; + char *lexeme, + *lexeme_out; uint16 npos; - size_t lex_len; + int lex_len; - lexeme = pq_getmsgstring(buf); + lexeme = (char *) pq_getmsgstring(buf); npos = (uint16) pq_getmsgint(buf, sizeof(uint16)); /* sanity checks */ @@ -489,62 +501,42 @@ tsvectorrecv(PG_FUNCTION_ARGS) * * But make sure 
the buffer is large enough first. */ - while (hdrlen + SHORTALIGN(datalen + lex_len) + - (npos + 1) * sizeof(WordEntryPos) >= len) + while (hdrlen + SHORTALIGN(datalen + lex_len) + sizeof(WordEntry) + + npos * sizeof(WordEntryPos) >= len) { len *= 2; vec = (TSVector) repalloc(vec, len); } - vec->entries[i].haspos = (npos > 0) ? 1 : 0; - vec->entries[i].len = lex_len; - vec->entries[i].pos = datalen; - - memcpy(STRPTR(vec) + datalen, lexeme, lex_len); - - datalen += lex_len; - - if (i > 0 && WordEntryCMP(&vec->entries[i], - &vec->entries[i - 1], - STRPTR(vec)) <= 0) + if (prev_lexeme && tsCompareString(lexeme, lex_len, + prev_lexeme, prev_lex_len, false) <= 0) needSort = true; - /* Receive positions */ + lexeme_out = tsvector_addlexeme(vec, i, &datalen, lexeme, + lex_len, NULL, npos); if (npos > 0) { - uint16 j; WordEntryPos *wepptr; + int j; - /* - * Pad to 2-byte alignment if necessary. Though we used palloc0 - * for the initial allocation, subsequent repalloc'd memory areas - * are not initialized to zero. 
- */ - if (datalen != SHORTALIGN(datalen)) - { - *(STRPTR(vec) + datalen) = '\0'; - datalen = SHORTALIGN(datalen); - } - - memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16)); - - wepptr = POSDATAPTR(vec, &vec->entries[i]); + wepptr = POSDATAPTR(lexeme_out, lex_len); for (j = 0; j < npos; j++) { wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos)); if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1])) elog(ERROR, "position information is misordered"); } - - datalen += (npos + 1) * sizeof(WordEntry); } + + prev_lexeme = lexeme; + prev_lex_len = lex_len; } SET_VARSIZE(vec, hdrlen + datalen); if (needSort) - qsort_arg((void *) ARRPTR(vec), vec->size, sizeof(WordEntry), - compareentry, (void *) STRPTR(vec)); + qsort_arg((void *) ARRPTR(vec), TS_COUNT(vec), sizeof(WordEntry), + compareentry, (void *) vec); PG_RETURN_TSVECTOR(vec); } diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c index 822520299e..02e80c4a74 100644 --- a/src/backend/utils/adt/tsvector_op.c +++ b/src/backend/utils/adt/tsvector_op.c @@ -33,9 +33,9 @@ typedef struct { - WordEntry *arrb; - WordEntry *arre; - char *values; + TSVector vec; + int bidx; + int eidx; char *operand; } CHKVAL; @@ -71,7 +71,7 @@ static Datum tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column); static int tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len); /* - * Order: haspos, len, word, for all positions (pos, weight) + * Order: npos, len, word, for all positions (pos, weight) */ static int silly_cmp_tsvector(const TSVector a, const TSVector b) @@ -80,9 +80,9 @@ silly_cmp_tsvector(const TSVector a, const TSVector b) return -1; else if (VARSIZE(a) > VARSIZE(b)) return 1; - else if (a->size < b->size) + else if (TS_COUNT(a) < TS_COUNT(b)) return -1; - else if (a->size > b->size) + else if (TS_COUNT(a) > TS_COUNT(b)) return 1; else { @@ -90,28 +90,40 @@ silly_cmp_tsvector(const TSVector a, const TSVector b) WordEntry *bptr = ARRPTR(b); int i = 
0; int res; + uint32 pos1, + pos2; + INITPOS(pos1); + INITPOS(pos2); - for (i = 0; i < a->size; i++) + for (i = 0; i < TS_COUNT(a); i++) { - if (aptr->haspos != bptr->haspos) - { - return (aptr->haspos > bptr->haspos) ? -1 : 1; - } - else if ((res = tsCompareString(STRPTR(a) + aptr->pos, aptr->len, STRPTR(b) + bptr->pos, bptr->len, false)) != 0) + char *lex1 = STRPTR(a) + pos1, + *lex2 = STRPTR(b) + pos2; + int npos1 = ENTRY_NPOS(a, aptr), + npos2 = ENTRY_NPOS(b, bptr); + int len1 = ENTRY_LEN(a, aptr), + len2 = ENTRY_LEN(b, bptr); + + if ((npos1 == 0 || npos2 == 0) && npos1 != npos2) + return npos1 > npos2 ? -1 : 1; + else if ((res = tsCompareString(lex1, len1, lex2, len2, false)) != 0) { return res; } - else if (aptr->haspos) + else if (npos1 > 0) { - WordEntryPos *ap = POSDATAPTR(a, aptr); - WordEntryPos *bp = POSDATAPTR(b, bptr); + WordEntryPos *ap, + *bp; int j; - if (POSDATALEN(a, aptr) != POSDATALEN(b, bptr)) - return (POSDATALEN(a, aptr) > POSDATALEN(b, bptr)) ? -1 : 1; + ap = POSDATAPTR(lex1, len1); + bp = POSDATAPTR(lex2, len2); + + if (npos1 != npos2) + return (npos1 > npos2) ? 
-1 : 1; - for (j = 0; j < POSDATALEN(a, aptr); j++) + for (j = 0; j < npos1; j++) { if (WEP_GETPOS(*ap) != WEP_GETPOS(*bp)) { @@ -125,8 +137,8 @@ silly_cmp_tsvector(const TSVector a, const TSVector b) } } - aptr++; - bptr++; + INCRPTR(a, aptr, pos1); + INCRPTR(b, bptr, pos2); } } @@ -161,27 +173,29 @@ tsvector_strip(PG_FUNCTION_ARGS) TSVector in = PG_GETARG_TSVECTOR(0); TSVector out; int i, + count, + posout = 0, + pos, len = 0; - WordEntry *arrin = ARRPTR(in), - *arrout; - char *cur; + WordEntry *entryin = ARRPTR(in); - for (i = 0; i < in->size; i++) - len += arrin[i].len; + count = TS_COUNT(in); + for (i = 0; i < count; i++) + INCRSIZE(len, i, ENTRY_LEN(in, ARRPTR(in) + i), 0); - len = CALCDATASIZE(in->size, len); + len = CALCDATASIZE(count, len); out = (TSVector) palloc0(len); SET_VARSIZE(out, len); - out->size = in->size; - arrout = ARRPTR(out); - cur = STRPTR(out); - for (i = 0; i < in->size; i++) + TS_SETCOUNT(out, count); + + INITPOS(pos); + for (i = 0; i < count; i++) { - memcpy(cur, STRPTR(in) + arrin[i].pos, arrin[i].len); - arrout[i].haspos = 0; - arrout[i].len = arrin[i].len; - arrout[i].pos = cur - STRPTR(out); - cur += arrout[i].len; + tsvector_addlexeme(out, i, &posout, + STRPTR(in) + pos, ENTRY_LEN(in, entryin), + NULL, 0); + + INCRPTR(in, entryin, pos); } PG_FREE_IF_COPY(in, 0); @@ -192,7 +206,7 @@ Datum tsvector_length(PG_FUNCTION_ARGS) { TSVector in = PG_GETARG_TSVECTOR(0); - int32 ret = in->size; + int32 ret = TS_COUNT(in); PG_FREE_IF_COPY(in, 0); PG_RETURN_INT32(ret); @@ -204,11 +218,10 @@ tsvector_setweight(PG_FUNCTION_ARGS) TSVector in = PG_GETARG_TSVECTOR(0); char cw = PG_GETARG_CHAR(1); TSVector out; - int i, - j; - WordEntry *entry; - WordEntryPos *p; + int i; + WordEntry *weptr; int w = 0; + uint32 pos; switch (cw) { @@ -235,20 +248,22 @@ tsvector_setweight(PG_FUNCTION_ARGS) out = (TSVector) palloc(VARSIZE(in)); memcpy(out, in, VARSIZE(in)); - entry = ARRPTR(out); - i = out->size; - while (i--) + weptr = ARRPTR(out); + + INITPOS(pos); + 
for (i = 0; i < TS_COUNT(out); i++) { - if ((j = POSDATALEN(out, entry)) != 0) + int j, + npos = ENTRY_NPOS(out, weptr); + + if (npos) { - p = POSDATAPTR(out, entry); - while (j--) - { - WEP_SETWEIGHT(*p, w); - p++; - } + WordEntryPos *p = POSDATAPTR(STRPTR(out) + pos, ENTRY_LEN(out, weptr)); + + for (j = 0; j < npos; j++) + WEP_SETWEIGHT(p[j], w); } - entry++; + INCRPTR(out, weptr, pos); } PG_FREE_IF_COPY(in, 0); @@ -269,10 +284,8 @@ tsvector_setweight_by_filter(PG_FUNCTION_ARGS) TSVector tsout; int i, - j, nlexemes, weight; - WordEntry *entry; Datum *dlexemes; bool *nulls; @@ -301,8 +314,6 @@ tsvector_setweight_by_filter(PG_FUNCTION_ARGS) tsout = (TSVector) palloc(VARSIZE(tsin)); memcpy(tsout, tsin, VARSIZE(tsin)); - entry = ARRPTR(tsout); - deconstruct_array(lexemes, TEXTOID, -1, false, 'i', &dlexemes, &nulls, &nlexemes); @@ -315,7 +326,8 @@ tsvector_setweight_by_filter(PG_FUNCTION_ARGS) { char *lex; int lex_len, - lex_pos; + lex_idx, + npos; if (nulls[i]) ereport(ERROR, @@ -324,17 +336,19 @@ tsvector_setweight_by_filter(PG_FUNCTION_ARGS) lex = VARDATA(dlexemes[i]); lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ; - lex_pos = tsvector_bsearch(tsout, lex, lex_len); + lex_idx = tsvector_bsearch(tsin, lex, lex_len); + npos = ENTRY_NPOS(tsin, ARRPTR(tsout) + lex_idx); - if (lex_pos >= 0 && (j = POSDATALEN(tsout, entry + lex_pos)) != 0) + if (lex_idx >= 0 && npos > 0) { - WordEntryPos *p = POSDATAPTR(tsout, entry + lex_pos); + int j; + WordEntry *we; + char *lexeme = tsvector_getlexeme(tsout, lex_idx, &we); - while (j--) - { - WEP_SETWEIGHT(*p, weight); - p++; - } + WordEntryPos *p = POSDATAPTR(lexeme, we->len); + + for (j = 0; j < npos; j++) + WEP_SETWEIGHT(p[j], weight); } } @@ -354,34 +368,27 @@ tsvector_setweight_by_filter(PG_FUNCTION_ARGS) * Return the number added (might be less than expected due to overflow) */ static int32 -add_pos(TSVector src, WordEntry *srcptr, - TSVector dest, WordEntry *destptr, +add_pos(char *src, WordEntry *srcptr, + WordEntryPos *dest, int 
from, int32 maxpos) { - uint16 *clen = &_POSVECPTR(dest, destptr)->npos; + uint16 clen = from; int i; - uint16 slen = POSDATALEN(src, srcptr), - startlen; - WordEntryPos *spos = POSDATAPTR(src, srcptr), - *dpos = POSDATAPTR(dest, destptr); - - if (!destptr->haspos) - *clen = 0; + uint16 slen = srcptr->npos; + WordEntryPos *spos = POSDATAPTR(src, srcptr->len); - startlen = *clen; + Assert(!srcptr->hasoff); for (i = 0; - i < slen && *clen < MAXNUMPOS && - (*clen == 0 || WEP_GETPOS(dpos[*clen - 1]) != MAXENTRYPOS - 1); + i < slen && clen < MAXNUMPOS && + (clen == 0 || WEP_GETPOS(dest[clen - 1]) != MAXENTRYPOS - 1); i++) { - WEP_SETWEIGHT(dpos[*clen], WEP_GETWEIGHT(spos[i])); - WEP_SETPOS(dpos[*clen], LIMITPOS(WEP_GETPOS(spos[i]) + maxpos)); - (*clen)++; + WEP_SETWEIGHT(dest[clen], WEP_GETWEIGHT(spos[i])); + WEP_SETPOS(dest[clen], LIMITPOS(WEP_GETPOS(spos[i]) + maxpos)); + clen++; } - if (*clen != startlen) - destptr->haspos = 1; - return *clen - startlen; + return clen - from; } /* @@ -392,20 +399,20 @@ add_pos(TSVector src, WordEntry *srcptr, static int tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len) { - WordEntry *arrin = ARRPTR(tsv); int StopLow = 0, - StopHigh = tsv->size, + StopHigh = TS_COUNT(tsv), StopMiddle, cmp; while (StopLow < StopHigh) { - StopMiddle = (StopLow + StopHigh) / 2; + WordEntry *entry = NULL; + char *str; + StopMiddle = (StopLow + StopHigh) / 2; + str = tsvector_getlexeme(tsv, StopMiddle, &entry); cmp = tsCompareString(lexeme, lexeme_len, - STRPTR(tsv) + arrin[StopMiddle].pos, - arrin[StopMiddle].len, - false); + str, entry->len, false); if (cmp < 0) StopHigh = StopMiddle; @@ -460,14 +467,12 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete, int indices_count) { TSVector tsout; - WordEntry *arrin = ARRPTR(tsv), - *arrout; - char *data = STRPTR(tsv), - *dataout; - int i, /* index in arrin */ - j, /* index in arrout */ + WordEntry *ptr = ARRPTR(tsv); + int i, /* index in input tsvector */ + j, /* index in 
output tsvector */ k, /* index in indices_to_delete */ - curoff; /* index in dataout area */ + curoff = 0, /* index in data area of output */ + pos; /* * Sort the filter array to simplify membership checks below. Also, get @@ -495,16 +500,18 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete, tsout = (TSVector) palloc0(VARSIZE(tsv)); /* This count must be correct because STRPTR(tsout) relies on it. */ - tsout->size = tsv->size - indices_count; + TS_SETCOUNT(tsout, TS_COUNT(tsv) - indices_count); /* * Copy tsv to tsout, skipping lexemes listed in indices_to_delete. */ - arrout = ARRPTR(tsout); - dataout = STRPTR(tsout); - curoff = 0; - for (i = j = k = 0; i < tsv->size; i++) + + INITPOS(pos); + for (i = j = k = 0; i < TS_COUNT(tsv); i++) { + char *lex = STRPTR(tsv) + pos; + int lex_len = ENTRY_LEN(tsv, ptr); + /* * If current i is present in indices_to_delete, skip this lexeme. * Since indices_to_delete is already sorted, we only need to check @@ -513,28 +520,14 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete, if (k < indices_count && i == indices_to_delete[k]) { k++; - continue; + goto next; } - /* Copy lexeme and its positions and weights */ - memcpy(dataout + curoff, data + arrin[i].pos, arrin[i].len); - arrout[j].haspos = arrin[i].haspos; - arrout[j].len = arrin[i].len; - arrout[j].pos = curoff; - curoff += arrin[i].len; - if (arrin[i].haspos) - { - int len = POSDATALEN(tsv, arrin + i) * sizeof(WordEntryPos) - + sizeof(uint16); - - curoff = SHORTALIGN(curoff); - memcpy(dataout + curoff, - STRPTR(tsv) + SHORTALIGN(arrin[i].pos + arrin[i].len), - len); - curoff += len; - } + tsvector_addlexeme(tsout, j++, &curoff, lex, lex_len, + POSDATAPTR(lex, lex_len), ENTRY_NPOS(tsv, ptr)); - j++; +next: + INCRPTR(tsv, ptr, pos); } /* @@ -543,8 +536,7 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete, * estimation of tsout's size is wrong. 
*/ Assert(k == indices_count); - - SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, curoff)); + SET_VARSIZE(tsout, CALCDATASIZE(TS_COUNT(tsout), curoff)); return tsout; } @@ -637,6 +629,7 @@ tsvector_unnest(PG_FUNCTION_ARGS) { FuncCallContext *funcctx; TSVector tsin; + uint32 pos; if (SRF_IS_FIRSTCALL()) { @@ -655,31 +648,33 @@ tsvector_unnest(PG_FUNCTION_ARGS) TEXTARRAYOID, -1, 0); funcctx->tuple_desc = BlessTupleDesc(tupdesc); - funcctx->user_fctx = PG_GETARG_TSVECTOR_COPY(0); + INITPOS(pos); + funcctx->user_fctx = list_make2(PG_GETARG_TSVECTOR(0), makeInteger(pos)); MemoryContextSwitchTo(oldcontext); } funcctx = SRF_PERCALL_SETUP(); - tsin = (TSVector) funcctx->user_fctx; + tsin = (TSVector) linitial(funcctx->user_fctx); + pos = intVal(lsecond(funcctx->user_fctx)); - if (funcctx->call_cntr < tsin->size) + if (funcctx->call_cntr < TS_COUNT(tsin)) { - WordEntry *arrin = ARRPTR(tsin); + WordEntry *entry = ARRPTR(tsin) + funcctx->call_cntr; char *data = STRPTR(tsin); HeapTuple tuple; int j, - i = funcctx->call_cntr; + npos = ENTRY_NPOS(tsin, entry), + lex_len = ENTRY_LEN(tsin, entry); bool nulls[] = {false, false, false}; Datum values[3]; values[0] = PointerGetDatum( - cstring_to_text_with_len(data + arrin[i].pos, arrin[i].len) - ); + cstring_to_text_with_len(data + pos, lex_len)); - if (arrin[i].haspos) + if (npos) { - WordEntryPosVector *posv; + WordEntryPos *apos = POSDATAPTR(data + pos, lex_len); Datum *positions; Datum *weights; char weight; @@ -689,28 +684,28 @@ tsvector_unnest(PG_FUNCTION_ARGS) * uint16 (2 bits for weight, 14 for position). Here we extract * that in two separate arrays. 
*/ - posv = _POSVECPTR(tsin, arrin + i); - positions = palloc(posv->npos * sizeof(Datum)); - weights = palloc(posv->npos * sizeof(Datum)); - for (j = 0; j < posv->npos; j++) + positions = palloc(npos * sizeof(Datum)); + weights = palloc(npos * sizeof(Datum)); + for (j = 0; j < npos; j++) { - positions[j] = Int16GetDatum(WEP_GETPOS(posv->pos[j])); - weight = 'D' - WEP_GETWEIGHT(posv->pos[j]); + positions[j] = Int16GetDatum(WEP_GETPOS(apos[j])); + weight = 'D' - WEP_GETWEIGHT(apos[j]); weights[j] = PointerGetDatum( cstring_to_text_with_len(&weight, 1) ); } values[1] = PointerGetDatum( - construct_array(positions, posv->npos, INT2OID, 2, true, 's')); + construct_array(positions, npos, INT2OID, 2, true, 's')); values[2] = PointerGetDatum( - construct_array(weights, posv->npos, TEXTOID, -1, false, 'i')); + construct_array(weights, npos, TEXTOID, -1, false, 'i')); } else { nulls[1] = nulls[2] = true; } + INCRPTR(tsin, entry, intVal(lsecond(funcctx->user_fctx))); tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); } @@ -728,27 +723,147 @@ Datum tsvector_to_array(PG_FUNCTION_ARGS) { TSVector tsin = PG_GETARG_TSVECTOR(0); - WordEntry *arrin = ARRPTR(tsin); + WordEntry *entry = ARRPTR(tsin); Datum *elements; int i; ArrayType *array; + long pos; - elements = palloc(tsin->size * sizeof(Datum)); + elements = palloc(TS_COUNT(tsin) * sizeof(Datum)); - for (i = 0; i < tsin->size; i++) + INITPOS(pos); + for (i = 0; i < TS_COUNT(tsin); i++) { elements[i] = PointerGetDatum( - cstring_to_text_with_len(STRPTR(tsin) + arrin[i].pos, arrin[i].len) - ); + cstring_to_text_with_len(STRPTR(tsin) + pos, ENTRY_LEN(tsin, entry))); + INCRPTR(tsin, entry, pos); } - array = construct_array(elements, tsin->size, TEXTOID, -1, false, 'i'); + array = construct_array(elements, TS_COUNT(tsin), TEXTOID, -1, false, 'i'); pfree(elements); PG_FREE_IF_COPY(tsin, 0); PG_RETURN_POINTER(array); } +/* + * Returns offset by given index in TSVector, + * 
this function used when we need random access + */ +int +tsvector_getoffset(TSVector vec, int idx, WordEntry **we) +{ + int offset = 0; + WordEntry *entry; + + entry = ARRPTR(vec) + idx; + if (we) + *we = entry; + + while (!entry->hasoff) + { + entry--; + if (!entry->hasoff) + offset += SHORTALIGN(entry->len) + entry->npos * sizeof(WordEntryPos); + } + + Assert(entry >= ARRPTR(vec)); + + if (idx % TS_OFFSET_STRIDE) + { + /* if idx is by offset */ + WordEntry *offset_entry = (WordEntry *) (STRPTR(vec) + entry->offset); + + offset += entry->offset + sizeof(WordEntry); + offset += SHORTALIGN(offset_entry->len) + offset_entry->npos * sizeof(WordEntryPos); + } + else + { + Assert(entry == ARRPTR(vec) + idx); + + if (we) + *we = (WordEntry *) (STRPTR(vec) + entry->offset); + offset = entry->offset + sizeof(WordEntry); + } + + return offset; +} + +/* + * Add lexeme and its positions to tsvector and move dataoff (offset where + * data should be added) to new position. + * Returns pointer to lexeme start + */ +char * +tsvector_addlexeme(TSVector tsv, int idx, int *dataoff, + char *lexeme, int lexeme_len, WordEntryPos *pos, int npos) +{ + int stroff; + WordEntry *entry; + char *result; + + /* when idx is 0, dataoff should be 0 too, and otherwise */ + Assert(!((idx == 0) ^ (*dataoff == 0))); + + stroff = *dataoff; + entry = ARRPTR(tsv) + idx; + + if (idx % TS_OFFSET_STRIDE == 0) + { + /* WordEntry with offset */ + WordEntry offentry; + + stroff = INTALIGN(stroff); + entry->hasoff = 1; + entry->offset = stroff; + + /* fill WordEntry for offset */ + offentry.hasoff = 0; + offentry.len = lexeme_len; + offentry.npos = npos; + memcpy(STRPTR(tsv) + stroff, &offentry, sizeof(WordEntry)); + stroff += sizeof(WordEntry); + } + else + { + stroff = SHORTALIGN(stroff); + entry->hasoff = 0; + entry->len = lexeme_len; + entry->npos = npos; + } + + memcpy(STRPTR(tsv) + stroff, lexeme, lexeme_len); + result = STRPTR(tsv) + stroff; + stroff += lexeme_len; + + if (npos) + { + if (npos > 0xFFFF) 
+ elog(ERROR, "positions array too long"); + + /* + * Pad to 2-byte alignment if necessary. We don't know how memory was + * allocated, so in case of aligning we need to make sure that unused + * is zero. + */ + if (stroff != SHORTALIGN(stroff)) + { + *(STRPTR(tsv) + stroff) = '\0'; + stroff = SHORTALIGN(stroff); + } + + /* Copy positions */ + if (pos) + memcpy(STRPTR(tsv) + stroff, pos, npos * sizeof(WordEntryPos)); + + stroff += npos * sizeof(WordEntryPos); + } + + *dataoff = stroff; + + return result; +} + /* * Build tsvector from array of lexemes. */ @@ -758,14 +873,13 @@ array_to_tsvector(PG_FUNCTION_ARGS) ArrayType *v = PG_GETARG_ARRAYTYPE_P(0); TSVector tsout; Datum *dlexemes; - WordEntry *arrout; bool *nulls; int nitems, i, j, tslen, + cur = 0, datalen = 0; - char *cur; deconstruct_array(v, TEXTOID, -1, false, 'i', &dlexemes, &nulls, &nitems); @@ -793,26 +907,24 @@ array_to_tsvector(PG_FUNCTION_ARGS) /* Calculate space needed for surviving lexemes. */ for (i = 0; i < nitems; i++) - datalen += VARSIZE(dlexemes[i]) - VARHDRSZ; + { + int lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ; + + INCRSIZE(datalen, i, lex_len, 0); + } tslen = CALCDATASIZE(nitems, datalen); /* Allocate and fill tsvector. 
*/ tsout = (TSVector) palloc0(tslen); SET_VARSIZE(tsout, tslen); - tsout->size = nitems; + TS_SETCOUNT(tsout, nitems); - arrout = ARRPTR(tsout); - cur = STRPTR(tsout); for (i = 0; i < nitems; i++) { char *lex = VARDATA(dlexemes[i]); int lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ; - memcpy(cur, lex, lex_len); - arrout[i].haspos = 0; - arrout[i].len = lex_len; - arrout[i].pos = cur - STRPTR(tsout); - cur += lex_len; + tsvector_addlexeme(tsout, i, &cur, lex, lex_len, NULL, 0); } PG_FREE_IF_COPY(v, 0); @@ -828,17 +940,16 @@ tsvector_filter(PG_FUNCTION_ARGS) TSVector tsin = PG_GETARG_TSVECTOR(0), tsout; ArrayType *weights = PG_GETARG_ARRAYTYPE_P(1); - WordEntry *arrin = ARRPTR(tsin), - *arrout; - char *datain = STRPTR(tsin), - *dataout; + char *dataout; Datum *dweights; bool *nulls; int nweights; int i, - j; - int cur_pos = 0; + j, + dataoff = 0, + pos; char mask = 0; + WordEntry *ptr = ARRPTR(tsin); deconstruct_array(weights, CHAROID, 1, true, 'c', &dweights, &nulls, &nweights); @@ -879,109 +990,112 @@ tsvector_filter(PG_FUNCTION_ARGS) } tsout = (TSVector) palloc0(VARSIZE(tsin)); - tsout->size = tsin->size; - arrout = ARRPTR(tsout); + TS_SETCOUNT(tsout, TS_COUNT(tsin)); dataout = STRPTR(tsout); - for (i = j = 0; i < tsin->size; i++) + INITPOS(pos); + for (i = j = 0; i < TS_COUNT(tsin); i++) { - WordEntryPosVector *posvin, - *posvout; - int npos = 0; - int k; - - if (!arrin[i].haspos) - continue; - - posvin = _POSVECPTR(tsin, arrin + i); - posvout = (WordEntryPosVector *) - (dataout + SHORTALIGN(cur_pos + arrin[i].len)); - - for (k = 0; k < posvin->npos; k++) + WordEntryPos *posin, + *posout; + int k, + npos = 0, + lex_len = ENTRY_LEN(tsin, ptr); + char *lex = STRPTR(tsin) + pos, + *lexout; + + posin = POSDATAPTR(lex, lex_len); + for (k = 0; k < ENTRY_NPOS(tsin, ptr); k++) { - if (mask & (1 << WEP_GETWEIGHT(posvin->pos[k]))) - posvout->pos[npos++] = posvin->pos[k]; + if (mask & (1 << WEP_GETWEIGHT(posin[k]))) + npos++; } - /* if no satisfactory positions found, skip 
lexeme */ if (!npos) - continue; + goto next; - arrout[j].haspos = true; - arrout[j].len = arrin[i].len; - arrout[j].pos = cur_pos; + lexout = tsvector_addlexeme(tsout, j++, &dataoff, lex, lex_len, + NULL, npos); + posout = POSDATAPTR(lexout, lex_len); + npos = 0; + for (k = 0; k < ENTRY_NPOS(tsin, ptr); k++) + { + if (mask & (1 << WEP_GETWEIGHT(posin[k]))) + posout[npos++] = posin[k]; + } - memcpy(dataout + cur_pos, datain + arrin[i].pos, arrin[i].len); - posvout->npos = npos; - cur_pos += SHORTALIGN(arrin[i].len); - cur_pos += POSDATALEN(tsout, arrout + j) * sizeof(WordEntryPos) + - sizeof(uint16); - j++; +next: + INCRPTR(tsin, ptr, pos); } - tsout->size = j; + TS_SETCOUNT(tsout, j); if (dataout != STRPTR(tsout)) - memmove(STRPTR(tsout), dataout, cur_pos); + memmove(STRPTR(tsout), dataout, dataoff); - SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, cur_pos)); + SET_VARSIZE(tsout, CALCDATASIZE(TS_COUNT(tsout), dataoff)); PG_FREE_IF_COPY(tsin, 0); PG_RETURN_POINTER(tsout); } +/* Get max position in in1; we'll need this to offset in2's positions */ +static int +get_maxpos(TSVector tsv) +{ + int i, + j, + maxpos = 0; + WordEntry *ptr = ARRPTR(tsv); + uint32 pos; + WordEntryPos *apos; + + INITPOS(pos); + for (i = 0; i < TS_COUNT(tsv); i++) + { + apos = POSDATAPTR(STRPTR(tsv) + pos, ENTRY_LEN(tsv, ptr)); + for (j = 0; j < ENTRY_NPOS(tsv, ptr); j++) + { + if (WEP_GETPOS(apos[j]) > maxpos) + maxpos = WEP_GETPOS(apos[j]); + } + + INCRPTR(tsv, ptr, pos); + } + + return maxpos; +} + Datum tsvector_concat(PG_FUNCTION_ARGS) { - TSVector in1 = PG_GETARG_TSVECTOR(0); - TSVector in2 = PG_GETARG_TSVECTOR(1); - TSVector out; - WordEntry *ptr; - WordEntry *ptr1, + TSVector in1 = PG_GETARG_TSVECTOR(0), + in2 = PG_GETARG_TSVECTOR(1), + out; + WordEntry *ptr, + *ptr1, *ptr2; - WordEntryPos *p; int maxpos = 0, i, - j, i1, i2, - dataoff, output_bytes, - output_size; - char *data, - *data1, - *data2; - - /* Get max position in in1; we'll need this to offset in2's positions */ - ptr = 
ARRPTR(in1); - i = in1->size; - while (i--) - { - if ((j = POSDATALEN(in1, ptr)) != 0) - { - p = POSDATAPTR(in1, ptr); - while (j--) - { - if (WEP_GETPOS(*p) > maxpos) - maxpos = WEP_GETPOS(*p); - p++; - } - } - ptr++; - } + pos1, + pos2, + dataoff; + char *data; ptr1 = ARRPTR(in1); ptr2 = ARRPTR(in2); - data1 = STRPTR(in1); - data2 = STRPTR(in2); - i1 = in1->size; - i2 = in2->size; + i1 = TS_COUNT(in1); + i2 = TS_COUNT(in2); /* * Conservative estimate of space needed. We might need all the data in - * both inputs, and conceivably add a pad byte before position data for - * each item where there was none before. + * both inputs, and conceivably add a pad bytes before lexeme and position + * data, and pad bytes before WordEntry for offset entry. */ - output_bytes = VARSIZE(in1) + VARSIZE(in2) + i1 + i2; + output_bytes = VARSIZE(in1) + VARSIZE(in2) + i1 * 2 + i2 * 2; + output_bytes += 4 * (i1 + i2) / TS_OFFSET_STRIDE; out = (TSVector) palloc0(output_bytes); SET_VARSIZE(out, output_bytes); @@ -990,91 +1104,110 @@ tsvector_concat(PG_FUNCTION_ARGS) * We must make out->size valid so that STRPTR(out) is sensible. We'll * collapse out any unused space at the end. 
*/ - out->size = in1->size + in2->size; + TS_SETCOUNT(out, i1 + i2); - ptr = ARRPTR(out); + ptr = NULL; data = STRPTR(out); + i = 0; dataoff = 0; + + INITPOS(pos1); + INITPOS(pos2); + + /* + * we will need max position from first tsvector to add it positions of + * second tsvector + */ + maxpos = get_maxpos(in1); + while (i1 && i2) { - int cmp = compareEntry(data1, ptr1, data2, ptr2); + char *lex = STRPTR(in1) + pos1, + *lex2 = STRPTR(in2) + pos2; + + int lex_len = ENTRY_LEN(in1, ptr1), + lex2_len = ENTRY_LEN(in2, ptr2); + + int cmp = tsCompareString(lex, lex_len, lex2, lex2_len, false); if (cmp < 0) { /* in1 first */ - ptr->haspos = ptr1->haspos; - ptr->len = ptr1->len; - memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len); - ptr->pos = dataoff; - dataoff += ptr1->len; - if (ptr->haspos) - { - dataoff = SHORTALIGN(dataoff); - memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); - dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); - } + tsvector_addlexeme(out, i, &dataoff, + lex, lex_len, + POSDATAPTR(lex, lex_len), ENTRY_NPOS(in1, ptr1)); - ptr++; - ptr1++; + INCRPTR(in1, ptr1, pos1); i1--; + i++; } else if (cmp > 0) { /* in2 first */ - ptr->haspos = ptr2->haspos; - ptr->len = ptr2->len; - memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len); - ptr->pos = dataoff; - dataoff += ptr2->len; - if (ptr->haspos) + char *new_lex; + WordEntry *we = UNWRAP_ENTRY(in2, ptr2); + + new_lex = tsvector_addlexeme(out, i, &dataoff, lex2, lex2_len, NULL, 0); + if (we->npos > 0) { - int addlen = add_pos(in2, ptr2, out, ptr, maxpos); + int addlen; + WordEntryPos *apos = POSDATAPTR(new_lex, lex2_len); - if (addlen == 0) - ptr->haspos = 0; - else + addlen = add_pos(lex2, we, apos, 0, maxpos); + if (addlen > 0) { + ptr = UNWRAP_ENTRY(out, ARRPTR(out) + i); + ptr->npos = addlen; dataoff = SHORTALIGN(dataoff); - dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16); + dataoff += ptr->npos * 
sizeof(WordEntryPos); } } - ptr++; - ptr2++; + INCRPTR(in2, ptr2, pos2); + i++; i2--; } else { - ptr->haspos = ptr1->haspos | ptr2->haspos; - ptr->len = ptr1->len; - memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len); - ptr->pos = dataoff; - dataoff += ptr1->len; - if (ptr->haspos) + char *new_lex; + int npos1 = ENTRY_NPOS(in1, ptr1), + npos2 = ENTRY_NPOS(in2, ptr2); + WordEntryPos *apos; + + new_lex = tsvector_addlexeme(out, i, &dataoff, lex, lex_len, NULL, 0); + apos = POSDATAPTR(new_lex, lex_len); + + if (npos1 || npos2) { - if (ptr1->haspos) - { - dataoff = SHORTALIGN(dataoff); - memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); - dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); - if (ptr2->haspos) - dataoff += add_pos(in2, ptr2, out, ptr, maxpos) * sizeof(WordEntryPos); - } - else /* must have ptr2->haspos */ + int addlen; + char *lex2 = STRPTR(in2) + pos2; + + ptr = UNWRAP_ENTRY(out, ARRPTR(out) + i); + if (npos1) { - int addlen = add_pos(in2, ptr2, out, ptr, maxpos); + /* add positions from left tsvector */ + addlen = add_pos(lex, UNWRAP_ENTRY(in1, ptr1), apos, 0, 0); + ptr->npos = addlen; - if (addlen == 0) - ptr->haspos = 0; - else + if (npos2) { - dataoff = SHORTALIGN(dataoff); - dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16); + /* add positions from right right tsvector */ + addlen = add_pos(lex2, UNWRAP_ENTRY(in2, ptr2), apos, addlen, maxpos); + ptr->npos += addlen; } } + else /* npos in second should be > 0 */ + { + /* add positions from right tsvector */ + addlen = add_pos(lex2, UNWRAP_ENTRY(in2, ptr2), apos, 0, maxpos); + ptr->npos = addlen; + } + + dataoff = SHORTALIGN(dataoff); + dataoff += ptr->npos * sizeof(WordEntryPos); } - ptr++; - ptr1++; - ptr2++; + INCRPTR(in1, ptr1, pos1); + INCRPTR(in2, ptr2, pos2); + i++; i1--; i2--; } @@ -1082,45 +1215,44 @@ tsvector_concat(PG_FUNCTION_ARGS) while (i1) { - ptr->haspos = ptr1->haspos; - ptr->len = ptr1->len; 
- memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len); - ptr->pos = dataoff; - dataoff += ptr1->len; - if (ptr->haspos) - { - dataoff = SHORTALIGN(dataoff); - memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); - dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); - } + char *lex = STRPTR(in1) + pos1; + int lex_len = ENTRY_LEN(in1, ptr1); - ptr++; - ptr1++; + tsvector_addlexeme(out, i, &dataoff, + lex, lex_len, + POSDATAPTR(lex, lex_len), ENTRY_NPOS(in1, ptr1)); + + INCRPTR(in1, ptr1, pos1); + i++; i1--; } while (i2) { - ptr->haspos = ptr2->haspos; - ptr->len = ptr2->len; - memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len); - ptr->pos = dataoff; - dataoff += ptr2->len; - if (ptr->haspos) + char *lex = STRPTR(in2) + pos2, + *new_lex; + int lex_len = ENTRY_LEN(in2, ptr2), + npos = ENTRY_NPOS(in2, ptr2); + + new_lex = tsvector_addlexeme(out, i, &dataoff, lex, lex_len, NULL, 0); + if (npos > 0) { - int addlen = add_pos(in2, ptr2, out, ptr, maxpos); + int addlen; + WordEntryPos *apos = POSDATAPTR(new_lex, lex_len); - if (addlen == 0) - ptr->haspos = 0; - else + addlen = add_pos(lex, UNWRAP_ENTRY(in2, ptr2), apos, 0, maxpos); + if (addlen > 0) { + WordEntry *ptr = UNWRAP_ENTRY(out, ARRPTR(out) + i); + + ptr->npos = addlen; dataoff = SHORTALIGN(dataoff); - dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16); + dataoff += npos * sizeof(WordEntryPos); } } - ptr++; - ptr2++; + INCRPTR(in2, ptr2, pos2); + i++; i2--; } @@ -1137,12 +1269,10 @@ tsvector_concat(PG_FUNCTION_ARGS) * Adjust sizes (asserting that we didn't overrun the original estimates) * and collapse out any unused array entries. 
*/ - output_size = ptr - ARRPTR(out); - Assert(output_size <= out->size); - out->size = output_size; + TS_SETCOUNT(out, i); if (data != STRPTR(out)) memmove(STRPTR(out), data, dataoff); - output_bytes = CALCDATASIZE(out->size, dataoff); + output_bytes = CALCDATASIZE(TS_COUNT(out), dataoff); Assert(output_bytes <= VARSIZE(out)); SET_VARSIZE(out, output_bytes); @@ -1194,35 +1324,26 @@ tsCompareString(char *a, int lena, char *b, int lenb, bool prefix) * Check weight info or/and fill 'data' with the required positions */ static bool -checkclass_str(CHKVAL *chkval, WordEntry *entry, QueryOperand *val, +checkclass_str(WordEntryPos *pv, int npos, QueryOperand *val, ExecPhraseData *data) { bool result = false; - if (entry->haspos && (val->weight || data)) + if (npos && (val->weight || data)) { - WordEntryPosVector *posvec; - - /* - * We can't use the _POSVECPTR macro here because the pointer to the - * tsvector's lexeme storage is already contained in chkval->values. - */ - posvec = (WordEntryPosVector *) - (chkval->values + SHORTALIGN(entry->pos + entry->len)); - if (val->weight && data) { - WordEntryPos *posvec_iter = posvec->pos; + WordEntryPos *posvec_iter = pv; WordEntryPos *dptr; /* * Filter position information by weights */ - dptr = data->pos = palloc(sizeof(WordEntryPos) * posvec->npos); + dptr = data->pos = palloc(sizeof(WordEntryPos) * npos); data->allocated = true; /* Is there a position with a matching weight? */ - while (posvec_iter < posvec->pos + posvec->npos) + while (posvec_iter < (pv + npos)) { /* If true, append this position to the data->pos */ if (val->weight & (1 << WEP_GETWEIGHT(*posvec_iter))) @@ -1241,10 +1362,10 @@ checkclass_str(CHKVAL *chkval, WordEntry *entry, QueryOperand *val, } else if (val->weight) { - WordEntryPos *posvec_iter = posvec->pos; + WordEntryPos *posvec_iter = pv; /* Is there a position with a matching weight? 
*/ - while (posvec_iter < posvec->pos + posvec->npos) + while (posvec_iter < (pv + npos)) { if (val->weight & (1 << WEP_GETWEIGHT(*posvec_iter))) { @@ -1257,8 +1378,8 @@ checkclass_str(CHKVAL *chkval, WordEntry *entry, QueryOperand *val, } else /* data != NULL */ { - data->npos = posvec->npos; - data->pos = posvec->pos; + data->npos = npos; + data->pos = pv; data->allocated = false; result = true; } @@ -1311,26 +1432,32 @@ static bool checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data) { CHKVAL *chkval = (CHKVAL *) checkval; - WordEntry *StopLow = chkval->arrb; - WordEntry *StopHigh = chkval->arre; - WordEntry *StopMiddle = StopHigh; + int StopLow = chkval->bidx; + int StopHigh = chkval->eidx; + int StopMiddle = StopHigh; int difference = -1; bool res = false; + char *lexeme; + WordEntry *entry; /* Loop invariant: StopLow <= val < StopHigh */ while (StopLow < StopHigh) { StopMiddle = StopLow + (StopHigh - StopLow) / 2; + lexeme = tsvector_getlexeme(chkval->vec, StopMiddle, &entry); + + Assert(!entry->hasoff); difference = tsCompareString(chkval->operand + val->distance, val->length, - chkval->values + StopMiddle->pos, - StopMiddle->len, + lexeme, + entry->len, false); if (difference == 0) { /* Check weight info & fill 'data' with positions */ - res = checkclass_str(chkval, StopMiddle, val, data); + res = checkclass_str(POSDATAPTR(lexeme, entry->len), + entry->npos, val, data); break; } else if (difference > 0) @@ -1352,19 +1479,31 @@ checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data) if (StopLow >= StopHigh) StopMiddle = StopHigh; - while ((!res || data) && StopMiddle < chkval->arre && - tsCompareString(chkval->operand + val->distance, - val->length, - chkval->values + StopMiddle->pos, - StopMiddle->len, - true) == 0) + while ((!res || data) && StopMiddle < chkval->eidx) { + char *lexeme; + int cmp; + WordEntryPos *pv; + + lexeme = tsvector_getlexeme(chkval->vec, StopMiddle, &entry); + + Assert(!entry->hasoff); + pv 
= POSDATAPTR(lexeme, entry->len); + cmp = tsCompareString(chkval->operand + val->distance, + val->length, + lexeme, + entry->len, + true); + + if (cmp != 0) + break; + if (data) { /* * We need to join position information */ - res = checkclass_str(chkval, StopMiddle, val, data); + res = checkclass_str(pv, entry->npos, val, data); if (res) { @@ -1388,7 +1527,7 @@ checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data) } else { - res = checkclass_str(chkval, StopMiddle, val, NULL); + res = checkclass_str(pv, entry->npos, val, NULL); } StopMiddle++; @@ -1935,9 +2074,9 @@ ts_match_vq(PG_FUNCTION_ARGS) PG_RETURN_BOOL(false); } - chkval.arrb = ARRPTR(val); - chkval.arre = chkval.arrb + val->size; - chkval.values = STRPTR(val); + chkval.bidx = 0; + chkval.eidx = TS_COUNT(val); + chkval.vec = val; chkval.operand = GETOPERAND(query); result = TS_execute(GETQUERY(query), &chkval, @@ -2001,12 +2140,15 @@ ts_match_tq(PG_FUNCTION_ARGS) * that have a weight equal to one of the weights in 'weight' bitmask. 
*/ static int -check_weight(TSVector txt, WordEntry *wptr, int8 weight) +check_weight(char *lexeme, WordEntry *wptr, int8 weight) { - int len = POSDATALEN(txt, wptr); + int len; int num = 0; - WordEntryPos *ptr = POSDATAPTR(txt, wptr); + WordEntryPos *ptr; + Assert(!wptr->hasoff); + len = wptr->len; + ptr = POSDATAPTR(lexeme, len); while (len--) { if (weight & (1 << WEP_GETWEIGHT(*ptr))) @@ -2017,31 +2159,34 @@ check_weight(TSVector txt, WordEntry *wptr, int8 weight) } #define compareStatWord(a,e,t) \ - tsCompareString((a)->lexeme, (a)->lenlexeme, \ - STRPTR(t) + (e)->pos, (e)->len, \ - false) + (tsCompareString((a)->lexeme, (a)->lenlexeme, \ + t, (e)->len, false)) static void insertStatEntry(MemoryContext persistentContext, TSVectorStat *stat, TSVector txt, uint32 off) { - WordEntry *we = ARRPTR(txt) + off; + WordEntry *we; StatEntry *node = stat->root, *pnode = NULL; int n, res = 0; uint32 depth = 1; + char *lexeme; + + lexeme = tsvector_getlexeme(txt, off, &we); + Assert(!we->hasoff); if (stat->weight == 0) - n = (we->haspos) ? POSDATALEN(txt, we) : 1; + n = (we->npos) ? we->npos : 1; else - n = (we->haspos) ? check_weight(txt, we, stat->weight) : 0; + n = (we->npos) ? 
check_weight(lexeme, we, stat->weight) : 0; if (n == 0) return; /* nothing to insert */ while (node) { - res = compareStatWord(node, we, txt); + res = compareStatWord(node, we, lexeme); if (res == 0) { @@ -2065,7 +2210,7 @@ insertStatEntry(MemoryContext persistentContext, TSVectorStat *stat, TSVector tx node->ndoc = 1; node->nentry = n; node->lenlexeme = we->len; - memcpy(node->lexeme, STRPTR(txt) + we->pos, node->lenlexeme); + memcpy(node->lexeme, lexeme, node->lenlexeme); if (pnode == NULL) { @@ -2092,13 +2237,14 @@ chooseNextStatEntry(MemoryContext persistentContext, TSVectorStat *stat, TSVecto uint32 low, uint32 high, uint32 offset) { uint32 pos; - uint32 middle = (low + high) >> 1; + uint32 middle = (low + high) >> 1, + count = TS_COUNT(txt); pos = (low + middle) >> 1; - if (low != middle && pos >= offset && pos - offset < txt->size) + if (low != middle && pos >= offset && pos - offset < count) insertStatEntry(persistentContext, stat, txt, pos - offset); pos = (high + middle + 1) >> 1; - if (middle + 1 != high && pos >= offset && pos - offset < txt->size) + if (middle + 1 != high && pos >= offset && pos - offset < count) insertStatEntry(persistentContext, stat, txt, pos - offset); if (low != middle) @@ -2125,7 +2271,8 @@ ts_accum(MemoryContext persistentContext, TSVectorStat *stat, Datum data) TSVector txt = DatumGetTSVector(data); uint32 i, nbit = 0, - offset; + offset, + count = TS_COUNT(txt); if (stat == NULL) { /* Init in first */ @@ -2134,19 +2281,19 @@ ts_accum(MemoryContext persistentContext, TSVectorStat *stat, Datum data) } /* simple check of correctness */ - if (txt == NULL || txt->size == 0) + if (txt == NULL || count == 0) { if (txt && txt != (TSVector) DatumGetPointer(data)) pfree(txt); return stat; } - i = txt->size - 1; + i = count - 1; for (; i > 0; i >>= 1) nbit++; nbit = 1 << nbit; - offset = (nbit - txt->size) / 2; + offset = (nbit - count) / 2; insertStatEntry(persistentContext, stat, txt, (nbit >> 1) - offset); 
chooseNextStatEntry(persistentContext, stat, txt, 0, nbit, offset); @@ -2579,15 +2726,28 @@ tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column) } /* make tsvector value */ - datum = TSVectorGetDatum(make_tsvector(&prs)); - isnull = false; - - /* and insert it into tuple */ - rettuple = heap_modify_tuple_by_cols(rettuple, rel->rd_att, - 1, &tsvector_attr_num, - &datum, &isnull); - - pfree(DatumGetPointer(datum)); + if (prs.curwords) + { + datum = PointerGetDatum(make_tsvector(&prs)); + isnull = false; + rettuple = heap_modify_tuple_by_cols(rettuple, rel->rd_att, + 1, &tsvector_attr_num, + &datum, &isnull); + pfree(DatumGetPointer(datum)); + } + else + { + TSVector out = palloc(CALCDATASIZE(0, 0)); + + SET_VARSIZE(out, CALCDATASIZE(0, 0)); + TS_SETCOUNT(out, 0); + datum = PointerGetDatum(out); + isnull = false; + rettuple = heap_modify_tuple_by_cols(rettuple, rel->rd_att, + 1, &tsvector_attr_num, + &datum, &isnull); + pfree(prs.words); + } return PointerGetDatum(rettuple); } diff --git a/src/include/tsearch/ts_type.h b/src/include/tsearch/ts_type.h index 30d7c4bccd..eb94c595f2 100644 --- a/src/include/tsearch/ts_type.h +++ b/src/include/tsearch/ts_type.h @@ -24,30 +24,40 @@ * 2) int32 size - number of lexemes (WordEntry array entries) * 3) Array of WordEntry - one per lexeme; must be sorted according to * tsCompareString() (ie, memcmp of lexeme strings). - * WordEntry->pos gives the number of bytes from end of WordEntry - * array to start of lexeme's string, which is of length len. + * A WordEntry is either an offset or metadata (lexeme length and number + * of positions). An offset entry's metadata is stored at that offset. 
* 4) Per-lexeme data storage: - * lexeme string (not null-terminated) - * if haspos is true: + * [4-byte aligned WordEntry] (if its WordEntry has offset) + * 2-byte aligned lexeme string (not null-terminated) + * if it has positions: * padding byte if necessary to make the position data 2-byte aligned - * uint16 number of positions that follow * WordEntryPos[] positions * * The positions for each lexeme must be sorted. * - * Note, tsvectorsend/recv believe that sizeof(WordEntry) == 4 + * Note, tsvector functions believe that sizeof(WordEntry) == 4 */ -typedef struct +#define TS_OFFSET_STRIDE 4 + +typedef union { - uint32 - haspos:1, - len:11, /* MAX 2Kb */ - pos:20; /* MAX 1Mb */ + struct + { + uint32 hasoff:1, + offset:31; + }; + struct + { + uint32 hasoff_:1, + len:11, + npos:16, + _unused:4; + }; } WordEntry; #define MAXSTRLEN ( (1<<11) - 1) -#define MAXSTRPOS ( (1<<20) - 1) +#define MAXSTRPOS ( (1<<30) - 1) extern int compareWordEntryPos(const void *a, const void *b); @@ -62,19 +72,6 @@ extern int compareWordEntryPos(const void *a, const void *b); typedef uint16 WordEntryPos; -typedef struct -{ - uint16 npos; - WordEntryPos pos[FLEXIBLE_ARRAY_MEMBER]; -} WordEntryPosVector; - -/* WordEntryPosVector with exactly 1 entry */ -typedef struct -{ - uint16 npos; - WordEntryPos pos[1]; -} WordEntryPosVector1; - #define WEP_GETWEIGHT(x) ( (x) >> 14 ) #define WEP_GETPOS(x) ( (x) & 0x3fff ) @@ -90,13 +87,17 @@ typedef struct typedef struct { int32 vl_len_; /* varlena header (do not touch directly!) 
*/ - int32 size; + int32 size_; /* flags and lexemes count */ WordEntry entries[FLEXIBLE_ARRAY_MEMBER]; /* lexemes follow the entries[] array */ } TSVectorData; typedef TSVectorData *TSVector; +#define TS_FLAG_STRETCHED 0x80000000 +#define TS_COUNT(t) ((t)->size_ & 0x0FFFFFFF) +#define TS_SETCOUNT(t,c) ((t)->size_ = (c) | TS_FLAG_STRETCHED) + #define DATAHDRSIZE (offsetof(TSVectorData, entries)) #define CALCDATASIZE(nentries, lenstr) (DATAHDRSIZE + (nentries) * sizeof(WordEntry) + (lenstr) ) @@ -104,24 +105,65 @@ typedef TSVectorData *TSVector; #define ARRPTR(x) ( (x)->entries ) /* pointer to start of a tsvector's lexeme storage */ -#define STRPTR(x) ( (char *) &(x)->entries[(x)->size] ) +#define STRPTR(x) ( (char *) &(x)->entries[TS_COUNT(x)] ) -#define _POSVECPTR(x, e) ((WordEntryPosVector *)(STRPTR(x) + SHORTALIGN((e)->pos + (e)->len))) -#define POSDATALEN(x,e) ( ( (e)->haspos ) ? (_POSVECPTR(x,e)->npos) : 0 ) -#define POSDATAPTR(x,e) (_POSVECPTR(x,e)->pos) +/* for WordEntry with offset return its WordEntry with other properties */ +#define UNWRAP_ENTRY(x,we) \ + ((we)->hasoff? 
(WordEntry *)(STRPTR(x) + (we)->offset): (we)) + +/* + * helpers used when we're not sure that WordEntry + * contains either offset or len + */ +#define ENTRY_NPOS(x,we) (UNWRAP_ENTRY(x,we)->npos) +#define ENTRY_LEN(x,we) (UNWRAP_ENTRY(x,we)->len) + +/* pointer to start of positions */ +#define POSDATAPTR(lex, len) ((WordEntryPos *) (lex + SHORTALIGN(len))) + +/* set default offset in tsvector data */ +#define INITPOS(p) ((p) = sizeof(WordEntry)) + +/* increment entry and offset by given WordEntry */ +#define INCRPTR(x,w,p) \ +do { \ + WordEntry *y = (w); \ + if ((w)->hasoff) \ + { \ + y = (WordEntry *) (STRPTR(x) + (w)->offset); \ + (p) = (w)->offset + sizeof(WordEntry); \ + } \ + (w)++; \ + Assert(!y->hasoff); \ + (p) += SHORTALIGN(y->len) + y->npos * sizeof(WordEntryPos); \ + if ((w) - ARRPTR(x) < TS_COUNT(x) && w->hasoff) \ + (p) = INTALIGN(p) + sizeof(WordEntry); \ +} while (0); + +/* used to calculate tsvector size in tsvector constructors */ +#define INCRSIZE(s,i,l,n) /* size,index,len,npos */ \ +do { \ + if ((i) % TS_OFFSET_STRIDE == 0) \ + (s) = INTALIGN(s) + sizeof(WordEntry); \ + else \ + (s) = SHORTALIGN(s); \ + (s) += (l); \ + (s) = (n)? 
SHORTALIGN(s) + (n) * sizeof(WordEntryPos) : (s); \ +} while (0); /* * fmgr interface macros */ -#define DatumGetTSVector(X) ((TSVector) PG_DETOAST_DATUM(X)) -#define DatumGetTSVectorCopy(X) ((TSVector) PG_DETOAST_DATUM_COPY(X)) +TSVector tsvector_upgrade(Datum orig, bool copy); + +#define DatumGetTSVector(X) tsvector_upgrade((X), false) +#define DatumGetTSVectorCopy(X) tsvector_upgrade((X), true) #define TSVectorGetDatum(X) PointerGetDatum(X) #define PG_GETARG_TSVECTOR(n) DatumGetTSVector(PG_GETARG_DATUM(n)) #define PG_GETARG_TSVECTOR_COPY(n) DatumGetTSVectorCopy(PG_GETARG_DATUM(n)) #define PG_RETURN_TSVECTOR(x) return TSVectorGetDatum(x) - /* * TSQuery * @@ -239,4 +281,22 @@ typedef TSQueryData *TSQuery; #define PG_GETARG_TSQUERY_COPY(n) DatumGetTSQueryCopy(PG_GETARG_DATUM(n)) #define PG_RETURN_TSQUERY(x) return TSQueryGetDatum(x) +int tsvector_getoffset(TSVector vec, int idx, WordEntry **we); +char *tsvector_addlexeme(TSVector tsv, int idx, int *dataoff, + char *lexeme, int lexeme_len, WordEntryPos *pos, int npos); + +/* Returns lexeme and its entry by given index from TSVector */ +inline static char * +tsvector_getlexeme(TSVector vec, int idx, WordEntry **we) +{ + Assert(idx >= 0 && idx < TS_COUNT(vec)); + + /* + * we do not allow we == NULL because returned lexeme is not \0 ended, and + * always should be used with we->len + */ + Assert(we != NULL); + return STRPTR(vec) + tsvector_getoffset(vec, idx, we); +} + #endif /* _PG_TSTYPE_H_ */
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers