Re: [HACKERS] gsoc, oprrest function for text search take 2

Jan Urbański Thu, 14 Aug 2008 13:28:57 -0700

Jan Urbański wrote:

Heikki Linnakangas wrote:
Jan Urbański wrote:
So right now the idea is to:
 (1) pre-sort STATISTIC_KIND_MCELEM values
 (2) build an array of pointers to detoasted values in tssel()
 (3) use binary search when looking for MCELEMs during tsquery analysis
Sounds like a plan. In (2), it's even better to detoast the valueslazily. For a typical one-word tsquery, the binary search will onlylook at a small portion of the elements.
Here's another version.


Context diff this time, always forget to convert them...

--
Jan Urbanski
GPG key ID: E583D7D2

ouden estin

*** a/src/backend/tsearch/Makefile
--- b/src/backend/tsearch/Makefile
***************
*** 19,25 **** DICTFILES=synonym_sample.syn thesaurus_sample.ths 
hunspell_sample.affix \
  OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \
        dict_simple.o dict_synonym.o dict_thesaurus.o \
        dict_ispell.o regis.o spell.o \
!       to_tsany.o ts_typanalyze.o ts_utils.o
  
  include $(top_srcdir)/src/backend/common.mk
  
--- 19,25 ----
  OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \
        dict_simple.o dict_synonym.o dict_thesaurus.o \
        dict_ispell.o regis.o spell.o \
!       to_tsany.o ts_typanalyze.o ts_selfuncs.o ts_utils.o
  
  include $(top_srcdir)/src/backend/common.mk
  
*** /dev/null
--- b/src/backend/tsearch/ts_selfuncs.c
***************
*** 0 ****
--- 1,320 ----
+ /*-------------------------------------------------------------------------
+  *
+  * ts_selfuncs.c
+  *      Selectivity functions for text search types.
+  *
+  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
+  *
+  *
+  * IDENTIFICATION
+  *      $PostgreSQL$
+  *
+  *-------------------------------------------------------------------------
+  */
+ #include "postgres.h"
+ 
+ #include "miscadmin.h" /* for check_stack_depth() */
+ #include "utils/memutils.h"
+ #include "utils/builtins.h"
+ #include "utils/syscache.h"
+ #include "utils/lsyscache.h"
+ #include "utils/selfuncs.h"
+ #include "catalog/pg_type.h"
+ #include "catalog/pg_statistic.h"
+ #include "nodes/nodes.h"
+ #include "tsearch/ts_type.h"
+ 
+ /* lookup table type for binary searching through MCELEMs */
+ typedef struct
+ {
+       Datum   element;
+       float4  frequency;
+ } TextFreq;
+ 
+ /* type of keys for bsearch()ing through an array of TextFreqs  */
+ typedef struct
+ {
+       char    *lexeme;
+       int             length;
+ } LexemeKey;
+ 
+ static int
+ compare_lexeme_textfreq(const void *e1, const void *e2);
+ 
+ static Selectivity
+ tsquery_opr_selec(QueryItem *item, char *operand, TextFreq *lookup,
+                                 int length, float4 minfreq);
+ static Selectivity
+ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
+                                                  float4 *numbers, int 
nnumbers);
+ static double
+ tsquerysel(VariableStatData *vardata, Datum constval);
+ 
+ 
+ /* TSQuery traversal function */
+ static Selectivity
+ tsquery_opr_selec(QueryItem *item, char *operand, TextFreq *lookup,
+                                 int length, float4 minfreq)
+ {
+       LexemeKey       key;
+       TextFreq        *searchres;
+       Selectivity     s1, s2;
+ 
+       /* since this function recurses, it could be driven to stack overflow */
+       check_stack_depth();
+ 
+       if (item->type == QI_VAL)
+       {
+               QueryOperand *oper = (QueryOperand *) item;
+ 
+               /*
+                * Prepare the key for bsearch().
+                */
+               key.lexeme = operand + oper->distance;
+               key.length = oper->length;
+ 
+               searchres = (TextFreq *) bsearch(&key, lookup, length,
+                                                                               
 sizeof(TextFreq), compare_lexeme_textfreq);
+ 
+               if (searchres)
+               {
+                       /*
+                        * The element is in MCELEM. Return precise selectivity 
(or at
+                        * least as precise, as ANALYZE could find out).
+                        */
+                       return (Selectivity) searchres->frequency;
+               }
+               else
+               {
+                       /*
+                        * The element is not in MCELEM. Punt, but assert that 
the
+                        * selectivity cannot be more than minfreq / 2.
+                        */
+                       return (Selectivity) Min(DEFAULT_TS_SEL, minfreq / 2);
+               }
+       }
+ 
+       /* Current TSQuery node is an operator */
+       switch (item->operator.oper)
+       {
+               case OP_NOT:
+                       return 1.0 - tsquery_opr_selec(item + 1, operand, 
lookup,
+                                                                               
   length, minfreq);
+ 
+               case OP_AND:
+                       return
+                               tsquery_opr_selec(item + 1, operand, lookup, 
length, minfreq) *
+                               tsquery_opr_selec(item + item->operator.left, 
operand, lookup,
+                                                                 length, 
minfreq);
+ 
+               case OP_OR:
+                       s1 = tsquery_opr_selec(item + 1, operand, lookup, 
length, minfreq);
+                       s2 = tsquery_opr_selec(item + item->operator.left, 
operand, lookup,
+                                                                  length, 
minfreq);
+                       return s1 + s2 - s1 * s2;
+ 
+               default:
+                       elog(ERROR, "unrecognized operator: %d", 
item->operator.oper);
+       }
+ 
+       /* never reached, keep compiler happy */
+       return (Selectivity) DEFAULT_TS_SEL;
+ }
+ 
+ /*
+  * Traverse the tsquery preorder, calculating selectivity as:
+  *
+  *   selec(left_oper) * selec(right_oper) in AND nodes,
+  *
+  *   selec(left_oper) + selec(right_oper) -
+  *      selec(left_oper) * selec(right_oper) in OR nodes,
+  *
+  *   1 - select(oper) in NOT nodes
+  *
+  *   freq[val] in VAL nodes, if the value is in MCELEM
+  *   min(freq[MCELEM]) / 2 in VAL nodes, if it is not
+  *
+  *
+  * The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
+  * binary search for determining freq[MCELEM].
+  */
+ static Selectivity
+ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
+                                                  float4 *numbers, int 
nnumbers)
+ {
+       float4                  minfreq;
+       TextFreq                *lookup;
+       Selectivity             selec;
+       int                             i;
+ 
+       /* Grab the lowest frequency */
+ 
+       /*
+         XXX This is no longer easy, need to go through the whole list. Plug it
+         with 2.0 for the moment (so that minfreq / 2 is 1.0)
+ 
+         minfreq = numbers[nnumbers - 1];
+       */
+       minfreq = 2.0;
+ 
+       /* Construct the array for binary search */
+       Assert(nmcelem == nnumbers);
+       lookup = (TextFreq *) palloc(sizeof(TextFreq) * nmcelem);
+       for (i = 0; i < nmcelem; i++)
+       {
+               lookup[i].element = mcelem[i];
+               lookup[i].frequency = numbers[i];
+       }
+ 
+       selec = tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), lookup,
+                                                         nmcelem, minfreq);
+ 
+       pfree(lookup);
+ 
+       return selec;
+ }
+ 
+ /*
+  * bsearch() comparator for a lexeme (non-NULL terminated string with length)
+  * and a TextFreq. Use length, then byte-for-byte comparision, because that's
+  * how ANALYZE code sorted data before storing it in a statistic tuple.
+  * See ts_typanalyze.c for details.
+  */
+ static int
+ compare_lexeme_textfreq(const void *e1, const void *e2)
+ {
+       const LexemeKey *key;
+       const TextFreq  *t;
+       text                    *element;
+       int                             len1,
+                                       len2;
+ 
+       key = (const LexemeKey *) e1;
+       t = (const TextFreq *) e2;
+ 
+       len1 = key->length;
+       len2 = VARSIZE_ANY_EXHDR(t->element);
+ 
+       /* Compare lengths first, possibly avoiding pointless detoasting */
+       if (len1 > len2)
+               return 1;
+       else if (len1 < len2)
+               return -1;
+ 
+       /* Fall back on byte-for-byte comparision, detoasting the Datum first */
+       element = DatumGetTextP(t->element);
+ 
+       return strncmp(key->lexeme, VARDATA_ANY(element), len1);
+ }
+ 
+ /*
+  *    tssel -- Selectivity of "@@"
+  */
+ Datum
+ tssel(PG_FUNCTION_ARGS)
+ {
+       PlannerInfo             *root = (PlannerInfo *) PG_GETARG_POINTER(0);
+       /* We skip arg #2, which is the operator OID - it's gonna be "@@" */
+       List                    *args = (List *) PG_GETARG_POINTER(2);
+       int                             varRelid = PG_GETARG_INT32(3);
+       VariableStatData vardata;
+       Node                    *other;
+       bool                    varonleft;
+       Selectivity             selec;
+ 
+       /*
+        * If expression is not variable op something or something op variable,
+        * then punt and return a default estimate.
+        */
+       if (!get_restriction_variable(root, args, varRelid,
+                                                                 &vardata, 
&other, &varonleft))
+       {
+               PG_RETURN_FLOAT8(DEFAULT_TS_SEL);
+       }
+ 
+       /*
+        * Can't do anything useful if the something is not a constant, either.
+        */
+       if (!IsA(other, Const))
+       {
+               ReleaseVariableStats(vardata);
+               PG_RETURN_FLOAT8(DEFAULT_TS_SEL);
+       }
+ 
+       /* The "@@" operator is strict, so might cope with NULL right away */
+       if (((Const *) other)->constisnull) {
+               ReleaseVariableStats(vardata);
+               PG_RETURN_FLOAT8((float8) 0.0);
+       }
+ 
+       /*
+        * OK, there's a Var and a Const we're dealing with here. We need the 
Var
+        * to be a TSVector (or else we don't have any useful statistic for it).
+        */
+ 
+       if (vardata.vartype == TSVECTOROID)
+       {
+               /* tsvector @@ tsquery or the other way around */
+               Assert(((Const *) other)->consttype == TSQUERYOID);
+ 
+               selec = tsquerysel(&vardata, ((Const *) other)->constvalue);
+       }
+       else
+       {
+               /* The Var is something we don't have useful statistic for */
+               selec = DEFAULT_TS_SEL;
+       }
+ 
+       ReleaseVariableStats(vardata);
+ 
+       PG_RETURN_FLOAT8((float8) selec);
+ }
+ 
+ static Selectivity
+ tsquerysel(VariableStatData *vardata, Datum constval)
+ {
+       Selectivity                     selec;
+ 
+       if (HeapTupleIsValid(vardata->statsTuple))
+       {
+               TSQuery                         query;
+               Form_pg_statistic       stats;
+               Datum                           *values;
+               int                                     nvalues;
+               float4                          *numbers;
+               int                                     nnumbers;
+ 
+               /* The caller made sure the const is a TSQuery, so get it now */
+               query = DatumGetTSQuery(constval);
+ 
+               stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
+ 
+               /* MCELEM will be an array of Text elements for a tsvector 
column */
+               if (get_attstatsslot(vardata->statsTuple,
+                                                        TEXTOID, -1,
+                                                        STATISTIC_KIND_MCELEM, 
InvalidOid,
+                                                        &values, &nvalues,
+                                                        &numbers, &nnumbers))
+               {
+                       /*
+                        * There is a most-common-elements slot for the 
tsvector Var, so
+                        * use that.
+                        */
+ 
+                       selec = mcelem_tsquery_selec(query, values, nvalues,
+                                                                               
 numbers, nnumbers);
+                       free_attstatsslot(TEXTOID, values, nvalues, numbers, 
nnumbers);
+               }
+               else
+               {
+                       /* No most-common-elements slot */
+                       selec = (Selectivity) DEFAULT_TS_SEL;
+               }
+       }
+       else
+       {
+               selec = (Selectivity) DEFAULT_TS_SEL;
+       }
+ 
+       return selec;
+ }
*** a/src/backend/tsearch/ts_typanalyze.c
--- b/src/backend/tsearch/ts_typanalyze.c
***************
*** 43,50 **** static void compute_tsvector_stats(VacAttrStats *stats,
  static void prune_lexemes_hashtable(HTAB *lexemes_tab, int b_current);
  static uint32 lexeme_hash(const void *key, Size keysize);
  static int lexeme_match(const void *key1, const void *key2, Size keysize);
! static int trackitem_compare_desc(const void *e1, const void *e2);
! 
  
  /*
   *    ts_typanalyze -- a custom typanalyze function for tsvector columns
--- 43,51 ----
  static void prune_lexemes_hashtable(HTAB *lexemes_tab, int b_current);
  static uint32 lexeme_hash(const void *key, Size keysize);
  static int lexeme_match(const void *key1, const void *key2, Size keysize);
! static int lexeme_compare(const void *key1, const void *key2);
! static int trackitem_compare_frequencies_desc(const void *e1, const void *e2);
! static int trackitem_compare_lexemes(const void *e1, const void *e2);
  
  /*
   *    ts_typanalyze -- a custom typanalyze function for tsvector columns
***************
*** 273,279 **** compute_tsvector_stats(VacAttrStats *stats,
                Assert(i == track_len);
  
                qsort(sort_table, track_len, sizeof(TrackItem *),
!                         trackitem_compare_desc);
  
                /* Suppress any single-occurrence items */
                while (track_len > 0)
--- 274,280 ----
                Assert(i == track_len);
  
                qsort(sort_table, track_len, sizeof(TrackItem *),
!                         trackitem_compare_frequencies_desc);
  
                /* Suppress any single-occurrence items */
                while (track_len > 0)
***************
*** 287,292 **** compute_tsvector_stats(VacAttrStats *stats,
--- 288,309 ----
                if (num_mcelem > track_len)
                        num_mcelem = track_len;
  
+               /*
+                * We want to store statistics sorted on the lexeme value using 
first
+                * length, then byte-for-byte comparision. The reason for doing 
length
+                * comparision first is that we don't care about the ordering 
as long
+                * as it's consistent and comparing lengths first gives us a 
chance to
+                * avoid a strncmp() call.
+                *
+                * This is different from what we do with scalar statistics -- 
they get
+                * sorted on frequencies. The rationale is that we usually 
search
+                * through most common elements looking for a specific value, 
so we can
+                * grab its frequency. When values are presorted we can employ 
binary
+                * search for that. See ts_selfuncs.c for a real usage scenario.
+                */
+               qsort(sort_table, num_mcelem, sizeof(TrackItem *),
+                         trackitem_compare_lexemes);
+ 
                /* Generate MCELEM slot entry */
                if (num_mcelem > 0)
                {
***************
*** 379,403 **** lexeme_hash(const void *key, Size keysize)
  static int
  lexeme_match(const void *key1, const void *key2, Size keysize)
  {
!       const LexemeHashKey *d1 = (const LexemeHashKey *) key1;
!       const LexemeHashKey *d2 = (const LexemeHashKey *) key2;
  
!       /* The lexemes need to have the same length, and be memcmp-equal */
!       if (d1->length == d2->length &&
!               memcmp(d1->lexeme, d2->lexeme, d1->length) == 0)
!               return 0;
!       else
                return 1;
  }
  
  /*
!  *    qsort() comparator for TrackItems - LC style (descending sort)
   */
  static int
! trackitem_compare_desc(const void *e1, const void *e2)
  {
        const TrackItem * const *t1 = (const TrackItem * const *) e1;
        const TrackItem * const *t2 = (const TrackItem * const *) e2;
  
        return (*t2)->frequency - (*t1)->frequency;
  }
--- 396,444 ----
  static int
  lexeme_match(const void *key1, const void *key2, Size keysize)
  {
!       /* The keysize parameter is superfluous, the keys store their lengths */
!       return lexeme_compare(key1, key2);
! }
  
! /*
!  *    Comparision function for lexemes.
!  */
! static int
! lexeme_compare(const void *key1, const void *key2)
! {
!       const LexemeHashKey     *d1 = (const LexemeHashKey *) key1;
!       const LexemeHashKey     *d2 = (const LexemeHashKey *) key2;
! 
!       /* First, compare by length */
!       if (d1->length > d2->length)
                return 1;
+       else if (d1->length < d2->length)
+               return -1;
+       else
+               /* Lengths are equal, do a byte-for-byte comparision */
+               return strncmp(d1->lexeme, d2->lexeme, d1->length);
  }
  
  /*
!  *    qsort() comparator for sorting TrackItems on frequencies (descending 
sort)
   */
  static int
! trackitem_compare_frequencies_desc(const void *e1, const void *e2)
  {
        const TrackItem * const *t1 = (const TrackItem * const *) e1;
        const TrackItem * const *t2 = (const TrackItem * const *) e2;
  
        return (*t2)->frequency - (*t1)->frequency;
  }
+ 
+ /*
+  *    qsort() comparator for sorting TrackItem on lexemes
+  */
+ static int
+ trackitem_compare_lexemes(const void *e1, const void *e2)
+ {
+       const TrackItem * const *t1 = (const TrackItem * const *) e1;
+       const TrackItem * const *t2 = (const TrackItem * const *) e2;
+ 
+       return lexeme_compare(&(*t1)->key, &(*t2)->key);
+ }
*** a/src/include/catalog/pg_operator.h
--- b/src/include/catalog/pg_operator.h
***************
*** 915,924 **** DATA(insert OID = 3630 (  "<>"    PGNSP PGUID b f f 3614       
 3614    16 3630 3629    ts
  DATA(insert OID = 3631 (  ">="           PGNSP PGUID b f f 3614        3614   
 16 3628 3627    tsvector_ge scalargtsel scalargtjoinsel ));
  DATA(insert OID = 3632 (  ">"    PGNSP PGUID b f f 3614        3614    16 
3627 3628    tsvector_gt scalargtsel scalargtjoinsel ));
  DATA(insert OID = 3633 (  "||"           PGNSP PGUID b f f 3614        3614   
 3614  0        0        tsvector_concat   -    -         ));
! DATA(insert OID = 3636 (  "@@"           PGNSP PGUID b f f 3614        3615   
 16 3637        0        ts_match_vq   contsel     contjoinsel   ));
! DATA(insert OID = 3637 (  "@@"           PGNSP PGUID b f f 3615        3614   
 16 3636        0        ts_match_qv   contsel     contjoinsel   ));
! DATA(insert OID = 3660 (  "@@@"    PGNSP PGUID b f f 3614      3615    16 
3661        0        ts_match_vq   contsel     contjoinsel   ));
! DATA(insert OID = 3661 (  "@@@"    PGNSP PGUID b f f 3615      3614    16 
3660        0        ts_match_qv   contsel     contjoinsel   ));
  DATA(insert OID = 3674 (  "<"    PGNSP PGUID b f f 3615        3615    16 
3679 3678    tsquery_lt scalarltsel scalarltjoinsel ));
  DATA(insert OID = 3675 (  "<="           PGNSP PGUID b f f 3615        3615   
 16 3678 3679    tsquery_le scalarltsel scalarltjoinsel ));
  DATA(insert OID = 3676 (  "="    PGNSP PGUID b t f 3615        3615    16 
3676 3677    tsquery_eq eqsel eqjoinsel ));
--- 915,924 ----
  DATA(insert OID = 3631 (  ">="           PGNSP PGUID b f f 3614        3614   
 16 3628 3627    tsvector_ge scalargtsel scalargtjoinsel ));
  DATA(insert OID = 3632 (  ">"    PGNSP PGUID b f f 3614        3614    16 
3627 3628    tsvector_gt scalargtsel scalargtjoinsel ));
  DATA(insert OID = 3633 (  "||"           PGNSP PGUID b f f 3614        3614   
 3614  0        0        tsvector_concat   -    -         ));
! DATA(insert OID = 3636 (  "@@"           PGNSP PGUID b f f 3614        3615   
 16 3637        0        ts_match_vq   tssel       contjoinsel   ));
! DATA(insert OID = 3637 (  "@@"           PGNSP PGUID b f f 3615        3614   
 16 3636        0        ts_match_qv   tssel       contjoinsel   ));
! DATA(insert OID = 3660 (  "@@@"    PGNSP PGUID b f f 3614      3615    16 
3661        0        ts_match_vq   tssel       contjoinsel   ));
! DATA(insert OID = 3661 (  "@@@"    PGNSP PGUID b f f 3615      3614    16 
3660        0        ts_match_qv   tssel       contjoinsel   ));
  DATA(insert OID = 3674 (  "<"    PGNSP PGUID b f f 3615        3615    16 
3679 3678    tsquery_lt scalarltsel scalarltjoinsel ));
  DATA(insert OID = 3675 (  "<="           PGNSP PGUID b f f 3615        3615   
 16 3678 3679    tsquery_le scalarltsel scalarltjoinsel ));
  DATA(insert OID = 3676 (  "="    PGNSP PGUID b t f 3615        3615    16 
3676 3677    tsquery_eq eqsel eqjoinsel ));
*** a/src/include/catalog/pg_proc.h
--- b/src/include/catalog/pg_proc.h
***************
*** 4321,4326 **** DESCR("GiST tsquery support");
--- 4321,4328 ----
  DATA(insert OID = 3701 (  gtsquery_consistent                 PGNSP PGUID 12 
1 0 0 f f t f i 5 16 "2281 2281 23 26 2281" _null_ _null_ _null_ 
gtsquery_consistent _null_ _null_ _null_ ));
  DESCR("GiST tsquery support");
  
+ DATA(insert OID = 3687 (  tssel       PGNSP PGUID 12 1 0 0 f f t f s 4 701 
"2281 26 2281 23" _null_ _null_ _null_ tssel _null_ _null_ _null_ ));
+ DESCR("restriction selectivity of tsvector @@ tsquery");
  DATA(insert OID = 3688 (  ts_typanalyze       PGNSP PGUID 12 1 0 0 f f t f s 
1 16 "2281" _null_ _null_ _null_ ts_typanalyze _null_ _null_ _null_ ));
  DESCR("tsvector typanalyze");
  
*** a/src/include/tsearch/ts_type.h
--- b/src/include/tsearch/ts_type.h
***************
*** 155,160 **** extern Datum ts_rankcd_wttf(PG_FUNCTION_ARGS);
--- 155,162 ----
  
  extern Datum ts_typanalyze(PG_FUNCTION_ARGS);
  
+ extern Datum tssel(PG_FUNCTION_ARGS);
+ 
  
  /*
   * TSQuery
***************
*** 291,294 **** extern Datum tsquery_rewrite_query(PG_FUNCTION_ARGS);
--- 293,309 ----
  extern Datum tsq_mcontains(PG_FUNCTION_ARGS);
  extern Datum tsq_mcontained(PG_FUNCTION_ARGS);
  
+ /*
+  * The default text search selectivity is chosen to be smll enough to
+  * encourage indexscans for typical table densities. See selfuncs.h and
+  * DEFAULT_EQ_SEL for details.
+  */
+ #define DEFAULT_TS_SEL 0.005
+ 
+ /*
+  * operator restriction function for tsvector @@ tsquery and
+  * tsquery @@ tsvector
+  */
+ extern Datum tssel(PG_FUNCTION_ARGS);
+ 
  #endif   /* _PG_TSTYPE_H_ */

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Re: [HACKERS] gsoc, oprrest function for text search take 2

Reply via email to