Here's a WIP patch implementing an oprrest function for tsvector @@ tsquery and tsquery @@ tsvector.

The idea is (quoting a comment)
/*
 *  Traverse the tsquery preorder, calculating selectivity as:
 *
 *   selec(left_oper) * selec(right_oper) in AND nodes,
 *
 *   selec(left_oper) + selec(right_oper) -
 *      selec(left_oper) * selec(right_oper) in OR nodes,
 *
 *   1 - select(oper) in NOT nodes
 *
 *   freq[val] in VAL nodes, if the value is in MCELEM
 *   min(freq[MCELEM]) / 2 in VAL nodes, if it is not
 *
 *
 * Implementation-wise, we sort the MCELEM array to use binary
 * search on it.
 */

The patch still has many rough edges, but it applies to HEAD and passes tests. I'm posting it mostly to get feedback about whether I'm going in the right direction.

Cheers,
Jan

--
Jan Urbanski
GPG key ID: E583D7D2

ouden estin
diff --git a/src/backend/tsearch/Makefile b/src/backend/tsearch/Makefile
index e20a4a2..ba728eb 100644
*** a/src/backend/tsearch/Makefile
--- b/src/backend/tsearch/Makefile
***************
*** 19,25 ****
  OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \
        dict_simple.o dict_synonym.o dict_thesaurus.o \
        dict_ispell.o regis.o spell.o \
!       to_tsany.o ts_typanalyze.o ts_utils.o
  
  include $(top_srcdir)/src/backend/common.mk
  
--- 19,25 ----
  OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \
        dict_simple.o dict_synonym.o dict_thesaurus.o \
        dict_ispell.o regis.o spell.o \
!       to_tsany.o ts_typanalyze.o ts_selfuncs.o ts_utils.o
  
  include $(top_srcdir)/src/backend/common.mk
  
diff --git a/src/backend/tsearch/ts_selfuncs.c 
b/src/backend/tsearch/ts_selfuncs.c
new file mode 100644
index 0000000..4b86072
*** /dev/null
--- b/src/backend/tsearch/ts_selfuncs.c
***************
*** 0,-1 ****
--- 1,299 ----
+ /*-------------------------------------------------------------------------
+  *
+  * ts_selfuncs.c
+  *      Selectivity functions for text search types.
+  *
+  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
+  *
+  *
+  * IDENTIFICATION
+  *      $PostgreSQL$
+  *
+  *-------------------------------------------------------------------------
+  */
+ #include "postgres.h"
+ 
+ #include "miscadmin.h" /* for check_stack_depth() */
+ #include "utils/memutils.h"
+ #include "utils/builtins.h"
+ #include "utils/syscache.h"
+ #include "utils/lsyscache.h"
+ #include "utils/selfuncs.h"
+ #include "catalog/pg_type.h"
+ #include "catalog/pg_statistic.h"
+ #include "nodes/nodes.h"
+ #include "tsearch/ts_type.h"
+ 
+ /* lookup table type for binary searching through MCELEMs */
+ typedef struct
+ {
+       Datum   element;
+       float4  frequency;
+ } TextFreq;
+ 
+ static int
+ compare_textfreq(const void *e1, const void *e2);
+ static Selectivity
+ tsquery_opr_selec(QueryItem *item, char *operand, TextFreq *lookup,
+                                 int length, float4 minfreq);
+ static Selectivity
+ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
+                                                  float4 *numbers, int 
nnumbers);
+ static double
+ tsquerysel(VariableStatData *vardata, Datum constval);
+ 
+ 
+ /* TSQuery traversal function */
+ static Selectivity
+ tsquery_opr_selec(QueryItem *item, char *operand, TextFreq *lookup,
+                                 int length, float4 minfreq)
+ {
+       TextFreq        key,
+                               *searchres;
+       Selectivity     s1, s2;
+ 
+       /* since this function recurses, it could be driven to stack overflow */
+       check_stack_depth();
+ 
+       if (item->type == QI_VAL)
+       {
+               QueryOperand *oper = (QueryOperand *) item;
+ 
+               /*
+                * Prepare the key for bsearch(). No need to initialize 
key.frequency,
+                * because sorting is only on key.element.
+                */
+               key.element = PointerGetDatum(
+                       cstring_to_text_with_len(operand + oper->distance, 
oper->length));
+ 
+               searchres = (TextFreq *) bsearch(&key, lookup, length,
+                                                                               
 sizeof(TextFreq), compare_textfreq);
+               if (searchres)
+               {
+                       /*
+                        * The element is in MCELEM. Return precise selectivity 
(or at
+                        * least as precise, as ANALYZE could find out).
+                        */
+                       return (Selectivity) searchres->frequency;
+               }
+               else
+               {
+                       /*
+                        * The element is not in MCELEM. Punt, but  assure that 
the
+                        * selectivity cannot be more than minfreq / 2.
+                        */
+                       return (Selectivity) Min(DEFAULT_TS_SEL, minfreq / 2);
+               }
+       }
+ 
+       /* Current TSQuery node is an operator */
+       switch (item->operator.oper)
+       {
+               case OP_NOT:
+                       return 1.0 - tsquery_opr_selec(item + 1, operand, 
lookup,
+                                                                               
   length, minfreq);
+ 
+               case OP_AND:
+                       return
+                               tsquery_opr_selec(item + 1, operand, lookup, 
length, minfreq) *
+                               tsquery_opr_selec(item + item->operator.left, 
operand, lookup,
+                                                                 length, 
minfreq);
+ 
+               case OP_OR:
+                       s1 = tsquery_opr_selec(item + 1, operand, lookup, 
length, minfreq);
+                       s2 = tsquery_opr_selec(item + item->operator.left, 
operand, lookup,
+                                                                  length, 
minfreq);
+                       return s1 + s2 - s1 * s2;
+ 
+               default:
+                       elog(ERROR, "unrecognized operator: %d", 
item->operator.oper);
+       }
+ 
+       /* never reached, keep compiler happy */
+       return (Selectivity) DEFAULT_TS_SEL;
+ }
+ 
+ /*
+  *  Traverse the tsquery preorder, calculating selectivity as:
+  *
+  *   selec(left_oper) * selec(right_oper) in AND nodes,
+  *
+  *   selec(left_oper) + selec(right_oper) -
+  *      selec(left_oper) * selec(right_oper) in OR nodes,
+  *
+  *   1 - select(oper) in NOT nodes
+  *
+  *   freq[val] in VAL nodes, if the value is in MCELEM
+  *   min(freq[MCELEM]) / 2 in VAL nodes, if it is not
+  *
+  *
+  * Implementation-wise, we sort the MCELEM array to use binary
+  * search on it.
+  */
+ static Selectivity
+ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
+                                                  float4 *numbers, int 
nnumbers)
+ {
+       float4                  minfreq;
+       TextFreq                *lookup;
+       Selectivity             selec;
+       MemoryContext   opr_context,
+                                       old_context;
+       int                             i;
+ 
+       /* Grab the lowest frequency */
+       minfreq = numbers[nnumbers - 1];
+ 
+       /*
+        * Compute selectivity in child context, because we will possibly do a 
lot
+        * of QueryOperand to text transformations, which palloc() memory
+        */
+       opr_context = AllocSetContextCreate(CurrentMemoryContext,
+                                                                               
"Text search restriction",
+                                                                               
ALLOCSET_DEFAULT_MINSIZE,
+                                                                               
ALLOCSET_DEFAULT_INITSIZE,
+                                                                               
ALLOCSET_DEFAULT_MAXSIZE);
+       old_context = MemoryContextSwitchTo(opr_context);
+ 
+ 
+       /* Construct the array to be qsort()'ed */
+       Assert(nmcelem == nnumbers);
+       lookup = (TextFreq *) palloc(sizeof(TextFreq) * nmcelem);
+       for (i = 0; i < nmcelem; i++)
+       {
+               lookup[i].element = mcelem[i];
+               lookup[i].frequency = numbers[i];
+       }
+ 
+       qsort(lookup, nmcelem, sizeof(TextFreq), compare_textfreq);
+ 
+       selec = tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), lookup,
+                                                         nmcelem, minfreq);
+ 
+       MemoryContextSwitchTo(old_context);
+       MemoryContextDelete(opr_context);
+ 
+       return selec;
+ }
+ 
+ static int
+ compare_textfreq(const void *e1, const void *e2)
+ {
+       const TextFreq *t1 = (const TextFreq *) e1;
+       const TextFreq *t2 = (const TextFreq *) e2;
+ 
+       return DatumGetInt32(DirectFunctionCall2(bttextcmp,
+                                                                               
         t1->element, t2->element));
+ }
+ 
+ /*
+  *    tssel -- Selectivity of "@@"
+  */
+ Datum
+ tssel(PG_FUNCTION_ARGS)
+ {
+       PlannerInfo             *root = (PlannerInfo *) PG_GETARG_POINTER(0);
+       /* We skip arg #2, which is the operator OID - it's gonna be "@@" */
+       List                    *args = (List *) PG_GETARG_POINTER(2);
+       int                             varRelid = PG_GETARG_INT32(3);
+       VariableStatData vardata;
+       Node                    *other;
+       bool                    varonleft;
+       Selectivity             selec;
+ 
+       /*
+        * If expression is not variable op something or something op variable,
+        * then punt and return a default estimate.
+        */
+       if (!get_restriction_variable(root, args, varRelid,
+                                                                 &vardata, 
&other, &varonleft))
+       {
+               PG_RETURN_FLOAT8(DEFAULT_TS_SEL);
+       }
+ 
+       /*
+        * Can't do anything useful if the something is not a constant, either.
+        */
+       if (!IsA(other, Const))
+       {
+               ReleaseVariableStats(vardata);
+               PG_RETURN_FLOAT8(DEFAULT_TS_SEL);
+       }
+ 
+       /* The "@@" operator is strict, so might cope with NULL right away */
+       if (((Const *) other)->constisnull) {
+               ReleaseVariableStats(vardata);
+               PG_RETURN_FLOAT8((float8) 0.0);
+       }
+ 
+       /*
+        * OK, there's a Var and a Const we're dealing with here. We need the 
Var
+        * to be a TSVector (or else we don't have any useful statistic for it).
+        */
+ 
+       if (vardata.vartype == TSVECTOROID)
+       {
+               /* tsvector @@ tsquery or the other way around */
+               Assert(((Const *) other)->consttype == TSQUERYOID);
+ 
+               selec = tsquerysel(&vardata, ((Const *) other)->constvalue);
+       }
+       else
+       {
+               /* The Var is something we don't have useful statistic for */
+               selec = (Selectivity) DEFAULT_TS_SEL;
+       }
+ 
+       ReleaseVariableStats(vardata);
+ 
+       PG_RETURN_FLOAT8((float8) selec);
+ }
+ 
+ static Selectivity
+ tsquerysel(VariableStatData *vardata, Datum constval)
+ {
+       Selectivity                     selec;
+ 
+       if (HeapTupleIsValid(vardata->statsTuple))
+       {
+               TSQuery                         query;
+               Form_pg_statistic       stats;
+               Datum                           *values;
+               int                                     nvalues;
+               float4                          *numbers;
+               int                                     nnumbers;
+ 
+               /* The caller made sure the const is a TSQuery, so get it now */
+               query = DatumGetTSQuery(constval);
+ 
+               stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
+ 
+               /* MCELEM will be an array of Text elements for a tsvector 
column */
+               if (get_attstatsslot(vardata->statsTuple,
+                                                        TEXTOID, -1,
+                                                        STATISTIC_KIND_MCELEM, 
InvalidOid,
+                                                        &values, &nvalues,
+                                                        &numbers, &nnumbers))
+               {
+                       /*
+                        * There is a most-common-elements slot for the 
tsvector Var, so
+                        * use that.
+                        */
+ 
+                       selec = mcelem_tsquery_selec(query, values, nvalues,
+                                                                               
 numbers, nnumbers);
+                       free_attstatsslot(TEXTOID, values, nvalues, numbers, 
nnumbers);
+               }
+               else
+               {
+                       /* No most-common-elements slot */
+                       selec = (Selectivity) DEFAULT_TS_SEL;
+               }
+       }
+       else
+       {
+               selec = (Selectivity) DEFAULT_TS_SEL;
+       }
+ 
+       return selec;
+ }
diff --git a/src/include/catalog/pg_operator.h 
b/src/include/catalog/pg_operator.h
index 1c7f37d..451805d 100644
*** a/src/include/catalog/pg_operator.h
--- b/src/include/catalog/pg_operator.h
***************
*** 915,924 ****
  DATA(insert OID = 3631 (  ">="           PGNSP PGUID b f f 3614        3614   
 16 3628 3627    tsvector_ge scalargtsel scalargtjoinsel ));
  DATA(insert OID = 3632 (  ">"    PGNSP PGUID b f f 3614        3614    16 
3627 3628    tsvector_gt scalargtsel scalargtjoinsel ));
  DATA(insert OID = 3633 (  "||"           PGNSP PGUID b f f 3614        3614   
 3614  0        0        tsvector_concat   -    -         ));
! DATA(insert OID = 3636 (  "@@"           PGNSP PGUID b f f 3614        3615   
 16 3637        0        ts_match_vq   contsel     contjoinsel   ));
! DATA(insert OID = 3637 (  "@@"           PGNSP PGUID b f f 3615        3614   
 16 3636        0        ts_match_qv   contsel     contjoinsel   ));
! DATA(insert OID = 3660 (  "@@@"    PGNSP PGUID b f f 3614      3615    16 
3661        0        ts_match_vq   contsel     contjoinsel   ));
! DATA(insert OID = 3661 (  "@@@"    PGNSP PGUID b f f 3615      3614    16 
3660        0        ts_match_qv   contsel     contjoinsel   ));
  DATA(insert OID = 3674 (  "<"    PGNSP PGUID b f f 3615        3615    16 
3679 3678    tsquery_lt scalarltsel scalarltjoinsel ));
  DATA(insert OID = 3675 (  "<="           PGNSP PGUID b f f 3615        3615   
 16 3678 3679    tsquery_le scalarltsel scalarltjoinsel ));
  DATA(insert OID = 3676 (  "="    PGNSP PGUID b t f 3615        3615    16 
3676 3677    tsquery_eq eqsel eqjoinsel ));
--- 915,924 ----
  DATA(insert OID = 3631 (  ">="           PGNSP PGUID b f f 3614        3614   
 16 3628 3627    tsvector_ge scalargtsel scalargtjoinsel ));
  DATA(insert OID = 3632 (  ">"    PGNSP PGUID b f f 3614        3614    16 
3627 3628    tsvector_gt scalargtsel scalargtjoinsel ));
  DATA(insert OID = 3633 (  "||"           PGNSP PGUID b f f 3614        3614   
 3614  0        0        tsvector_concat   -    -         ));
! DATA(insert OID = 3636 (  "@@"           PGNSP PGUID b f f 3614        3615   
 16 3637        0        ts_match_vq   tssel       contjoinsel   ));
! DATA(insert OID = 3637 (  "@@"           PGNSP PGUID b f f 3615        3614   
 16 3636        0        ts_match_qv   tssel       contjoinsel   ));
! DATA(insert OID = 3660 (  "@@@"    PGNSP PGUID b f f 3614      3615    16 
3661        0        ts_match_vq   tssel       contjoinsel   ));
! DATA(insert OID = 3661 (  "@@@"    PGNSP PGUID b f f 3615      3614    16 
3660        0        ts_match_qv   tssel       contjoinsel   ));
  DATA(insert OID = 3674 (  "<"    PGNSP PGUID b f f 3615        3615    16 
3679 3678    tsquery_lt scalarltsel scalarltjoinsel ));
  DATA(insert OID = 3675 (  "<="           PGNSP PGUID b f f 3615        3615   
 16 3678 3679    tsquery_le scalarltsel scalarltjoinsel ));
  DATA(insert OID = 3676 (  "="    PGNSP PGUID b t f 3615        3615    16 
3676 3677    tsquery_eq eqsel eqjoinsel ));
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index 16ccb55..ef24f40 100644
*** a/src/include/catalog/pg_proc.h
--- b/src/include/catalog/pg_proc.h
***************
*** 4321,4326 ****
--- 4321,4328 ----
  DATA(insert OID = 3701 (  gtsquery_consistent                 PGNSP PGUID 12 
1 0 0 f f t f i 5 16 "2281 2281 23 26 2281" _null_ _null_ _null_ 
gtsquery_consistent _null_ _null_ _null_ ));
  DESCR("GiST tsquery support");
  
+ DATA(insert OID = 3687 (  tssel       PGNSP PGUID 12 1 0 0 f f t f s 4 701 
"2281 26 2281 23" _null_ _null_ _null_ tssel _null_ _null_ _null_ ));
+ DESCR("restriction selectivity of tsvector @@ tsquery");
  DATA(insert OID = 3688 (  ts_typanalyze       PGNSP PGUID 12 1 0 0 f f t f s 
1 16 "2281" _null_ _null_ _null_ ts_typanalyze _null_ _null_ _null_ ));
  DESCR("tsvector typanalyze");
  
diff --git a/src/include/tsearch/ts_type.h b/src/include/tsearch/ts_type.h
index 06ca1f9..6208432 100644
*** a/src/include/tsearch/ts_type.h
--- b/src/include/tsearch/ts_type.h
***************
*** 291,294 ****
--- 291,307 ----
  extern Datum tsq_mcontains(PG_FUNCTION_ARGS);
  extern Datum tsq_mcontained(PG_FUNCTION_ARGS);
  
+ /*
+  * The default text search selectivity is chosen to be smll enough to
+  * encourage indexscans for typical table densities. See selfuncs.h and
+  * DEFAULT_EQ_SEL for details.
+  */
+ #define DEFAULT_TS_SEL 0.005
+ 
+ /*
+  * operator restriction function for tsvector @@ tsquery and
+  * tsquery @@ tsvector
+  */
+ extern Datum tssel(PG_FUNCTION_ARGS);
+ 
  #endif   /* _PG_TSTYPE_H_ */
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to