Here's a WIP patch implementing an oprrest function for tsvector @@
tsquery and tsquery @@ tsvector.
The idea is (quoting a comment)
/*
* Traverse the tsquery preorder, calculating selectivity as:
*
* selec(left_oper) * selec(right_oper) in AND nodes,
*
* selec(left_oper) + selec(right_oper) -
* selec(left_oper) * selec(right_oper) in OR nodes,
*
* 1 - select(oper) in NOT nodes
*
* freq[val] in VAL nodes, if the value is in MCELEM
* min(freq[MCELEM]) / 2 in VAL nodes, if it is not
*
*
* Implementation-wise, we sort the MCELEM array to use binary
* search on it.
*/
The patch still has many rough edges, but it applies to HEAD and passes
tests. I'm posting it mostly to get feedback about whether I'm going in
the right direction.
Cheers,
Jan
--
Jan Urbanski
GPG key ID: E583D7D2
ouden estin
diff --git a/src/backend/tsearch/Makefile b/src/backend/tsearch/Makefile
index e20a4a2..ba728eb 100644
*** a/src/backend/tsearch/Makefile
--- b/src/backend/tsearch/Makefile
***************
*** 19,25 ****
OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \
dict_simple.o dict_synonym.o dict_thesaurus.o \
dict_ispell.o regis.o spell.o \
! to_tsany.o ts_typanalyze.o ts_utils.o
include $(top_srcdir)/src/backend/common.mk
--- 19,25 ----
OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \
dict_simple.o dict_synonym.o dict_thesaurus.o \
dict_ispell.o regis.o spell.o \
! to_tsany.o ts_typanalyze.o ts_selfuncs.o ts_utils.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/tsearch/ts_selfuncs.c
b/src/backend/tsearch/ts_selfuncs.c
new file mode 100644
index 0000000..4b86072
*** /dev/null
--- b/src/backend/tsearch/ts_selfuncs.c
***************
*** 0,-1 ****
--- 1,299 ----
+ /*-------------------------------------------------------------------------
+ *
+ * ts_selfuncs.c
+ * Selectivity functions for text search types.
+ *
+ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+ #include "postgres.h"
+
+ #include "miscadmin.h" /* for check_stack_depth() */
+ #include "utils/memutils.h"
+ #include "utils/builtins.h"
+ #include "utils/syscache.h"
+ #include "utils/lsyscache.h"
+ #include "utils/selfuncs.h"
+ #include "catalog/pg_type.h"
+ #include "catalog/pg_statistic.h"
+ #include "nodes/nodes.h"
+ #include "tsearch/ts_type.h"
+
+ /* lookup table type for binary searching through MCELEMs */
+ typedef struct
+ {
+ Datum element;
+ float4 frequency;
+ } TextFreq;
+
+ static int
+ compare_textfreq(const void *e1, const void *e2);
+ static Selectivity
+ tsquery_opr_selec(QueryItem *item, char *operand, TextFreq *lookup,
+ int length, float4 minfreq);
+ static Selectivity
+ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
+ float4 *numbers, int
nnumbers);
+ static double
+ tsquerysel(VariableStatData *vardata, Datum constval);
+
+
+ /* TSQuery traversal function */
+ static Selectivity
+ tsquery_opr_selec(QueryItem *item, char *operand, TextFreq *lookup,
+ int length, float4 minfreq)
+ {
+ TextFreq key,
+ *searchres;
+ Selectivity s1, s2;
+
+ /* since this function recurses, it could be driven to stack overflow */
+ check_stack_depth();
+
+ if (item->type == QI_VAL)
+ {
+ QueryOperand *oper = (QueryOperand *) item;
+
+ /*
+ * Prepare the key for bsearch(). No need to initialize
key.frequency,
+ * because sorting is only on key.element.
+ */
+ key.element = PointerGetDatum(
+ cstring_to_text_with_len(operand + oper->distance,
oper->length));
+
+ searchres = (TextFreq *) bsearch(&key, lookup, length,
+
sizeof(TextFreq), compare_textfreq);
+ if (searchres)
+ {
+ /*
+ * The element is in MCELEM. Return precise selectivity
(or at
+ * least as precise, as ANALYZE could find out).
+ */
+ return (Selectivity) searchres->frequency;
+ }
+ else
+ {
+ /*
+ * The element is not in MCELEM. Punt, but assure that
the
+ * selectivity cannot be more than minfreq / 2.
+ */
+ return (Selectivity) Min(DEFAULT_TS_SEL, minfreq / 2);
+ }
+ }
+
+ /* Current TSQuery node is an operator */
+ switch (item->operator.oper)
+ {
+ case OP_NOT:
+ return 1.0 - tsquery_opr_selec(item + 1, operand,
lookup,
+
length, minfreq);
+
+ case OP_AND:
+ return
+ tsquery_opr_selec(item + 1, operand, lookup,
length, minfreq) *
+ tsquery_opr_selec(item + item->operator.left,
operand, lookup,
+ length,
minfreq);
+
+ case OP_OR:
+ s1 = tsquery_opr_selec(item + 1, operand, lookup,
length, minfreq);
+ s2 = tsquery_opr_selec(item + item->operator.left,
operand, lookup,
+ length,
minfreq);
+ return s1 + s2 - s1 * s2;
+
+ default:
+ elog(ERROR, "unrecognized operator: %d",
item->operator.oper);
+ }
+
+ /* never reached, keep compiler happy */
+ return (Selectivity) DEFAULT_TS_SEL;
+ }
+
+ /*
+ * Traverse the tsquery preorder, calculating selectivity as:
+ *
+ * selec(left_oper) * selec(right_oper) in AND nodes,
+ *
+ * selec(left_oper) + selec(right_oper) -
+ * selec(left_oper) * selec(right_oper) in OR nodes,
+ *
+ * 1 - select(oper) in NOT nodes
+ *
+ * freq[val] in VAL nodes, if the value is in MCELEM
+ * min(freq[MCELEM]) / 2 in VAL nodes, if it is not
+ *
+ *
+ * Implementation-wise, we sort the MCELEM array to use binary
+ * search on it.
+ */
+ static Selectivity
+ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
+ float4 *numbers, int
nnumbers)
+ {
+ float4 minfreq;
+ TextFreq *lookup;
+ Selectivity selec;
+ MemoryContext opr_context,
+ old_context;
+ int i;
+
+ /* Grab the lowest frequency */
+ minfreq = numbers[nnumbers - 1];
+
+ /*
+ * Compute selectivity in child context, because we will possibly do a
lot
+ * of QueryOperand to text transformations, which palloc() memory
+ */
+ opr_context = AllocSetContextCreate(CurrentMemoryContext,
+
"Text search restriction",
+
ALLOCSET_DEFAULT_MINSIZE,
+
ALLOCSET_DEFAULT_INITSIZE,
+
ALLOCSET_DEFAULT_MAXSIZE);
+ old_context = MemoryContextSwitchTo(opr_context);
+
+
+ /* Construct the array to be qsort()'ed */
+ Assert(nmcelem == nnumbers);
+ lookup = (TextFreq *) palloc(sizeof(TextFreq) * nmcelem);
+ for (i = 0; i < nmcelem; i++)
+ {
+ lookup[i].element = mcelem[i];
+ lookup[i].frequency = numbers[i];
+ }
+
+ qsort(lookup, nmcelem, sizeof(TextFreq), compare_textfreq);
+
+ selec = tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), lookup,
+ nmcelem, minfreq);
+
+ MemoryContextSwitchTo(old_context);
+ MemoryContextDelete(opr_context);
+
+ return selec;
+ }
+
+ static int
+ compare_textfreq(const void *e1, const void *e2)
+ {
+ const TextFreq *t1 = (const TextFreq *) e1;
+ const TextFreq *t2 = (const TextFreq *) e2;
+
+ return DatumGetInt32(DirectFunctionCall2(bttextcmp,
+
t1->element, t2->element));
+ }
+
+ /*
+ * tssel -- Selectivity of "@@"
+ */
+ Datum
+ tssel(PG_FUNCTION_ARGS)
+ {
+ PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
+ /* We skip arg #2, which is the operator OID - it's gonna be "@@" */
+ List *args = (List *) PG_GETARG_POINTER(2);
+ int varRelid = PG_GETARG_INT32(3);
+ VariableStatData vardata;
+ Node *other;
+ bool varonleft;
+ Selectivity selec;
+
+ /*
+ * If expression is not variable op something or something op variable,
+ * then punt and return a default estimate.
+ */
+ if (!get_restriction_variable(root, args, varRelid,
+ &vardata,
&other, &varonleft))
+ {
+ PG_RETURN_FLOAT8(DEFAULT_TS_SEL);
+ }
+
+ /*
+ * Can't do anything useful if the something is not a constant, either.
+ */
+ if (!IsA(other, Const))
+ {
+ ReleaseVariableStats(vardata);
+ PG_RETURN_FLOAT8(DEFAULT_TS_SEL);
+ }
+
+ /* The "@@" operator is strict, so might cope with NULL right away */
+ if (((Const *) other)->constisnull) {
+ ReleaseVariableStats(vardata);
+ PG_RETURN_FLOAT8((float8) 0.0);
+ }
+
+ /*
+ * OK, there's a Var and a Const we're dealing with here. We need the
Var
+ * to be a TSVector (or else we don't have any useful statistic for it).
+ */
+
+ if (vardata.vartype == TSVECTOROID)
+ {
+ /* tsvector @@ tsquery or the other way around */
+ Assert(((Const *) other)->consttype == TSQUERYOID);
+
+ selec = tsquerysel(&vardata, ((Const *) other)->constvalue);
+ }
+ else
+ {
+ /* The Var is something we don't have useful statistic for */
+ selec = (Selectivity) DEFAULT_TS_SEL;
+ }
+
+ ReleaseVariableStats(vardata);
+
+ PG_RETURN_FLOAT8((float8) selec);
+ }
+
+ static Selectivity
+ tsquerysel(VariableStatData *vardata, Datum constval)
+ {
+ Selectivity selec;
+
+ if (HeapTupleIsValid(vardata->statsTuple))
+ {
+ TSQuery query;
+ Form_pg_statistic stats;
+ Datum *values;
+ int nvalues;
+ float4 *numbers;
+ int nnumbers;
+
+ /* The caller made sure the const is a TSQuery, so get it now */
+ query = DatumGetTSQuery(constval);
+
+ stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
+
+ /* MCELEM will be an array of Text elements for a tsvector
column */
+ if (get_attstatsslot(vardata->statsTuple,
+ TEXTOID, -1,
+ STATISTIC_KIND_MCELEM,
InvalidOid,
+ &values, &nvalues,
+ &numbers, &nnumbers))
+ {
+ /*
+ * There is a most-common-elements slot for the
tsvector Var, so
+ * use that.
+ */
+
+ selec = mcelem_tsquery_selec(query, values, nvalues,
+
numbers, nnumbers);
+ free_attstatsslot(TEXTOID, values, nvalues, numbers,
nnumbers);
+ }
+ else
+ {
+ /* No most-common-elements slot */
+ selec = (Selectivity) DEFAULT_TS_SEL;
+ }
+ }
+ else
+ {
+ selec = (Selectivity) DEFAULT_TS_SEL;
+ }
+
+ return selec;
+ }
diff --git a/src/include/catalog/pg_operator.h
b/src/include/catalog/pg_operator.h
index 1c7f37d..451805d 100644
*** a/src/include/catalog/pg_operator.h
--- b/src/include/catalog/pg_operator.h
***************
*** 915,924 ****
DATA(insert OID = 3631 ( ">=" PGNSP PGUID b f f 3614 3614
16 3628 3627 tsvector_ge scalargtsel scalargtjoinsel ));
DATA(insert OID = 3632 ( ">" PGNSP PGUID b f f 3614 3614 16
3627 3628 tsvector_gt scalargtsel scalargtjoinsel ));
DATA(insert OID = 3633 ( "||" PGNSP PGUID b f f 3614 3614
3614 0 0 tsvector_concat - - ));
! DATA(insert OID = 3636 ( "@@" PGNSP PGUID b f f 3614 3615
16 3637 0 ts_match_vq contsel contjoinsel ));
! DATA(insert OID = 3637 ( "@@" PGNSP PGUID b f f 3615 3614
16 3636 0 ts_match_qv contsel contjoinsel ));
! DATA(insert OID = 3660 ( "@@@" PGNSP PGUID b f f 3614 3615 16
3661 0 ts_match_vq contsel contjoinsel ));
! DATA(insert OID = 3661 ( "@@@" PGNSP PGUID b f f 3615 3614 16
3660 0 ts_match_qv contsel contjoinsel ));
DATA(insert OID = 3674 ( "<" PGNSP PGUID b f f 3615 3615 16
3679 3678 tsquery_lt scalarltsel scalarltjoinsel ));
DATA(insert OID = 3675 ( "<=" PGNSP PGUID b f f 3615 3615
16 3678 3679 tsquery_le scalarltsel scalarltjoinsel ));
DATA(insert OID = 3676 ( "=" PGNSP PGUID b t f 3615 3615 16
3676 3677 tsquery_eq eqsel eqjoinsel ));
--- 915,924 ----
DATA(insert OID = 3631 ( ">=" PGNSP PGUID b f f 3614 3614
16 3628 3627 tsvector_ge scalargtsel scalargtjoinsel ));
DATA(insert OID = 3632 ( ">" PGNSP PGUID b f f 3614 3614 16
3627 3628 tsvector_gt scalargtsel scalargtjoinsel ));
DATA(insert OID = 3633 ( "||" PGNSP PGUID b f f 3614 3614
3614 0 0 tsvector_concat - - ));
! DATA(insert OID = 3636 ( "@@" PGNSP PGUID b f f 3614 3615
16 3637 0 ts_match_vq tssel contjoinsel ));
! DATA(insert OID = 3637 ( "@@" PGNSP PGUID b f f 3615 3614
16 3636 0 ts_match_qv tssel contjoinsel ));
! DATA(insert OID = 3660 ( "@@@" PGNSP PGUID b f f 3614 3615 16
3661 0 ts_match_vq tssel contjoinsel ));
! DATA(insert OID = 3661 ( "@@@" PGNSP PGUID b f f 3615 3614 16
3660 0 ts_match_qv tssel contjoinsel ));
DATA(insert OID = 3674 ( "<" PGNSP PGUID b f f 3615 3615 16
3679 3678 tsquery_lt scalarltsel scalarltjoinsel ));
DATA(insert OID = 3675 ( "<=" PGNSP PGUID b f f 3615 3615
16 3678 3679 tsquery_le scalarltsel scalarltjoinsel ));
DATA(insert OID = 3676 ( "=" PGNSP PGUID b t f 3615 3615 16
3676 3677 tsquery_eq eqsel eqjoinsel ));
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index 16ccb55..ef24f40 100644
*** a/src/include/catalog/pg_proc.h
--- b/src/include/catalog/pg_proc.h
***************
*** 4321,4326 ****
--- 4321,4328 ----
DATA(insert OID = 3701 ( gtsquery_consistent PGNSP PGUID 12
1 0 0 f f t f i 5 16 "2281 2281 23 26 2281" _null_ _null_ _null_
gtsquery_consistent _null_ _null_ _null_ ));
DESCR("GiST tsquery support");
+ DATA(insert OID = 3687 ( tssel PGNSP PGUID 12 1 0 0 f f t f s 4 701
"2281 26 2281 23" _null_ _null_ _null_ tssel _null_ _null_ _null_ ));
+ DESCR("restriction selectivity of tsvector @@ tsquery");
DATA(insert OID = 3688 ( ts_typanalyze PGNSP PGUID 12 1 0 0 f f t f s
1 16 "2281" _null_ _null_ _null_ ts_typanalyze _null_ _null_ _null_ ));
DESCR("tsvector typanalyze");
diff --git a/src/include/tsearch/ts_type.h b/src/include/tsearch/ts_type.h
index 06ca1f9..6208432 100644
*** a/src/include/tsearch/ts_type.h
--- b/src/include/tsearch/ts_type.h
***************
*** 291,294 ****
--- 291,307 ----
extern Datum tsq_mcontains(PG_FUNCTION_ARGS);
extern Datum tsq_mcontained(PG_FUNCTION_ARGS);
+ /*
+ * The default text search selectivity is chosen to be smll enough to
+ * encourage indexscans for typical table densities. See selfuncs.h and
+ * DEFAULT_EQ_SEL for details.
+ */
+ #define DEFAULT_TS_SEL 0.005
+
+ /*
+ * operator restriction function for tsvector @@ tsquery and
+ * tsquery @@ tsvector
+ */
+ extern Datum tssel(PG_FUNCTION_ARGS);
+
#endif /* _PG_TSTYPE_H_ */
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers