Re: [SQL] problem (bug?) with "in (subquery)"

Tom Lane Fri, 15 Jul 2005 10:22:05 -0700

Luca Pireddu <[EMAIL PROTECTED]> writes:
> On July 15, 2005 08:58, Tom Lane wrote:
>> Ah-hah: this one is the fault of create_unique_path, which quoth


> In any case, it looks like Tom has already found the problem :-)  Thanks guys!

On closer analysis, the test in create_unique_path is almost but not
quite completely wrong :-(.  Here is the patch against 8.0 branch,
if you need it right away.

                        regards, tom lane


Index: src/backend/optimizer/util/pathnode.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/optimizer/util/pathnode.c,v
retrieving revision 1.111
diff -c -r1.111 pathnode.c
*** src/backend/optimizer/util/pathnode.c       31 Dec 2004 22:00:23 -0000      
1.111
--- src/backend/optimizer/util/pathnode.c       15 Jul 2005 17:03:06 -0000
***************
*** 34,40 ****
  #include "utils/syscache.h"
  
  
! static bool is_distinct_query(Query *query);
  static bool hash_safe_tlist(List *tlist);
  
  
--- 34,41 ----
  #include "utils/syscache.h"
  
  
! static List *translate_sub_tlist(List *tlist, int relid);
! static bool query_is_distinct_for(Query *query, List *colnos);
  static bool hash_safe_tlist(List *tlist);
  
  
***************
*** 642,655 ****
        pathnode->subpath = subpath;
  
        /*
!        * If the input is a subquery whose output must be unique already, we
!        * don't need to do anything.
         */
!       if (rel->rtekind == RTE_SUBQUERY)
        {
                RangeTblEntry *rte = rt_fetch(rel->relid, root->rtable);
  
!               if (is_distinct_query(rte->subquery))
                {
                        pathnode->umethod = UNIQUE_PATH_NOOP;
                        pathnode->rows = rel->rows;
--- 643,683 ----
        pathnode->subpath = subpath;
  
        /*
!        * Try to identify the targetlist that will actually be unique-ified.
!        * In current usage, this routine is only used for sub-selects of IN
!        * clauses, so we should be able to find the tlist in in_info_list.
!        */
!       sub_targetlist = NIL;
!       foreach(l, root->in_info_list)
!       {
!               InClauseInfo *ininfo = (InClauseInfo *) lfirst(l);
! 
!               if (bms_equal(ininfo->righthand, rel->relids))
!               {
!                       sub_targetlist = ininfo->sub_targetlist;
!                       break;
!               }
!       }
! 
!       /*
!        * If the input is a subquery whose output must be unique already,
!        * then we don't need to do anything.  The test for uniqueness has
!        * to consider exactly which columns we are extracting; for example
!        * "SELECT DISTINCT x,y" doesn't guarantee that x alone is distinct.
!        * So we cannot check for this optimization unless we found our own
!        * targetlist above, and it consists only of simple Vars referencing
!        * subquery outputs.  (Possibly we could do something with expressions
!        * in the subquery outputs, too, but for now keep it simple.)
         */
!       if (sub_targetlist && rel->rtekind == RTE_SUBQUERY)
        {
                RangeTblEntry *rte = rt_fetch(rel->relid, root->rtable);
+               List   *sub_tlist_colnos;
  
!               sub_tlist_colnos = translate_sub_tlist(sub_targetlist, 
rel->relid);
! 
!               if (sub_tlist_colnos &&
!                       query_is_distinct_for(rte->subquery, sub_tlist_colnos))
                {
                        pathnode->umethod = UNIQUE_PATH_NOOP;
                        pathnode->rows = rel->rows;
***************
*** 664,686 ****
        }
  
        /*
-        * Try to identify the targetlist that will actually be unique-ified.
-        * In current usage, this routine is only used for sub-selects of IN
-        * clauses, so we should be able to find the tlist in in_info_list.
-        */
-       sub_targetlist = NIL;
-       foreach(l, root->in_info_list)
-       {
-               InClauseInfo *ininfo = (InClauseInfo *) lfirst(l);
- 
-               if (bms_equal(ininfo->righthand, rel->relids))
-               {
-                       sub_targetlist = ininfo->sub_targetlist;
-                       break;
-               }
-       }
- 
-       /*
         * If we know the targetlist, try to estimate number of result rows;
         * otherwise punt.
         */
--- 692,697 ----
***************
*** 755,804 ****
  }
  
  /*
!  * is_distinct_query - does query never return duplicate rows?
   */
! static bool
! is_distinct_query(Query *query)
  {
!       /* DISTINCT (but not DISTINCT ON) guarantees uniqueness */
!       if (has_distinct_clause(query))
!               return true;
  
!       /* UNION, INTERSECT, EXCEPT guarantee uniqueness, except with ALL */
!       if (query->setOperations)
        {
!               SetOperationStmt *topop = (SetOperationStmt *) 
query->setOperations;
  
!               Assert(IsA(topop, SetOperationStmt));
!               Assert(topop->op != SETOP_NONE);
  
!               if (!topop->all)
                        return true;
        }
  
        /*
!        * GROUP BY guarantees uniqueness if all the grouped columns appear in
!        * the output.  In our implementation this means checking they are non
!        * resjunk columns.
         */
        if (query->groupClause)
        {
!               ListCell   *gl;
! 
!               foreach(gl, query->groupClause)
                {
!                       GroupClause *grpcl = (GroupClause *) lfirst(gl);
                        TargetEntry *tle = get_sortgroupclause_tle(grpcl,
                                                                                
                           query->targetList);
  
!                       if (tle->resdom->resjunk)
!                               break;
                }
!               if (!gl)                                /* got to the end? */
                        return true;
        }
  
        /*
         * XXX Are there any other cases in which we can easily see the result
         * must be distinct?
         */
--- 766,888 ----
  }
  
  /*
!  * translate_sub_tlist - get subquery column numbers represented by tlist
!  *
!  * The given targetlist should contain only Vars referencing the given relid.
!  * Extract their varattnos (ie, the column numbers of the subquery) and return
!  * as an integer List.
!  *
!  * If any of the tlist items is not a simple Var, we cannot determine whether
!  * the subquery's uniqueness condition (if any) matches ours, so punt and
!  * return NIL.
   */
! static List *
! translate_sub_tlist(List *tlist, int relid)
  {
!       List       *result = NIL;
!       ListCell   *l;
  
!       foreach(l, tlist)
        {
!               Var        *var = (Var *) lfirst(l);
  
!               if (!var || !IsA(var, Var) ||
!                       var->varno != relid)
!                       return NIL;                     /* punt */
  
!               result = lappend_int(result, var->varattno);
!       }
!       return result;
! }
! 
! /*
!  * query_is_distinct_for - does query never return duplicates of the
!  *            specified columns?
!  *
!  * colnos is an integer list of output column numbers (resno's).  We are
!  * interested in whether rows consisting of just these columns are certain
!  * to be distinct.
!  */
! static bool
! query_is_distinct_for(Query *query, List *colnos)
! {
!       ListCell   *l;
! 
!       /*
!        * DISTINCT (including DISTINCT ON) guarantees uniqueness if all the
!        * columns in the DISTINCT clause appear in colnos.
!        */
!       if (query->distinctClause)
!       {
!               foreach(l, query->distinctClause)
!               {
!                       SortClause *scl = (SortClause *) lfirst(l);
!                       TargetEntry *tle = get_sortgroupclause_tle(scl,
!                                                                               
                           query->targetList);
! 
!                       if (!list_member_int(colnos, tle->resdom->resno))
!                               break;                  /* exit early if no 
match */
!               }
!               if (l == NULL)                  /* had matches for all? */
                        return true;
        }
  
        /*
!        * Similarly, GROUP BY guarantees uniqueness if all the grouped columns
!        * appear in colnos.
         */
        if (query->groupClause)
        {
!               foreach(l, query->groupClause)
                {
!                       GroupClause *grpcl = (GroupClause *) lfirst(l);
                        TargetEntry *tle = get_sortgroupclause_tle(grpcl,
                                                                                
                           query->targetList);
  
!                       if (!list_member_int(colnos, tle->resdom->resno))
!                               break;                  /* exit early if no 
match */
                }
!               if (l == NULL)                  /* had matches for all? */
!                       return true;
!       }
!       else
!       {
!               /*
!                * If we have no GROUP BY, but do have aggregates or HAVING, 
then
!                * the result is at most one row so it's surely unique.
!                */
!               if (query->hasAggs || query->havingQual)
                        return true;
        }
  
        /*
+        * UNION, INTERSECT, EXCEPT guarantee uniqueness of the whole output 
row,
+        * except with ALL
+        */
+       if (query->setOperations)
+       {
+               SetOperationStmt *topop = (SetOperationStmt *) 
query->setOperations;
+ 
+               Assert(IsA(topop, SetOperationStmt));
+               Assert(topop->op != SETOP_NONE);
+ 
+               if (!topop->all)
+               {
+                       /* We're good if all the nonjunk output columns are in 
colnos */
+                       foreach(l, query->targetList)
+                       {
+                               TargetEntry *tle = (TargetEntry *) lfirst(l);
+ 
+                               if (!tle->resdom->resjunk &&
+                                       !list_member_int(colnos, 
tle->resdom->resno))
+                                       break;          /* exit early if no 
match */
+                       }
+                       if (l == NULL)          /* had matches for all? */
+                               return true;
+               }
+       }
+ 
+       /*
         * XXX Are there any other cases in which we can easily see the result
         * must be distinct?
         */
Index: src/test/regress/expected/subselect.out
===================================================================
RCS file: /cvsroot/pgsql/src/test/regress/expected/subselect.out,v
retrieving revision 1.10.4.2
diff -c -r1.10.4.2 subselect.out
*** src/test/regress/expected/subselect.out     1 Feb 2005 23:09:00 -0000       
1.10.4.2
--- src/test/regress/expected/subselect.out     15 Jul 2005 17:03:06 -0000
***************
*** 203,208 ****
--- 203,265 ----
  (1 row)
  
  --
+ -- Test cases to check for overenthusiastic optimization of
+ -- "IN (SELECT DISTINCT ...)" and related cases.  Per example from
+ -- Luca Pireddu and Michael Fuhr.
+ --
+ CREATE TEMP TABLE foo (id integer);
+ CREATE TEMP TABLE bar (id1 integer, id2 integer);
+ INSERT INTO foo VALUES (1);
+ INSERT INTO bar VALUES (1, 1);
+ INSERT INTO bar VALUES (2, 2);
+ INSERT INTO bar VALUES (3, 1);
+ -- These cases require an extra level of distinct-ing above subquery s
+ SELECT * FROM foo WHERE id IN
+     (SELECT id2 FROM (SELECT DISTINCT id1, id2 FROM bar) AS s);
+  id 
+ ----
+   1
+ (1 row)
+ 
+ SELECT * FROM foo WHERE id IN
+     (SELECT id2 FROM (SELECT id1,id2 FROM bar GROUP BY id1,id2) AS s);
+  id 
+ ----
+   1
+ (1 row)
+ 
+ SELECT * FROM foo WHERE id IN
+     (SELECT id2 FROM (SELECT id1, id2 FROM bar UNION
+                       SELECT id1, id2 FROM bar) AS s);
+  id 
+ ----
+   1
+ (1 row)
+ 
+ -- These cases do not
+ SELECT * FROM foo WHERE id IN
+     (SELECT id2 FROM (SELECT DISTINCT ON (id2) id1, id2 FROM bar) AS s);
+  id 
+ ----
+   1
+ (1 row)
+ 
+ SELECT * FROM foo WHERE id IN
+     (SELECT id2 FROM (SELECT id2 FROM bar GROUP BY id2) AS s);
+  id 
+ ----
+   1
+ (1 row)
+ 
+ SELECT * FROM foo WHERE id IN
+     (SELECT id2 FROM (SELECT id2 FROM bar UNION
+                       SELECT id2 FROM bar) AS s);
+  id 
+ ----
+   1
+ (1 row)
+ 
+ --
  -- Test case to catch problems with multiply nested sub-SELECTs not getting
  -- recalculated properly.  Per bug report from Didier Moens.
  --
Index: src/test/regress/sql/subselect.sql
===================================================================
RCS file: /cvsroot/pgsql/src/test/regress/sql/subselect.sql,v
retrieving revision 1.7
diff -c -r1.7 subselect.sql
*** src/test/regress/sql/subselect.sql  4 Oct 2004 14:42:48 -0000       1.7
--- src/test/regress/sql/subselect.sql  15 Jul 2005 17:03:06 -0000
***************
*** 96,101 ****
--- 96,134 ----
     where unique1 IN (select distinct hundred from tenk1 b)) ss;
  
  --
+ -- Test cases to check for overenthusiastic optimization of
+ -- "IN (SELECT DISTINCT ...)" and related cases.  Per example from
+ -- Luca Pireddu and Michael Fuhr.
+ --
+ 
+ CREATE TEMP TABLE foo (id integer);
+ CREATE TEMP TABLE bar (id1 integer, id2 integer);
+ 
+ INSERT INTO foo VALUES (1);
+ 
+ INSERT INTO bar VALUES (1, 1);
+ INSERT INTO bar VALUES (2, 2);
+ INSERT INTO bar VALUES (3, 1);
+ 
+ -- These cases require an extra level of distinct-ing above subquery s
+ SELECT * FROM foo WHERE id IN
+     (SELECT id2 FROM (SELECT DISTINCT id1, id2 FROM bar) AS s);
+ SELECT * FROM foo WHERE id IN
+     (SELECT id2 FROM (SELECT id1,id2 FROM bar GROUP BY id1,id2) AS s);
+ SELECT * FROM foo WHERE id IN
+     (SELECT id2 FROM (SELECT id1, id2 FROM bar UNION
+                       SELECT id1, id2 FROM bar) AS s);
+ 
+ -- These cases do not
+ SELECT * FROM foo WHERE id IN
+     (SELECT id2 FROM (SELECT DISTINCT ON (id2) id1, id2 FROM bar) AS s);
+ SELECT * FROM foo WHERE id IN
+     (SELECT id2 FROM (SELECT id2 FROM bar GROUP BY id2) AS s);
+ SELECT * FROM foo WHERE id IN
+     (SELECT id2 FROM (SELECT id2 FROM bar UNION
+                       SELECT id2 FROM bar) AS s);
+ 
+ --
  -- Test case to catch problems with multiply nested sub-SELECTs not getting
  -- recalculated properly.  Per bug report from Didier Moens.
  --

---------------------------(end of broadcast)---------------------------
TIP 2: Don't 'kill -9' the postmaster

Re: [SQL] problem (bug?) with "in (subquery)"

Reply via email to