On Fri, Jul 21, 2017 at 4:10 AM, Thomas Munro
<thomas.mu...@enterprisedb.com> wrote:
>
> Thanks.  Bearing all that in mind, I ran through a series of test
> scenarios and discovered that my handling for JOIN_ANTI was wrong: I
> thought that I had to deal with inverting the result, but I now see
> that that's handled elsewhere (calc_joinrel_size_estimate() I think).
> So neqjoinsel should just treat JOIN_SEMI and JOIN_ANTI exactly the
> same way.

I agree, esp. after looking at eqjoinsel_semi(), which is used for
both semi and anti joins, it becomes more clear.

>
> That just leaves the question of whether we should try to handle the
> empty RHS and single-value RHS cases using statistics.  My intuition
> is that we shouldn't, but I'll be happy to change my intuition and
> code that up if that is the feedback from planner gurus.

Empty RHS can result from dummy relations also, which are produced by
constraint exclusion, so may be that's an interesting case. Single
value RHS may be interesting with partitioned table with all rows in a
given partition end up with the same partition key value. But may be
those are just different patches. I am not sure.

>
> Please find attached a new version, and a test script I used, which
> shows a bunch of interesting cases.  I'll add this to the commitfest.

I added some "stable" tests to your patch taking inspiration from the
test SQL file. I think those will be stable across machines and runs.
Please let me know if those look good to you.



-- 
Best Wishes,
Ashutosh Bapat
EnterpriseDB Corporation
The Postgres Database Company
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 23e5526..4c1bae6 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -2697,26 +2697,63 @@ neqjoinsel(PG_FUNCTION_ARGS)
 	Oid			eqop;
 	float8		result;
 
-	/*
-	 * We want 1 - eqjoinsel() where the equality operator is the one
-	 * associated with this != operator, that is, its negator.
-	 */
-	eqop = get_negator(operator);
-	if (eqop)
+
+	if (jointype == JOIN_SEMI || jointype == JOIN_ANTI)
 	{
-		result = DatumGetFloat8(DirectFunctionCall5(eqjoinsel,
-													PointerGetDatum(root),
-													ObjectIdGetDatum(eqop),
-													PointerGetDatum(args),
-													Int16GetDatum(jointype),
-													PointerGetDatum(sjinfo)));
+		VariableStatData leftvar;
+		VariableStatData rightvar;
+		double		nullfrac;
+		bool		reversed;
+		HeapTuple	statsTuple;
+
+		get_join_variables(root, args, sjinfo, &leftvar, &rightvar, &reversed);
+		statsTuple = reversed ? rightvar.statsTuple : leftvar.statsTuple;
+		if (HeapTupleIsValid(statsTuple))
+			nullfrac = ((Form_pg_statistic) GETSTRUCT(statsTuple))->stanullfrac;
+		else
+			nullfrac = 0.0;
+		ReleaseVariableStats(leftvar);
+		ReleaseVariableStats(rightvar);
+
+		/*
+		 * For semi-joins, if there is more than one distinct value in the RHS
+		 * relation then every non-null LHS row must find a row to join since
+		 * it can only be equal to one of them.  We'll assume that there is
+		 * always more than one distinct RHS value for the sake of stability,
+		 * though in theory we could have special cases for empty RHS
+		 * (selectivity = 0) and single-distinct-value RHS (selectivity =
+		 * fraction of LHS that has the same value as the single RHS value).
+		 *
+		 * For anti-joins, if we use the same assumption that there is more
+		 * than one distinct key in the RHS relation, then every non-null LHS
+		 * row must be suppressed by the anti-join leaving only nullfrac.
+		 */
+		result = 1.0 - nullfrac;
 	}
 	else
 	{
-		/* Use default selectivity (should we raise an error instead?) */
-		result = DEFAULT_EQ_SEL;
+		/*
+		 * We want 1 - eqjoinsel() where the equality operator is the one
+		 * associated with this != operator, that is, its negator.
+		 */
+		eqop = get_negator(operator);
+		if (eqop)
+		{
+			result = DatumGetFloat8(DirectFunctionCall5(eqjoinsel,
+														PointerGetDatum(root),
+														ObjectIdGetDatum(eqop),
+														PointerGetDatum(args),
+														Int16GetDatum(jointype),
+														PointerGetDatum(sjinfo)));
+		}
+		else
+		{
+			/* Use default selectivity (should we raise an error instead?) */
+			result = DEFAULT_EQ_SEL;
+		}
+		result = 1.0 - result;
 	}
-	result = 1.0 - result;
+
 	PG_RETURN_FLOAT8(result);
 }
 
diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out
index 9f4c88d..10bfb68 100644
--- a/src/test/regress/expected/join.out
+++ b/src/test/regress/expected/join.out
@@ -1845,6 +1845,89 @@ SELECT '' AS "xxx", *
      | 1 | 4 | one | -1
 (1 row)
 
+-- SEMI and ANTI join neq selectivity
+ANALYZE J1_TBL;
+ANALYZE J2_TBL;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF)
+SELECT * FROM J1_TBL t1
+WHERE EXISTS (SELECT * FROM J2_TBL t2 WHERE t1.i <> t2.i);
+                        QUERY PLAN                         
+-----------------------------------------------------------
+ Nested Loop Semi Join (actual rows=9 loops=1)
+   Join Filter: (t1.i <> t2.i)
+   Rows Removed by Join Filter: 19
+   ->  Seq Scan on j1_tbl t1 (actual rows=11 loops=1)
+   ->  Materialize (actual rows=3 loops=11)
+         ->  Seq Scan on j2_tbl t2 (actual rows=9 loops=1)
+(6 rows)
+
+SELECT count(*) FROM J1_TBL t1
+WHERE EXISTS (SELECT * FROM J2_TBL t2 WHERE t1.i <> t2.i);
+ count 
+-------
+     9
+(1 row)
+
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF)
+SELECT * FROM J1_TBL t1
+WHERE EXISTS (SELECT * FROM J2_TBL t2 WHERE t2.i <> t1.i);
+                        QUERY PLAN                         
+-----------------------------------------------------------
+ Nested Loop Semi Join (actual rows=9 loops=1)
+   Join Filter: (t2.i <> t1.i)
+   Rows Removed by Join Filter: 19
+   ->  Seq Scan on j1_tbl t1 (actual rows=11 loops=1)
+   ->  Materialize (actual rows=3 loops=11)
+         ->  Seq Scan on j2_tbl t2 (actual rows=9 loops=1)
+(6 rows)
+
+SELECT count(*) FROM J1_TBL t1
+WHERE EXISTS (SELECT * FROM J2_TBL t2 WHERE t2.i <> t1.i);
+ count 
+-------
+     9
+(1 row)
+
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF)
+SELECT * FROM J1_TBL t1
+WHERE NOT EXISTS (SELECT * FROM J2_TBL t2 WHERE t1.i <> t2.i);
+                        QUERY PLAN                         
+-----------------------------------------------------------
+ Nested Loop Anti Join (actual rows=2 loops=1)
+   Join Filter: (t1.i <> t2.i)
+   Rows Removed by Join Filter: 19
+   ->  Seq Scan on j1_tbl t1 (actual rows=11 loops=1)
+   ->  Materialize (actual rows=3 loops=11)
+         ->  Seq Scan on j2_tbl t2 (actual rows=9 loops=1)
+(6 rows)
+
+SELECT count(*) FROM J1_TBL t1
+WHERE NOT EXISTS (SELECT * FROM J2_TBL t2 WHERE t1.i <> t2.i);
+ count 
+-------
+     2
+(1 row)
+
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF)
+SELECT * FROM J1_TBL t1
+WHERE NOT EXISTS (SELECT * FROM J2_TBL t2 WHERE t2.i <> t1.i);
+                        QUERY PLAN                         
+-----------------------------------------------------------
+ Nested Loop Anti Join (actual rows=2 loops=1)
+   Join Filter: (t2.i <> t1.i)
+   Rows Removed by Join Filter: 19
+   ->  Seq Scan on j1_tbl t1 (actual rows=11 loops=1)
+   ->  Materialize (actual rows=3 loops=11)
+         ->  Seq Scan on j2_tbl t2 (actual rows=9 loops=1)
+(6 rows)
+
+SELECT count(*) FROM J1_TBL t1
+WHERE NOT EXISTS (SELECT * FROM J2_TBL t2 WHERE t2.i <> t1.i);
+ count 
+-------
+     2
+(1 row)
+
 --
 -- More complicated constructs
 --
diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql
index 835d675..e97f57c 100644
--- a/src/test/regress/sql/join.sql
+++ b/src/test/regress/sql/join.sql
@@ -193,6 +193,32 @@ SELECT '' AS "xxx", *
 SELECT '' AS "xxx", *
   FROM J1_TBL LEFT JOIN J2_TBL USING (i) WHERE (i = 1);
 
+-- SEMI and ANTI join neq selectivity
+ANALYZE J1_TBL;
+ANALYZE J2_TBL;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF)
+SELECT * FROM J1_TBL t1
+WHERE EXISTS (SELECT * FROM J2_TBL t2 WHERE t1.i <> t2.i);
+SELECT count(*) FROM J1_TBL t1
+WHERE EXISTS (SELECT * FROM J2_TBL t2 WHERE t1.i <> t2.i);
+
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF)
+SELECT * FROM J1_TBL t1
+WHERE EXISTS (SELECT * FROM J2_TBL t2 WHERE t2.i <> t1.i);
+SELECT count(*) FROM J1_TBL t1
+WHERE EXISTS (SELECT * FROM J2_TBL t2 WHERE t2.i <> t1.i);
+
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF)
+SELECT * FROM J1_TBL t1
+WHERE NOT EXISTS (SELECT * FROM J2_TBL t2 WHERE t1.i <> t2.i);
+SELECT count(*) FROM J1_TBL t1
+WHERE NOT EXISTS (SELECT * FROM J2_TBL t2 WHERE t1.i <> t2.i);
+
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF)
+SELECT * FROM J1_TBL t1
+WHERE NOT EXISTS (SELECT * FROM J2_TBL t2 WHERE t2.i <> t1.i);
+SELECT count(*) FROM J1_TBL t1
+WHERE NOT EXISTS (SELECT * FROM J2_TBL t2 WHERE t2.i <> t1.i);
 
 --
 -- More complicated constructs
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to