Re: Reduce planning time for large NOT IN lists containing NULL

Ilia Evdokimov Tue, 24 Feb 2026 13:14:45 -0800

On 2/24/26 11:29, David Geier wrote:

Using array_contains_nulls() seems fine. In case the IN list doesn't
contain NULL, the function can immediately bail thanks to the
!ARR_HASNULL() check in the beginning.


It only needs to iterate over the NULL-bitmap, if it exists. This is the
case if there's actually a NULL element in the array, or if the array
initially contained NULL and all NULLs got removed subsequently.

If we ever find the latter case to matter we could remove the
NULL-bitmap in array_set_element() / array_set_element_expanded(), when
the last NULL element got removed.

Could you clarify what exactly this additional test meant to verify?

Zsolt's test case creates an array that initially contains NULL. The
NULL element is subsequently replaced by a non-NULL value but
array_set_element_expanded() keeps the NULL-bitmap around. With that,
your ARR_ISNULL() check bails and causes the selectivity estimation to
incorrectly return 0.


Ah, right - thanks for the clarification. I agree.

Regarding the regression test: the suggestion test case is good, butthere is not a straightforward way to expose the estimated row countwithout also showing the costs, and costs are unstable. To avoid that, Ireused the parsing approach already present in stats_ext.sql to extractonly the estimated row count from EXPLAIN.

Since the test table contains exactly 1000 rows and we run ANALYZE, allrows are included in the statistics sample. Therefore the estimate for x<> ALL(array[1, 99, 2]) is deterministically 997 rows, and the teststable and ensures we detect the incorrect early-zero estimate.


Let me know if you'd prefer a different approach. I've attached v4 patch.

--
Best regards,
Ilia Evdokimov,
Tantor Labs LLC,
https://tantorlabs.com/

From a9dd1d1c9990a30173c5210570efccb034e8b5f4 Mon Sep 17 00:00:00 2001
From: Evdokimov Ilia <[email protected]>
Date: Wed, 25 Feb 2026 00:08:26 +0300
Subject: [PATCH v4] Reduce planning time for large NOT IN lists containing 
 NULL

For x <> ALL (...) / x NOT IN (...), the presence of a NULL element
makes the selectivity 0.0.

The planner currently still iterates over all elements and computes
per-element selectivity, even though the final result is known.

Add an early NULL check for constant arrays and immediately return
0.0 under ALL semantics.

This reduces planning time for large NOT IN / <> ALL lists without
changing semantics.
---
 src/backend/utils/adt/selfuncs.c          |  9 +++++
 src/test/regress/expected/expressions.out | 44 +++++++++++++++++++++++
 src/test/regress/sql/expressions.sql      | 41 +++++++++++++++++++++
 3 files changed, 94 insertions(+)

diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 29fec655593..eef3f0375a5 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -2018,6 +2018,11 @@ scalararraysel(PlannerInfo *root,
 		if (arrayisnull)		/* qual can't succeed if null array */
 			return (Selectivity) 0.0;
 		arrayval = DatumGetArrayTypeP(arraydatum);
+
+		/* Selectivity of "WHERE x NOT IN (NULL, ... )" is always 0 */
+		if (!useOr && array_contains_nulls(arrayval))
+			return (Selectivity) 0.0;
+
 		get_typlenbyvalalign(ARR_ELEMTYPE(arrayval),
 							 &elmlen, &elmbyval, &elmalign);
 		deconstruct_array(arrayval,
@@ -2115,6 +2120,10 @@ scalararraysel(PlannerInfo *root,
 			List	   *args;
 			Selectivity s2;
 
+			/* Selectivity of "WHERE x NOT IN (NULL, ... )" is always 0 */
+			if (!useOr && IsA(elem, Const) && ((Const *) elem)->constisnull)
+				return (Selectivity) 0.0;
+
 			/*
 			 * Theoretically, if elem isn't of nominal_element_type we should
 			 * insert a RelabelType, but it seems unlikely that any operator
diff --git a/src/test/regress/expected/expressions.out b/src/test/regress/expected/expressions.out
index 9a3c97b15a3..34f14a5775a 100644
--- a/src/test/regress/expected/expressions.out
+++ b/src/test/regress/expected/expressions.out
@@ -426,3 +426,47 @@ select * from inttest where a not in (0::myint,2::myint,3::myint,4::myint,5::myi
 (0 rows)
 
 rollback;
+-- Test <> ALL when array initially contained NULL but no longer does
+begin;
+create function check_estimated_rows(text) returns table (estimated int)
+language plpgsql as
+$$
+declare
+    ln text;
+    tmp text[];
+    first_row bool := true;
+begin
+    for ln in
+        execute format('explain %s', $1)
+    loop
+        if first_row then
+            first_row := false;
+            tmp := regexp_match(ln, 'rows=(\d*)');
+            return query select tmp[1]::int;
+        end if;
+    end loop;
+end;
+$$;
+create function replace_elem(arr int[], idx int, val int)
+returns int[] AS $$
+begin
+      arr[idx] := val;
+      return arr;
+end;
+$$ language plpgsql immutable;
+create table notin_test as select generate_series(1, 1000) as x;
+analyze notin_test;
+select * from check_estimated_rows('select * from notin_test where x <> all(array[1,99,3])');
+ estimated 
+-----------
+       997
+(1 row)
+
+-- same array, constructed from an array with a NULL
+select * from check_estimated_rows('select * from notin_test where x <> all(replace_elem(array[1,null,3], 2, 99))');
+ estimated 
+-----------
+       997
+(1 row)
+
+rollback;
diff --git a/src/test/regress/sql/expressions.sql b/src/test/regress/sql/expressions.sql
index e02c21f3368..ca94859bbf8 100644
--- a/src/test/regress/sql/expressions.sql
+++ b/src/test/regress/sql/expressions.sql
@@ -209,3 +209,44 @@ select * from inttest where a not in (1::myint,2::myint,3::myint,4::myint,5::myi
 select * from inttest where a not in (0::myint,2::myint,3::myint,4::myint,5::myint, null);
 
 rollback;
+
+-- Test <> ALL when array initially contained NULL but no longer does
+
+begin;
+
+create function check_estimated_rows(text) returns table (estimated int)
+language plpgsql as
+$$
+declare
+    ln text;
+    tmp text[];
+    first_row bool := true;
+begin
+    for ln in
+        execute format('explain %s', $1)
+    loop
+        if first_row then
+            first_row := false;
+            tmp := regexp_match(ln, 'rows=(\d*)');
+            return query select tmp[1]::int;
+        end if;
+    end loop;
+end;
+$$;
+
+create function replace_elem(arr int[], idx int, val int)
+returns int[] AS $$
+begin
+      arr[idx] := val;
+      return arr;
+end;
+$$ language plpgsql immutable;
+
+create table notin_test as select generate_series(1, 1000) as x;
+analyze notin_test;
+
+select * from check_estimated_rows('select * from notin_test where x <> all(array[1,99,3])');
+-- same array, constructed from an array with a NULL
+select * from check_estimated_rows('select * from notin_test where x <> all(replace_elem(array[1,null,3], 2, 99))');
+
+rollback;
\ No newline at end of file
-- 
2.34.1

Re: Reduce planning time for large NOT IN lists containing NULL

Reply via email to