Hi.

I've taken a closer look at the third patch and found some evident issues.
1) While using the tuplestore we get a lot of garbage from tuple conversion, which was not cleared properly. I tried to fix it, but then ran into the second problem.
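
The usual way to contain such per-tuple conversion garbage is a
short-lived memory context that is reset for every tuple. A minimal
sketch of that pattern (temp_cxt stands in for the scan state's
temporary context):

MemoryContext oldcxt;

/* Drop whatever the previous tuple's conversion left behind */
MemoryContextReset(temp_cxt);
oldcxt = MemoryContextSwitchTo(temp_cxt);

/* ... conversion work that may allocate temporary data ... */

MemoryContextSwitchTo(oldcxt);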

2) By the time we receive tuples, the memory has already been allocated in libpq, so it can be too late to do anything about it. In simple examples I can see libpq result sets that are several GB in size (and are likely not limited by anything).

I've used the attached script to create some tables.

select create_partitioned_rel('t1', 128, true, 1);
select create_partitioned_rel('t2', 4, true, 100);

insert into t1 select i, pg_read_binary_file('/some/100mb/file') from generate_series(1,128) i;
insert into t2 select i, pg_read_binary_file('/some/100mb/file') from generate_series(1,128) i;

And with a simple query like

select sum(length(s)) from TABLE;

we can see that the backend can easily consume several GB of RAM.

For now I'm starting to think we need some form of FETCH that stops fetching data based on the accumulated batch size...
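
Roughly, as a sketch only (this is not from the attached patches; conn,
fetch_size and cursor_number are assumed to follow postgres_fdw's
existing conventions, and error handling is simplified):

Size        accumulated = 0;
Size        budget = (Size) work_mem * 1024;

while (accumulated < budget)
{
    char       *sql = psprintf("FETCH %d FROM c%u", fetch_size, cursor_number);
    PGresult   *res = PQexec(conn, sql);

    if (PQresultStatus(res) != PGRES_TUPLES_OK)
        elog(ERROR, "could not fetch data: %s", PQerrorMessage(conn));

    /* PQresultMemorySize() (PostgreSQL 12+) reports the result's footprint */
    accumulated += PQresultMemorySize(res);

    /* ... convert the rows and append them to a tuplestore ... */

    if (PQntuples(res) < fetch_size)
    {
        PQclear(res);
        break;              /* cursor exhausted */
    }
    PQclear(res);
}

A single FETCH can still be arbitrarily large when individual rows are
huge, so bounding that case would probably also need a smaller fetch_size
or libpq's single-row mode (PQsetSingleRowMode()).
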
--
Best regards,
Alexander Pyhalov,
Postgres Professional
SELECT current_database() AS current_database,
  current_setting('port') AS current_port
\gset


CREATE EXTENSION postgres_fdw;
CREATE SERVER loopback1 FOREIGN DATA WRAPPER postgres_fdw
        OPTIONS (dbname :'current_database', port :'current_port',
                 async_capable 'on', fetch_size '1');
CREATE SERVER loopback2 FOREIGN DATA WRAPPER postgres_fdw
        OPTIONS (dbname :'current_database', port :'current_port',
                 async_capable 'on', fetch_size '1');
CREATE SERVER loopback3 FOREIGN DATA WRAPPER postgres_fdw
        OPTIONS (dbname :'current_database', port :'current_port',
                 async_capable 'on', fetch_size '1');
CREATE SERVER loopback4 FOREIGN DATA WRAPPER postgres_fdw
        OPTIONS (dbname :'current_database', port :'current_port',
                 async_capable 'on', fetch_size '1');
CREATE USER MAPPING FOR CURRENT_USER SERVER loopback1;
CREATE USER MAPPING FOR CURRENT_USER SERVER loopback2;
CREATE USER MAPPING FOR CURRENT_USER SERVER loopback3;
CREATE USER MAPPING FOR CURRENT_USER SERVER loopback4;


create or replace function create_partitioned_rel(relname text, nparts int,
    foreign_parts bool, fetch_size int) returns void as $$
declare
    i int;
    part_name text;
    fdw_part_name text;
begin
    execute format('create table %I (i int, s bytea) partition by hash(i)',
                   relname);
    for i in 0..nparts-1 loop
        if foreign_parts then
            part_name := format('%s_%s', relname, i);
            fdw_part_name := format('%s_%s_fdw', relname, i);
            execute format('create foreign table %I partition of %I '
                           'for values with (modulus %s, remainder %s) '
                           'server loopback%s options (table_name %L, fetch_size %L)',
                           fdw_part_name, relname, nparts, i, i % 4 + 1,
                           part_name, fetch_size);
            execute format('create table %I (i int, s bytea)', part_name);
        else
            part_name := format('%s_%s', relname, i);
            execute format('create table %I partition of %I '
                           'for values with (modulus %s, remainder %s)',
                           part_name, relname, nparts, i);
        end if;
    end loop;
end
$$ language plpgsql;

From 7efc88ffb4d90e935ca1e17ac3d4c753a2de4ef0 Mon Sep 17 00:00:00 2001
From: Alexander Pyhalov <[email protected]>
Date: Tue, 16 Dec 2025 18:46:27 +0300
Subject: [PATCH 1/3] Limit batch_size for foreign insert with work_mem

The batch_size option can be set at the foreign server level. It can be
overly optimistic even for a single table with varying tuple lengths. To
prevent excessive memory usage, limit the effective batch size with work_mem.
---
 src/backend/executor/nodeModifyTable.c | 15 ++++++++++++---
 src/include/nodes/execnodes.h          |  1 +
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index 874b71e6608..a28beca63ff 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -943,12 +943,18 @@ ExecInsert(ModifyTableContext *context,
 		if (resultRelInfo->ri_BatchSize > 1)
 		{
 			bool		flushed = false;
+			Size		tuple_len;
+
+			/* Compute length of the current tuple */
+			slot_getallattrs(slot);
+			tuple_len = heap_compute_data_size(slot->tts_tupleDescriptor, slot->tts_values, slot->tts_isnull);
 
 			/*
-			 * When we've reached the desired batch size, perform the
-			 * insertion.
+			 * When we've reached the desired batch size or exceeded work_mem,
+			 * perform the insertion.
 			 */
-			if (resultRelInfo->ri_NumSlots == resultRelInfo->ri_BatchSize)
+			if (resultRelInfo->ri_NumSlots == resultRelInfo->ri_BatchSize ||
+				((tuple_len + resultRelInfo->ri_BatchMemoryUsed > work_mem * 1024) && (resultRelInfo->ri_NumSlots > 0)))
 			{
 				ExecBatchInsert(mtstate, resultRelInfo,
 								resultRelInfo->ri_Slots,
@@ -1019,6 +1025,8 @@ ExecInsert(ModifyTableContext *context,
 								   resultRelInfo));
 
 			resultRelInfo->ri_NumSlots++;
+			/* Batch memory can still exceed work_mem if a single tuple is larger. */
+			resultRelInfo->ri_BatchMemoryUsed += tuple_len;
 
 			MemoryContextSwitchTo(oldContext);
 
@@ -1418,6 +1426,7 @@ ExecBatchInsert(ModifyTableState *mtstate,
 		ExecClearTuple(planSlots[i]);
 	}
 	resultRelInfo->ri_NumSlots = 0;
+	resultRelInfo->ri_BatchMemoryUsed = 0;
 }
 
 /*
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 3968429f991..b82ec938228 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -542,6 +542,7 @@ typedef struct ResultRelInfo
 	int			ri_NumSlots;	/* number of slots in the array */
 	int			ri_NumSlotsInitialized; /* number of initialized slots */
 	int			ri_BatchSize;	/* max slots inserted in a single batch */
+	int			ri_BatchMemoryUsed;	/* memory used by batch */
 	TupleTableSlot **ri_Slots;	/* input tuples for batch insert */
 	TupleTableSlot **ri_PlanSlots;
 
-- 
2.43.0

From f9a2ec48d907c37259fc6658a6a0ea5925143a42 Mon Sep 17 00:00:00 2001
From: Alexander Pyhalov <[email protected]>
Date: Tue, 30 Dec 2025 10:35:22 +0300
Subject: [PATCH 2/3] Fall back to foreign insert when tuple is too big

When a tuple is larger than work_mem, we don't form a batch, but fall
through to a usual foreign insert. This means the executor can form a
batch, insert it, and then finish with a usual insert, so there will be
no pending batch inserts to process. To avoid manipulating possibly long
lists, we don't clear the es_insert_pending_result_relations and
es_insert_pending_modifytables lists, but instead check in
ExecPendingInserts() whether there are any pending tuples.
---
 .../postgres_fdw/expected/postgres_fdw.out    |  11 ++
 contrib/postgres_fdw/sql/postgres_fdw.sql     |   9 ++
 src/backend/executor/nodeModifyTable.c        | 138 ++++++++++--------
 src/include/nodes/execnodes.h                 |   1 +
 4 files changed, 96 insertions(+), 63 deletions(-)

diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out
index 48e3185b227..a8bfec9413c 100644
--- a/contrib/postgres_fdw/expected/postgres_fdw.out
+++ b/contrib/postgres_fdw/expected/postgres_fdw.out
@@ -7640,6 +7640,17 @@ select count(*) from tab_batch_sharded;
     45
 (1 row)
 
+delete from tab_batch_sharded;
+-- test batch insert with large tuples and switching from batch to usual insert
+set work_mem to 64;
+insert into tab_batch_sharded select 3, case when i%4 = 0 then lpad('a',65*1024,'a') else 'test'|| i end from generate_series(1, 100) i;
+select count(*) from tab_batch_sharded;
+ count 
+-------
+   100
+(1 row)
+
+reset work_mem;
 drop table tab_batch_local;
 drop table tab_batch_sharded;
 drop table tab_batch_sharded_p1_remote;
diff --git a/contrib/postgres_fdw/sql/postgres_fdw.sql b/contrib/postgres_fdw/sql/postgres_fdw.sql
index 9a8f9e28135..467943ca32e 100644
--- a/contrib/postgres_fdw/sql/postgres_fdw.sql
+++ b/contrib/postgres_fdw/sql/postgres_fdw.sql
@@ -1936,10 +1936,19 @@ create foreign table tab_batch_sharded_p1 partition of tab_batch_sharded
   server loopback options (table_name 'tab_batch_sharded_p1_remote');
 insert into tab_batch_sharded select * from tab_batch_local;
 select count(*) from tab_batch_sharded;
+delete from tab_batch_sharded;
+-- test batch insert with large tuples and switching from batch to usual insert
+set work_mem to 64;
+insert into tab_batch_sharded select 3, case when i%4 = 0 then lpad('a',65*1024,'a') else 'test'|| i end from generate_series(1, 100) i;
+select count(*) from tab_batch_sharded;
+reset work_mem;
+
 drop table tab_batch_local;
 drop table tab_batch_sharded;
 drop table tab_batch_sharded_p1_remote;
 
+
+
 alter server loopback options (drop batch_size);
 
 -- ===================================================================
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index a28beca63ff..2e6dcdff7bf 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -942,7 +942,6 @@ ExecInsert(ModifyTableContext *context,
 		 */
 		if (resultRelInfo->ri_BatchSize > 1)
 		{
-			bool		flushed = false;
 			Size		tuple_len;
 
 			/* Compute length of the current tuple */
@@ -961,76 +960,82 @@ ExecInsert(ModifyTableContext *context,
 								resultRelInfo->ri_PlanSlots,
 								resultRelInfo->ri_NumSlots,
 								estate, canSetTag);
-				flushed = true;
 			}
 
-			oldContext = MemoryContextSwitchTo(estate->es_query_cxt);
-
-			if (resultRelInfo->ri_Slots == NULL)
+			if (tuple_len < work_mem * 1024)
 			{
-				resultRelInfo->ri_Slots = palloc_array(TupleTableSlot *, resultRelInfo->ri_BatchSize);
-				resultRelInfo->ri_PlanSlots = palloc_array(TupleTableSlot *, resultRelInfo->ri_BatchSize);
-			}
+				oldContext = MemoryContextSwitchTo(estate->es_query_cxt);
 
-			/*
-			 * Initialize the batch slots. We don't know how many slots will
-			 * be needed, so we initialize them as the batch grows, and we
-			 * keep them across batches. To mitigate an inefficiency in how
-			 * resource owner handles objects with many references (as with
-			 * many slots all referencing the same tuple descriptor) we copy
-			 * the appropriate tuple descriptor for each slot.
-			 */
-			if (resultRelInfo->ri_NumSlots >= resultRelInfo->ri_NumSlotsInitialized)
-			{
-				TupleDesc	tdesc = CreateTupleDescCopy(slot->tts_tupleDescriptor);
-				TupleDesc	plan_tdesc =
-					CreateTupleDescCopy(planSlot->tts_tupleDescriptor);
+				if (resultRelInfo->ri_Slots == NULL)
+				{
+					resultRelInfo->ri_Slots = palloc_array(TupleTableSlot *, resultRelInfo->ri_BatchSize);
+					resultRelInfo->ri_PlanSlots = palloc_array(TupleTableSlot *, resultRelInfo->ri_BatchSize);
+				}
 
-				resultRelInfo->ri_Slots[resultRelInfo->ri_NumSlots] =
-					MakeSingleTupleTableSlot(tdesc, slot->tts_ops);
+				/*
+				 * Initialize the batch slots. We don't know how many slots
+				 * will be needed, so we initialize them as the batch grows,
+				 * and we keep them across batches. To mitigate an
+				 * inefficiency in how resource owner handles objects with
+				 * many references (as with many slots all referencing the
+				 * same tuple descriptor) we copy the appropriate tuple
+				 * descriptor for each slot.
+				 */
+				if (resultRelInfo->ri_NumSlots >= resultRelInfo->ri_NumSlotsInitialized)
+				{
+					TupleDesc	tdesc = CreateTupleDescCopy(slot->tts_tupleDescriptor);
+					TupleDesc	plan_tdesc =
+						CreateTupleDescCopy(planSlot->tts_tupleDescriptor);
 
-				resultRelInfo->ri_PlanSlots[resultRelInfo->ri_NumSlots] =
-					MakeSingleTupleTableSlot(plan_tdesc, planSlot->tts_ops);
+					resultRelInfo->ri_Slots[resultRelInfo->ri_NumSlots] =
+						MakeSingleTupleTableSlot(tdesc, slot->tts_ops);
 
-				/* remember how many batch slots we initialized */
-				resultRelInfo->ri_NumSlotsInitialized++;
-			}
+					resultRelInfo->ri_PlanSlots[resultRelInfo->ri_NumSlots] =
+						MakeSingleTupleTableSlot(plan_tdesc, planSlot->tts_ops);
 
-			ExecCopySlot(resultRelInfo->ri_Slots[resultRelInfo->ri_NumSlots],
-						 slot);
+					/* remember how many batch slots we initialized */
+					resultRelInfo->ri_NumSlotsInitialized++;
+				}
 
-			ExecCopySlot(resultRelInfo->ri_PlanSlots[resultRelInfo->ri_NumSlots],
-						 planSlot);
+				ExecCopySlot(resultRelInfo->ri_Slots[resultRelInfo->ri_NumSlots],
+							 slot);
 
-			/*
-			 * If these are the first tuples stored in the buffers, add the
-			 * target rel and the mtstate to the
-			 * es_insert_pending_result_relations and
-			 * es_insert_pending_modifytables lists respectively, except in
-			 * the case where flushing was done above, in which case they
-			 * would already have been added to the lists, so no need to do
-			 * this.
-			 */
-			if (resultRelInfo->ri_NumSlots == 0 && !flushed)
-			{
-				Assert(!list_member_ptr(estate->es_insert_pending_result_relations,
-										resultRelInfo));
-				estate->es_insert_pending_result_relations =
-					lappend(estate->es_insert_pending_result_relations,
-							resultRelInfo);
-				estate->es_insert_pending_modifytables =
-					lappend(estate->es_insert_pending_modifytables, mtstate);
-			}
-			Assert(list_member_ptr(estate->es_insert_pending_result_relations,
-								   resultRelInfo));
+				ExecCopySlot(resultRelInfo->ri_PlanSlots[resultRelInfo->ri_NumSlots],
+							 planSlot);
 
-			resultRelInfo->ri_NumSlots++;
-			/* Batch size can grow up to tuple length even if it exceeds work_mem. */
-			resultRelInfo->ri_BatchMemoryUsed += tuple_len;
+				/*
+				 * If these are the first tuples stored in the buffers, add
+				 * the target rel and the mtstate to the
+				 * es_insert_pending_result_relations and
+				 * es_insert_pending_modifytables lists respectively, except
+				 * in the case where flushing was done above, in which case
+				 * they would already have been added to the lists, so no need
+				 * to do this.
+				 */
+				if (resultRelInfo->ri_NumSlots == 0 && !resultRelInfo->ri_ExecutorPendingStateModified)
+				{
+					Assert(!list_member_ptr(estate->es_insert_pending_result_relations,
+											resultRelInfo));
+					estate->es_insert_pending_result_relations =
+						lappend(estate->es_insert_pending_result_relations,
+								resultRelInfo);
+					estate->es_insert_pending_modifytables =
+						lappend(estate->es_insert_pending_modifytables, mtstate);
+					resultRelInfo->ri_ExecutorPendingStateModified = true;
+				}
+				Assert(list_member_ptr(estate->es_insert_pending_result_relations,
+									   resultRelInfo));
 
-			MemoryContextSwitchTo(oldContext);
+				resultRelInfo->ri_NumSlots++;
+				resultRelInfo->ri_BatchMemoryUsed += tuple_len;
+				/* We've already flushed the batch if it was too big. */
+				Assert(resultRelInfo->ri_BatchMemoryUsed < work_mem * 1024);
 
-			return NULL;
+				MemoryContextSwitchTo(oldContext);
+
+				return NULL;
+			}
+			/* else do usual foreign insert */
 		}
 
 		/*
@@ -1445,11 +1450,18 @@ ExecPendingInserts(EState *estate)
 		ModifyTableState *mtstate = (ModifyTableState *) lfirst(l2);
 
 		Assert(mtstate);
-		ExecBatchInsert(mtstate, resultRelInfo,
-						resultRelInfo->ri_Slots,
-						resultRelInfo->ri_PlanSlots,
-						resultRelInfo->ri_NumSlots,
-						estate, mtstate->canSetTag);
+
+		/*
+		 * The batch insert could have switched to a non-batched insert,
+		 * in which case we may have no filled slots.
+		 */
+		if (resultRelInfo->ri_NumSlots > 0)
+			ExecBatchInsert(mtstate, resultRelInfo,
+							resultRelInfo->ri_Slots,
+							resultRelInfo->ri_PlanSlots,
+							resultRelInfo->ri_NumSlots,
+							estate, mtstate->canSetTag);
+		resultRelInfo->ri_ExecutorPendingStateModified = false;
 	}
 
 	list_free(estate->es_insert_pending_result_relations);
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index b82ec938228..dcc23774a4c 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -543,6 +543,7 @@ typedef struct ResultRelInfo
 	int			ri_NumSlotsInitialized; /* number of initialized slots */
 	int			ri_BatchSize;	/* max slots inserted in a single batch */
 	int			ri_BatchMemoryUsed;	/* memory used by batch */
+	bool			ri_ExecutorPendingStateModified; /* true if member of estate->es_insert_pending_result_relations */
 	TupleTableSlot **ri_Slots;	/* input tuples for batch insert */
 	TupleTableSlot **ri_PlanSlots;
 
-- 
2.43.0

From 1d89167cbb6b13bd3e73348fd6d8e8cd33c180ea Mon Sep 17 00:00:00 2001
From: Alexander Pyhalov <[email protected]>
Date: Thu, 18 Dec 2025 09:17:42 +0300
Subject: [PATCH 3/3] WIP: Use a tuplestore in PgFdwScanState to limit memory
 usage

A tuplestore doesn't preserve ctids, so we need to store them
separately.
---
 contrib/postgres_fdw/postgres_fdw.c | 82 ++++++++++++++++++++++++++---
 1 file changed, 74 insertions(+), 8 deletions(-)

diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c
index 5e178c21b39..f160f1a3f62 100644
--- a/contrib/postgres_fdw/postgres_fdw.c
+++ b/contrib/postgres_fdw/postgres_fdw.c
@@ -158,8 +158,12 @@ typedef struct PgFdwScanState
 	const char **param_values;	/* textual values of query parameters */
 
 	/* for storing result tuples */
-	HeapTuple  *tuples;			/* array of currently-retrieved tuples */
-	int			num_tuples;		/* # of tuples in array */
+	Tuplestorestate *result_store;	/* currently-retrieved tuples */
+	ItemPointerData *ctids;		/* separate store for ctids */
+
+	TupleTableSlot *tupstore_slot;	/* slot needed for retrieving tuples from
+									 * tuplestore */
+	int			num_tuples;		/* # of tuples in tuple store */
 	int			next_tuple;		/* index of next one to return */
 
 	/* batch-level state, for optimizing rewinds and avoiding useless fetch */
@@ -546,6 +550,8 @@ static void merge_fdw_options(PgFdwRelationInfo *fpinfo,
 							  const PgFdwRelationInfo *fpinfo_i);
 static int	get_batch_size_option(Relation rel);
 
+static void clean_scan_state_result_store(PgFdwScanState *fsstate);
+
 
 /*
  * Foreign-data wrapper handler function: return a struct with pointers
@@ -1605,6 +1611,8 @@ postgresIterateForeignScan(ForeignScanState *node)
 {
 	PgFdwScanState *fsstate = (PgFdwScanState *) node->fdw_state;
 	TupleTableSlot *slot = node->ss.ss_ScanTupleSlot;
+	HeapTuple	newtup;
+	MemoryContext oldcontext;
 
 	/*
 	 * In sync mode, if this is the first call after Begin or ReScan, we need
@@ -1631,13 +1639,37 @@ postgresIterateForeignScan(ForeignScanState *node)
 			return ExecClearTuple(slot);
 	}
 
+	MemoryContextReset(fsstate->temp_cxt);
+	oldcontext = MemoryContextSwitchTo(fsstate->temp_cxt);
+	if (!tuplestore_gettupleslot(fsstate->result_store, true, false,
+								 fsstate->tupstore_slot))
+	{
+		/*
+		 * We've checked that next_tuple is less than fsstate->num_tuples, so
+		 * there should be some result
+		 */
+		MemoryContextSwitchTo(oldcontext);
+		ereport(ERROR,
+				(errcode(ERRCODE_INTERNAL_ERROR),
+				 errmsg("unexpected end of store")));
+
+	}
+
+	newtup = ExecFetchSlotHeapTuple(fsstate->tupstore_slot, true, NULL);
+	/* Tuple store doesn't preserve ctid, so restore it separately */
+	newtup->t_self = newtup->t_data->t_ctid = fsstate->ctids[fsstate->next_tuple];
+	ExecClearTuple(fsstate->tupstore_slot);
+
 	/*
 	 * Return the next tuple.
 	 */
-	ExecStoreHeapTuple(fsstate->tuples[fsstate->next_tuple++],
+	ExecStoreHeapTuple(newtup,
 					   slot,
 					   false);
 
+	fsstate->next_tuple++;
+	MemoryContextSwitchTo(oldcontext);
+
 	return slot;
 }
 
@@ -1699,6 +1731,8 @@ postgresReScanForeignScan(ForeignScanState *node)
 	{
 		/* Easy: just rescan what we already have in memory, if anything */
 		fsstate->next_tuple = 0;
+		if (fsstate->result_store)
+			tuplestore_rescan(fsstate->result_store);
 		return;
 	}
 
@@ -1708,7 +1742,7 @@ postgresReScanForeignScan(ForeignScanState *node)
 	PQclear(res);
 
 	/* Now force a fresh FETCH. */
-	fsstate->tuples = NULL;
+	clean_scan_state_result_store(fsstate);
 	fsstate->num_tuples = 0;
 	fsstate->next_tuple = 0;
 	fsstate->fetch_ct_2 = 0;
@@ -1737,6 +1771,8 @@ postgresEndForeignScan(ForeignScanState *node)
 	ReleaseConnection(fsstate->conn);
 	fsstate->conn = NULL;
 
+	clean_scan_state_result_store(fsstate);
+
 	/* MemoryContexts will be deleted automatically. */
 }
 
@@ -3781,7 +3817,8 @@ create_cursor(ForeignScanState *node)
 
 	/* Mark the cursor as created, and show no tuples have been retrieved */
 	fsstate->cursor_exists = true;
-	fsstate->tuples = NULL;
+	clean_scan_state_result_store(fsstate);
+
 	fsstate->num_tuples = 0;
 	fsstate->next_tuple = 0;
 	fsstate->fetch_ct_2 = 0;
@@ -3808,7 +3845,9 @@ fetch_more_data(ForeignScanState *node)
 	 * We'll store the tuples in the batch_cxt.  First, flush the previous
 	 * batch.
 	 */
-	fsstate->tuples = NULL;
+	clean_scan_state_result_store(fsstate);
+
+	MemoryContextReset(fsstate->temp_cxt);
 	MemoryContextReset(fsstate->batch_cxt);
 	oldcontext = MemoryContextSwitchTo(fsstate->batch_cxt);
 
@@ -3844,21 +3883,29 @@ fetch_more_data(ForeignScanState *node)
 
 	/* Convert the data into HeapTuples */
 	numrows = PQntuples(res);
-	fsstate->tuples = (HeapTuple *) palloc0(numrows * sizeof(HeapTuple));
+	fsstate->result_store = tuplestore_begin_heap(false, false, work_mem);
+	fsstate->tupstore_slot = MakeSingleTupleTableSlot(fsstate->tupdesc, &TTSOpsMinimalTuple);
 	fsstate->num_tuples = numrows;
 	fsstate->next_tuple = 0;
+	fsstate->ctids = palloc0(numrows * sizeof(ItemPointerData));
 
 	for (i = 0; i < numrows; i++)
 	{
+		HeapTuple	newtup;
+
 		Assert(IsA(node->ss.ps.plan, ForeignScan));
 
-		fsstate->tuples[i] =
+		newtup =
 			make_tuple_from_result_row(res, i,
 									   fsstate->rel,
 									   fsstate->attinmeta,
 									   fsstate->retrieved_attrs,
 									   node,
 									   fsstate->temp_cxt);
+
+		tuplestore_puttuple(fsstate->result_store, newtup);
+		ItemPointerCopy(&newtup->t_self, &fsstate->ctids[i]);
+		heap_freetuple(newtup);
 	}
 
 	/* Update fetch_ct_2 */
@@ -7636,6 +7683,25 @@ make_tuple_from_result_row(PGresult *res,
 	return tuple;
 }
 
+/*
+ * Clean PgFdwScanState result store
+ */
+static void
+clean_scan_state_result_store(PgFdwScanState *fsstate)
+{
+	if (fsstate->result_store)
+	{
+		/*
+		 * We partially rely on the fact that batch_cxt is also reset
+		 */
+		tuplestore_end(fsstate->result_store);
+		ExecDropSingleTupleTableSlot(fsstate->tupstore_slot);
+		fsstate->result_store = NULL;
+		fsstate->tupstore_slot = NULL;
+		fsstate->ctids = NULL;
+	}
+}
+
 /*
  * Callback function which is called when error occurs during column value
  * conversion.  Print names of column and relation.
-- 
2.43.0
