Attached is a patch that adds the ability to dump table data in multiple chunks.

Looking for feedback at this point:
 1) what have I missed?
 2) should I implement something to avoid single-page chunks?
    (a sketch of one possible approach is included right below)
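
Just to make 2) concrete, below is a rough standalone sketch of one
possible approach (not from the patch; the 10% threshold and the
function name are made up, only the "last chunk is open-ended"
behaviour is taken from it): if less than a small fraction of a chunk
would be left over after a chunk, fold the remainder into that chunk
instead of emitting it separately.

#include <stdio.h>
#include <stdint.h>

typedef uint32_t BlockNumber;

static void
plan_chunks(BlockNumber relpages, BlockNumber chunk_pages)
{
	BlockNumber start = 0;

	while (start < relpages)
	{
		BlockNumber end = start + chunk_pages - 1;
		BlockNumber remaining = relpages - start;

		/*
		 * Hysteresis: if this is the last chunk anyway, or less than
		 * ~10% of a chunk would be left over after it, make it
		 * open-ended so we never emit a tiny (possibly single-page)
		 * trailing chunk.
		 */
		if (remaining <= chunk_pages ||
			remaining - chunk_pages < chunk_pages / 10 + 1)
			end = UINT32_MAX;	/* "all the rest", as in the patch */

		printf("chunk pages [%u:%u]\n", start, end);

		if (end == UINT32_MAX)
			break;
		start = end + 1;
	}
}

int
main(void)
{
	plan_chunks(21, 10);	/* 0:9, then 10:<open-ended> (remainder of 1 folded in) */
	plan_chunks(30, 10);	/* 0:9, 10:19, then 20:<open-ended> */
	return 0;
}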

The new flag --huge-table-chunk-pages tells the directory-format dump
to dump each table whose main fork has more pages than this in
multiple chunks of the given number of pages.
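
Each chunk is copied out with a ctid-range query; for example, with
--huge-table-chunk-pages=2 the second chunk of table t2 in the
transcript at the end of this mail is produced by

COPY (SELECT id, data FROM public.t2  WHERE  ctid BETWEEN '(2,1)' AND '(3,32000)') TO stdout;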

The main use case is speeding up parallel dumps of databases with one
or a small number of huge tables, so that parts of these tables can
be dumped in parallel.

It will also help when the target file system has limits on file size
(4 GB for FAT, 5 TB for GCS).

Currently no tests are included in the patch, and there is no extra
documentation beyond what is printed by pg_dump --help.  Also, the
pg_log_warning lines containing "CHUNKING" are there for debugging
only and need to be removed before committing.

As implemented, no changes are needed in pg_restore, as all chunks
are already associated with their table in the .toc and are thus
restored into that table.
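
So restoring, including a parallel restore, should just work; the
intended usage is something along these lines (the target database
name is made up here):

createdb dumptest_restored
pg_restore -j 4 -d dumptest_restored ../dumptest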

The attached README shows how I verified that it works, and the
textual file created from the directory-format dump in the last step
there is attached as dump.sql.

--
Hannu
From 015cc46de277971d97c3b1823a5777fccb56c270 Mon Sep 17 00:00:00 2001
From: Hannu Krosing <[email protected]>
Date: Tue, 11 Nov 2025 16:11:08 +0100
Subject: [PATCH] adds ability to dump data for tables in multiple chunks
 controlled by flag --huge-table-chunk-pages

---
 src/bin/pg_dump/pg_backup.h          |   1 +
 src/bin/pg_dump/pg_backup_archiver.c |   1 +
 src/bin/pg_dump/pg_dump.c            | 157 +++++++++++++++++++++------
 src/bin/pg_dump/pg_dump.h            |   7 ++
 4 files changed, 130 insertions(+), 36 deletions(-)

diff --git a/src/bin/pg_dump/pg_backup.h b/src/bin/pg_dump/pg_backup.h
index d9041dad720..b71caed8b83 100644
--- a/src/bin/pg_dump/pg_backup.h
+++ b/src/bin/pg_dump/pg_backup.h
@@ -178,6 +178,7 @@ typedef struct _dumpOptions
 	bool		aclsSkip;
 	const char *lockWaitTimeout;
 	int			dump_inserts;	/* 0 = COPY, otherwise rows per INSERT */
+	int			huge_table_chunk_pages; /* chunk when relpages is above this */
 
 	/* flags for various command-line long options */
 	int			disable_dollar_quoting;
diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c
index 59eaecb4ed7..d555e365ea5 100644
--- a/src/bin/pg_dump/pg_backup_archiver.c
+++ b/src/bin/pg_dump/pg_backup_archiver.c
@@ -154,6 +154,7 @@ InitDumpOptions(DumpOptions *opts)
 	opts->dumpSchema = true;
 	opts->dumpData = true;
 	opts->dumpStatistics = false;
+	opts->huge_table_chunk_pages = UINT32_MAX; /* disable chunking by default */
 }
 
 /*
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index a00918bacb4..e9ccc8e43ed 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -538,6 +538,7 @@ main(int argc, char **argv)
 		{"exclude-extension", required_argument, NULL, 17},
 		{"sequence-data", no_argument, &dopt.sequence_data, 1},
 		{"restrict-key", required_argument, NULL, 25},
+		{"huge-table-chunk-pages", required_argument, NULL, 26},
 
 		{NULL, 0, NULL, 0}
 	};
@@ -802,6 +803,13 @@ main(int argc, char **argv)
 				dopt.restrict_key = pg_strdup(optarg);
 				break;
 
+			case 26:			/* huge table chunk pages */
+				if (!option_parse_int(optarg, "--huge-table-chunk-pages", 1, INT32_MAX,
+									  &dopt.huge_table_chunk_pages))
+					exit_nicely(1);
+				pg_log_warning("CHUNKING: set dopt.huge_table_chunk_pages to [%u]",(BlockNumber) dopt.huge_table_chunk_pages);
+				break;
+
 			default:
 				/* getopt_long already emitted a complaint */
 				pg_log_error_hint("Try \"%s --help\" for more information.", progname);
@@ -1357,6 +1365,9 @@ help(const char *progname)
 	printf(_("  --extra-float-digits=NUM     override default setting for extra_float_digits\n"));
 	printf(_("  --filter=FILENAME            include or exclude objects and data from dump\n"
 			 "                               based on expressions in FILENAME\n"));
+	printf(_("  --huge-table-chunk-pages=NUMPAGES\n"
+			 "                               number of main-fork pages above which table data is\n"
+			 "                               copied out in chunks; also determines the chunk size\n"));
 	printf(_("  --if-exists                  use IF EXISTS when dropping objects\n"));
 	printf(_("  --include-foreign-data=PATTERN\n"
 			 "                               include data of foreign tables on foreign\n"
@@ -2397,7 +2408,7 @@ dumpTableData_copy(Archive *fout, const void *dcontext)
 	 * a filter condition was specified.  For other cases a simple COPY
 	 * suffices.
 	 */
-	if (tdinfo->filtercond || tbinfo->relkind == RELKIND_FOREIGN_TABLE)
+	if (tdinfo->filtercond || tdinfo->chunking || tbinfo->relkind == RELKIND_FOREIGN_TABLE)
 	{
 		/* Temporary allows to access to foreign tables to dump data */
 		if (tbinfo->relkind == RELKIND_FOREIGN_TABLE)
@@ -2413,9 +2424,18 @@ dumpTableData_copy(Archive *fout, const void *dcontext)
 		else
 			appendPQExpBufferStr(q, "* ");
 
-		appendPQExpBuffer(q, "FROM %s %s) TO stdout;",
+		appendPQExpBuffer(q, "FROM %s %s",
 						  fmtQualifiedDumpable(tbinfo),
 						  tdinfo->filtercond ? tdinfo->filtercond : "");
+		if (tdinfo->chunking)
+		{
+			appendPQExpBuffer(q, "%s ctid BETWEEN '(%u,1)' AND '(%u,32000)'", /* there is no (*,0) tuple */
+								 tdinfo->filtercond ? " AND " : " WHERE ",
+								 tdinfo->startPage, tdinfo->endPage);
+			pg_log_warning("CHUNKING: pages [%u:%u]",tdinfo->startPage, tdinfo->endPage);
+		}
+		
+		appendPQExpBuffer(q, ") TO stdout;");
 	}
 	else
 	{
@@ -2423,6 +2443,9 @@ dumpTableData_copy(Archive *fout, const void *dcontext)
 						  fmtQualifiedDumpable(tbinfo),
 						  column_list);
 	}
+
+	pg_log_warning("CHUNKING: data query: %s", q->data);
+	
 	res = ExecuteSqlQuery(fout, q->data, PGRES_COPY_OUT);
 	PQclear(res);
 	destroyPQExpBuffer(clistBuf);
@@ -2918,42 +2941,101 @@ dumpTableData(Archive *fout, const TableDataInfo *tdinfo)
 	{
 		TocEntry   *te;
 
-		te = ArchiveEntry(fout, tdinfo->dobj.catId, tdinfo->dobj.dumpId,
-						  ARCHIVE_OPTS(.tag = tbinfo->dobj.name,
-									   .namespace = tbinfo->dobj.namespace->dobj.name,
-									   .owner = tbinfo->rolname,
-									   .description = "TABLE DATA",
-									   .section = SECTION_DATA,
-									   .createStmt = tdDefn,
-									   .copyStmt = copyStmt,
-									   .deps = &(tbinfo->dobj.dumpId),
-									   .nDeps = 1,
-									   .dumpFn = dumpFn,
-									   .dumpArg = tdinfo));
-
-		/*
-		 * Set the TocEntry's dataLength in case we are doing a parallel dump
-		 * and want to order dump jobs by table size.  We choose to measure
-		 * dataLength in table pages (including TOAST pages) during dump, so
-		 * no scaling is needed.
-		 *
-		 * However, relpages is declared as "integer" in pg_class, and hence
-		 * also in TableInfo, but it's really BlockNumber a/k/a unsigned int.
-		 * Cast so that we get the right interpretation of table sizes
-		 * exceeding INT_MAX pages.
+		/* Chunking works off relpages, which may be slightly off
+		 * but is the best we have without doing our own page count.
+		 * It should be enough for the typical use case of huge tables,
+		 * which should have their relpages updated by autovacuum.
+		 *
+		 * We should likely have a slight hysteresis here to avoid
+		 * tiny chunks when relpages is close to the threshold.
+		 */
-		te->dataLength = (BlockNumber) tbinfo->relpages;
-		te->dataLength += (BlockNumber) tbinfo->toastpages;
+		if ((BlockNumber) tbinfo->relpages < dopt->huge_table_chunk_pages) /* TODO: add hysteresis here, maybe < 1.1 * huge_table_chunk_pages */
+		{
+			pg_log_warning("CHUNKING: toc for simple relpages [%u]",(BlockNumber) tbinfo->relpages);
+
+			te = ArchiveEntry(fout, tdinfo->dobj.catId, tdinfo->dobj.dumpId,
+							ARCHIVE_OPTS(.tag = tbinfo->dobj.name,
+										.namespace = tbinfo->dobj.namespace->dobj.name,
+										.owner = tbinfo->rolname,
+										.description = "TABLE DATA",
+										.section = SECTION_DATA,
+										.createStmt = tdDefn,
+										.copyStmt = copyStmt,
+										.deps = &(tbinfo->dobj.dumpId),
+										.nDeps = 1,
+										.dumpFn = dumpFn,
+										.dumpArg = tdinfo));
 
-		/*
-		 * If pgoff_t is only 32 bits wide, the above refinement is useless,
-		 * and instead we'd better worry about integer overflow.  Clamp to
-		 * INT_MAX if the correct result exceeds that.
-		 */
-		if (sizeof(te->dataLength) == 4 &&
-			(tbinfo->relpages < 0 || tbinfo->toastpages < 0 ||
-			 te->dataLength < 0))
-			te->dataLength = INT_MAX;
+			/*
+			 * Set the TocEntry's dataLength in case we are doing a parallel dump
+			 * and want to order dump jobs by table size.  We choose to measure
+			 * dataLength in table pages (including TOAST pages) during dump, so
+			 * no scaling is needed.
+			 *
+			 * However, relpages is declared as "integer" in pg_class, and hence
+			 * also in TableInfo, but it's really BlockNumber a/k/a unsigned int.
+			 * Cast so that we get the right interpretation of table sizes
+			 * exceeding INT_MAX pages.
+			 */
+			te->dataLength = (BlockNumber) tbinfo->relpages;
+			te->dataLength += (BlockNumber) tbinfo->toastpages;
+
+			/*
+			 * If pgoff_t is only 32 bits wide, the above refinement is useless,
+			 * and instead we'd better worry about integer overflow.  Clamp to
+			 * INT_MAX if the correct result exceeds that.
+			 */
+			if (sizeof(te->dataLength) == 4 &&
+				(tbinfo->relpages < 0 || tbinfo->toastpages < 0 ||
+				te->dataLength < 0))
+				te->dataLength = INT_MAX;
+		}
+		else
+		{
+			BlockNumber current_chunk_start = 0;
+			PQExpBuffer chunk_desc = createPQExpBuffer();
+			
+			pg_log_warning("CHUNKING: toc for chunked relpages [%u]",(BlockNumber) tbinfo->relpages);
+
+			while (current_chunk_start < (BlockNumber) tbinfo->relpages)/* TODO: add hysteresis here, maybe < 1.1 * huge_table_chunk_pages */
+			{
+				TableDataInfo *chunk_tdinfo = (TableDataInfo *) pg_malloc(sizeof(TableDataInfo));
+
+				memcpy(chunk_tdinfo, tdinfo, sizeof(TableDataInfo));
+				AssignDumpId(&chunk_tdinfo->dobj);
+				/* addObjectDependency(&chunk_tdinfo->dobj, tbinfo->dobj.dumpId);  do we need this here? */
+				chunk_tdinfo->chunking = true;
+				chunk_tdinfo->startPage = current_chunk_start;
+				chunk_tdinfo->endPage = current_chunk_start + dopt->huge_table_chunk_pages - 1;
+
+				pg_log_warning("CHUNKING: toc for pages [%u:%u]",chunk_tdinfo->startPage, chunk_tdinfo->endPage);
+				
+				current_chunk_start += dopt->huge_table_chunk_pages;
+				if (current_chunk_start >= (BlockNumber) tbinfo->relpages)
+					chunk_tdinfo->endPage = UINT32_MAX; /* last chunk is for "all the rest" */
+
+				printfPQExpBuffer(chunk_desc, "TABLE DATA (pages %u:%u)", chunk_tdinfo->startPage, chunk_tdinfo->endPage);
+
+				te = ArchiveEntry(fout, chunk_tdinfo->dobj.catId, chunk_tdinfo->dobj.dumpId,
+							ARCHIVE_OPTS(.tag = tbinfo->dobj.name,
+										.namespace = tbinfo->dobj.namespace->dobj.name,
+										.owner = tbinfo->rolname,
+										.description = chunk_desc->data,
+										.section = SECTION_DATA,
+										.createStmt = tdDefn,
+										.copyStmt = copyStmt,
+										.deps = &(tbinfo->dobj.dumpId),
+										.nDeps = 1,
+										.dumpFn = dumpFn,
+										.dumpArg = chunk_tdinfo));
+
+				te->dataLength = dopt->huge_table_chunk_pages;
+				/* let's assume toast pages distribute evenly among chunks */
+				te->dataLength += (off_t)dopt->huge_table_chunk_pages * tbinfo->toastpages / tbinfo->relpages;
+			}
+
+			destroyPQExpBuffer(chunk_desc);
+		}
 	}
 
 	destroyPQExpBuffer(copyBuf);
@@ -3077,6 +3159,9 @@ makeTableDataInfo(DumpOptions *dopt, TableInfo *tbinfo)
 	tdinfo->dobj.namespace = tbinfo->dobj.namespace;
 	tdinfo->tdtable = tbinfo;
 	tdinfo->filtercond = NULL;	/* might get set later */
+	tdinfo->chunking = false; /* defaults */
+	tdinfo->startPage = 0;
+	tdinfo->endPage = UINT32_MAX;
 	addObjectDependency(&tdinfo->dobj, tbinfo->dobj.dumpId);
 
 	/* A TableDataInfo contains data, of course */
diff --git a/src/bin/pg_dump/pg_dump.h b/src/bin/pg_dump/pg_dump.h
index 72a00e1bc20..30e8160ea66 100644
--- a/src/bin/pg_dump/pg_dump.h
+++ b/src/bin/pg_dump/pg_dump.h
@@ -16,6 +16,7 @@
 
 #include "pg_backup.h"
 #include "catalog/pg_publication_d.h"
+#include "storage/block.h"
 
 
 #define oidcmp(x,y) ( ((x) < (y) ? -1 : ((x) > (y)) ?  1 : 0) )
@@ -413,6 +414,12 @@ typedef struct _tableDataInfo
 	DumpableObject dobj;
 	TableInfo  *tdtable;		/* link to table to dump */
 	char	   *filtercond;		/* WHERE condition to limit rows dumped */
+	bool		chunking;
+	BlockNumber	startPage;		/* starting table page */
+	BlockNumber	endPage;		/* ending table page for a page-range dump,
+								 * usually startPage + huge_table_chunk_pages - 1,
+								 * but we may want some hysteresis to avoid
+								 * single-page chunks */
 } TableDataInfo;
 
 typedef struct _indxInfo
-- 
2.51.2.1041.gc1ab5b90ca-goog

dumptest=# create table t1(id serial primary key, data text);
CREATE TABLE
dumptest=# insert into t1(data) values('a'), ('b'), ('c');
INSERT 0 3
dumptest=# create table t2(id serial primary key, data char(1800));
CREATE TABLE
dumptest=# insert into t2(data) select i from generate_series(1,30) g(i);
INSERT 0 30
dumptest=# select ctid from t2;
 ctid  
-------
 (0,1)
 (0,2)
 (0,3)
 (0,4)
 (1,1)
 (1,2)
 (1,3)
 (1,4)
 (2,1)
 (2,2)
 (2,3)
 (2,4)
 (3,1)
 (3,2)
 (3,3)
 (3,4)
 (4,1)
 (4,2)
 (4,3)
 (4,4)
 (5,1)
 (5,2)
 (5,3)
 (5,4)
 (6,1)
 (6,2)
 (6,3)
 (6,4)
 (7,1)
 (7,2)
(30 rows)


----------------------------
hannuk@hannuk007:~/work5/postgres-git/src/bin/pg_dump$ ./pg_dump -h /var/run/postgresql -p 5432 --format=directory --huge-table-chunk-pages=2 -f ../dumptest dumptest
pg_dump: warning: CHUNKING: set dopt.huge_table_chunk_pages to [2]
pg_dump: warning: CHUNKING: toc for simple relpages [1]
pg_dump: warning: CHUNKING: toc for chunked relpages [8]
pg_dump: warning: CHUNKING: toc for pages [0:1]
pg_dump: warning: CHUNKING: toc for pages [2:3]
pg_dump: warning: CHUNKING: toc for pages [4:5]
pg_dump: warning: CHUNKING: toc for pages [6:7]
pg_dump: warning: CHUNKING: data query: COPY public.t1 (id, data) TO stdout;
pg_dump: warning: CHUNKING: pages [0:1]
pg_dump: warning: CHUNKING: data query: COPY (SELECT id, data FROM public.t2  WHERE  ctid BETWEEN '(0,1)' AND '(1,32000)') TO stdout;
pg_dump: warning: CHUNKING: pages [2:3]
pg_dump: warning: CHUNKING: data query: COPY (SELECT id, data FROM public.t2  WHERE  ctid BETWEEN '(2,1)' AND '(3,32000)') TO stdout;
pg_dump: warning: CHUNKING: pages [4:5]
pg_dump: warning: CHUNKING: data query: COPY (SELECT id, data FROM public.t2  WHERE  ctid BETWEEN '(4,1)' AND '(5,32000)') TO stdout;
pg_dump: warning: CHUNKING: pages [6:4294967295]
pg_dump: warning: CHUNKING: data query: COPY (SELECT id, data FROM public.t2  WHERE  ctid BETWEEN '(6,1)' AND '(4294967295,32000)') TO stdout;

hannuk@hannuk007:~/work5/postgres-git/src/bin/pg_dump$ ls -l ../dumptest/
total 28
-rw-r--r-- 1 hannuk primarygroup   37 Nov 11 15:28 4012.dat.gz
-rw-r--r-- 1 hannuk primarygroup  100 Nov 11 15:28 4021.dat.gz
-rw-r--r-- 1 hannuk primarygroup  108 Nov 11 15:28 4022.dat.gz
-rw-r--r-- 1 hannuk primarygroup  110 Nov 11 15:28 4023.dat.gz
-rw-r--r-- 1 hannuk primarygroup   94 Nov 11 15:28 4024.dat.gz
-rw-r--r-- 1 hannuk primarygroup 4790 Nov 11 15:28 toc.dat

hannuk@hannuk007:~/work5/postgres-git/src/bin/pg_dump$ ./pg_restore --list ../dumptest
;
; Archive created at 2025-11-11 15:28:40 CET
;     dbname: dumptest
;     TOC Entries: 21
;     Compression: gzip
;     Dump Version: 1.16-0
;     Format: DIRECTORY
;     Integer: 4 bytes
;     Offset: 8 bytes
;     Dumped from database version: 16.4 (Debian 16.4-3+build4)
;     Dumped by pg_dump version: 19devel
;
;
; Selected TOC Entries:
;
218; 1259 1255969 TABLE public t1 hannuk
217; 1259 1255968 SEQUENCE public t1_id_seq hannuk
4019; 0 0 SEQUENCE OWNED BY public t1_id_seq hannuk
216; 1259 1255960 TABLE public t2 hannuk
215; 1259 1255959 SEQUENCE public t2_id_seq hannuk
4020; 0 0 SEQUENCE OWNED BY public t2_id_seq hannuk
3861; 2604 1255972 DEFAULT public t1 id hannuk
3860; 2604 1255963 DEFAULT public t2 id hannuk
4012; 0 1255969 TABLE DATA public t1 hannuk
4021; 0 1255960 TABLE DATA (pages 0:1) public t2 hannuk
4022; 0 1255960 TABLE DATA (pages 2:3) public t2 hannuk
4023; 0 1255960 TABLE DATA (pages 4:5) public t2 hannuk
4024; 0 1255960 TABLE DATA (pages 6:4294967295) public t2 hannuk
4025; 0 0 SEQUENCE SET public t1_id_seq hannuk
4026; 0 0 SEQUENCE SET public t2_id_seq hannuk
3865; 2606 1255976 CONSTRAINT public t1 t1_pkey hannuk
3863; 2606 1255967 CONSTRAINT public t2 t2_pkey hannuk

hannuk@hannuk007:~/work5/postgres-git/src/bin/pg_dump$ ./pg_restore ../dumptest -f ../dumptest/dump.sql

Attachment: dump.sql
Description: application/sql
