Attached is a patch that adds the ability to dump table data in multiple chunks.

I am looking for feedback at this point:

1) What have I missed?
2) Should I implement something to avoid single-page chunks? (For example, with --huge-table-chunk-pages=1000 a 2001-page table currently ends with a chunk that covers a single page.)

The new flag --huge-table-chunk-pages tells the directory-format dump to dump any table whose main fork has more pages than this value in multiple chunks of that many pages.

The main use case is speeding up parallel dumps when there is one, or a small number of, HUGE tables, so that parts of these can be dumped in parallel. It also helps when the target file system has limits on file sizes (4 GB for FAT, 5 TB for GCS).

Currently no tests are included in the patch, and there is no documentation beyond what is printed by pg_dump --help. The pg_log_warning lines containing "CHUNKING" are only there for debugging and need to be removed before committing.

As implemented, no changes are needed in pg_restore: all chunks are already associated with the table in the .toc and are therefore restored into that table.

An example invocation follows the patch. The attached README shows how I verified that it works, and the textual file created from the directory-format dump in its last step is attached as well.

-- Hannu
From 015cc46de277971d97c3b1823a5777fccb56c270 Mon Sep 17 00:00:00 2001
From: Hannu Krosing <[email protected]>
Date: Tue, 11 Nov 2025 16:11:08 +0100
Subject: [PATCH] adds ability to dump data for tables in multiple chunks
 controlled by flag --huge-table-chunk-pages

---
 src/bin/pg_dump/pg_backup.h          |   1 +
 src/bin/pg_dump/pg_backup_archiver.c |   1 +
 src/bin/pg_dump/pg_dump.c            | 157 +++++++++++++++++++++------
 src/bin/pg_dump/pg_dump.h            |   7 ++
 4 files changed, 130 insertions(+), 36 deletions(-)

diff --git a/src/bin/pg_dump/pg_backup.h b/src/bin/pg_dump/pg_backup.h
index d9041dad720..b71caed8b83 100644
--- a/src/bin/pg_dump/pg_backup.h
+++ b/src/bin/pg_dump/pg_backup.h
@@ -178,6 +178,7 @@ typedef struct _dumpOptions
 	bool		aclsSkip;
 	const char *lockWaitTimeout;
 	int			dump_inserts;	/* 0 = COPY, otherwise rows per INSERT */
+	int			huge_table_chunk_pages; /* chunk when relpages is above this */
 
 	/* flags for various command-line long options */
 	int			disable_dollar_quoting;
diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c
index 59eaecb4ed7..d555e365ea5 100644
--- a/src/bin/pg_dump/pg_backup_archiver.c
+++ b/src/bin/pg_dump/pg_backup_archiver.c
@@ -154,6 +154,7 @@ InitDumpOptions(DumpOptions *opts)
 	opts->dumpSchema = true;
 	opts->dumpData = true;
 	opts->dumpStatistics = false;
+	opts->huge_table_chunk_pages = UINT32_MAX;	/* disable chunking by default */
 }
 
 /*
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index a00918bacb4..e9ccc8e43ed 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -538,6 +538,7 @@ main(int argc, char **argv)
 		{"exclude-extension", required_argument, NULL, 17},
 		{"sequence-data", no_argument, &dopt.sequence_data, 1},
 		{"restrict-key", required_argument, NULL, 25},
+		{"huge-table-chunk-pages", required_argument, NULL, 26},
 
 		{NULL, 0, NULL, 0}
 	};
@@ -802,6 +803,13 @@ main(int argc, char **argv)
 				dopt.restrict_key = pg_strdup(optarg);
 				break;
 
+			case 26:			/* huge table chunk pages */
+				if (!option_parse_int(optarg, "--huge-table-chunk-pages", 1, INT32_MAX,
+									  &dopt.huge_table_chunk_pages))
+					exit_nicely(1);
+				pg_log_warning("CHUNKING: set dopt.huge_table_chunk_pages to [%u]", (BlockNumber) dopt.huge_table_chunk_pages);
+				break;
+
 			default:
 				/* getopt_long already emitted a complaint */
 				pg_log_error_hint("Try \"%s --help\" for more information.", progname);
@@ -1357,6 +1365,9 @@ help(const char *progname)
 	printf(_("  --extra-float-digits=NUM    override default setting for extra_float_digits\n"));
 	printf(_("  --filter=FILENAME           include or exclude objects and data from dump\n"
 			 "                              based on expressions in FILENAME\n"));
+	printf(_("  --huge-table-chunk-pages=NUMPAGES\n"
+			 "                              number of main table pages above which data is\n"
+			 "                              copied out in chunks, also determines the chunk size\n"));
 	printf(_("  --if-exists                 use IF EXISTS when dropping objects\n"));
 	printf(_("  --include-foreign-data=PATTERN\n"
 			 "                              include data of foreign tables on foreign\n"
@@ -2397,7 +2408,7 @@ dumpTableData_copy(Archive *fout, const void *dcontext)
 	 * a filter condition was specified.  For other cases a simple COPY
 	 * suffices.
 	 */
-	if (tdinfo->filtercond || tbinfo->relkind == RELKIND_FOREIGN_TABLE)
+	if (tdinfo->filtercond || tdinfo->chunking || tbinfo->relkind == RELKIND_FOREIGN_TABLE)
 	{
 		/* Temporary allows to access to foreign tables to dump data */
 		if (tbinfo->relkind == RELKIND_FOREIGN_TABLE)
@@ -2413,9 +2424,18 @@ dumpTableData_copy(Archive *fout, const void *dcontext)
 		else
 			appendPQExpBufferStr(q, "* ");
 
-		appendPQExpBuffer(q, "FROM %s %s) TO stdout;",
+		appendPQExpBuffer(q, "FROM %s %s",
 						  fmtQualifiedDumpable(tbinfo),
 						  tdinfo->filtercond ? tdinfo->filtercond : "");
+		if (tdinfo->chunking)
+		{
+			appendPQExpBuffer(q, "%s ctid BETWEEN '(%u,1)' AND '(%u,32000)'", /* there is no (*,0) tuple */
+							  tdinfo->filtercond ? " AND " : " WHERE ",
+							  tdinfo->startPage, tdinfo->endPage);
+			pg_log_warning("CHUNKING: pages [%u:%u]", tdinfo->startPage, tdinfo->endPage);
+		}
+
+		appendPQExpBuffer(q, ") TO stdout;");
 	}
 	else
 	{
@@ -2423,6 +2443,9 @@
 						  fmtQualifiedDumpable(tbinfo),
 						  column_list);
 	}
+
+	pg_log_warning("CHUNKING: data query: %s", q->data);
+
 	res = ExecuteSqlQuery(fout, q->data, PGRES_COPY_OUT);
 	PQclear(res);
 	destroyPQExpBuffer(clistBuf);
@@ -2918,42 +2941,101 @@ dumpTableData(Archive *fout, const TableDataInfo *tdinfo)
 	{
 		TocEntry   *te;
 
-		te = ArchiveEntry(fout, tdinfo->dobj.catId, tdinfo->dobj.dumpId,
-						  ARCHIVE_OPTS(.tag = tbinfo->dobj.name,
-									   .namespace = tbinfo->dobj.namespace->dobj.name,
-									   .owner = tbinfo->rolname,
-									   .description = "TABLE DATA",
-									   .section = SECTION_DATA,
-									   .createStmt = tdDefn,
-									   .copyStmt = copyStmt,
-									   .deps = &(tbinfo->dobj.dumpId),
-									   .nDeps = 1,
-									   .dumpFn = dumpFn,
-									   .dumpArg = tdinfo));
-
-		/*
-		 * Set the TocEntry's dataLength in case we are doing a parallel dump
-		 * and want to order dump jobs by table size.  We choose to measure
-		 * dataLength in table pages (including TOAST pages) during dump, so
-		 * no scaling is needed.
-		 *
-		 * However, relpages is declared as "integer" in pg_class, and hence
-		 * also in TableInfo, but it's really BlockNumber a/k/a unsigned int.
-		 * Cast so that we get the right interpretation of table sizes
-		 * exceeding INT_MAX pages.
+		/* chunking works off relpages, which may be slightly off
+		 * but is the best we have without doing our own page count;
+		 * should be enough for the typical use case of huge tables, which
+		 * should have their relpages updated by autovacuum
+		 *
+		 * We should likely have a slight hysteresis here to avoid
+		 * tiny chunks when relpages is close to the threshold
 		 */
-		te->dataLength = (BlockNumber) tbinfo->relpages;
-		te->dataLength += (BlockNumber) tbinfo->toastpages;
+		if ((BlockNumber) tbinfo->relpages < dopt->huge_table_chunk_pages) /* TODO: add hysteresis here, maybe < 1.1 * huge_table_chunk_pages */
+		{
+			pg_log_warning("CHUNKING: toc for simple relpages [%u]", (BlockNumber) tbinfo->relpages);
+
+			te = ArchiveEntry(fout, tdinfo->dobj.catId, tdinfo->dobj.dumpId,
+							  ARCHIVE_OPTS(.tag = tbinfo->dobj.name,
+										   .namespace = tbinfo->dobj.namespace->dobj.name,
+										   .owner = tbinfo->rolname,
+										   .description = "TABLE DATA",
+										   .section = SECTION_DATA,
+										   .createStmt = tdDefn,
+										   .copyStmt = copyStmt,
+										   .deps = &(tbinfo->dobj.dumpId),
+										   .nDeps = 1,
+										   .dumpFn = dumpFn,
+										   .dumpArg = tdinfo));
 
-		/*
-		 * If pgoff_t is only 32 bits wide, the above refinement is useless,
-		 * and instead we'd better worry about integer overflow.  Clamp to
-		 * INT_MAX if the correct result exceeds that.
-		 */
-		if (sizeof(te->dataLength) == 4 &&
-			(tbinfo->relpages < 0 || tbinfo->toastpages < 0 ||
-			 te->dataLength < 0))
-			te->dataLength = INT_MAX;
+			/*
+			 * Set the TocEntry's dataLength in case we are doing a parallel dump
+			 * and want to order dump jobs by table size. We choose to measure
+			 * dataLength in table pages (including TOAST pages) during dump, so
+			 * no scaling is needed.
+			 *
+			 * However, relpages is declared as "integer" in pg_class, and hence
+			 * also in TableInfo, but it's really BlockNumber a/k/a unsigned int.
+			 * Cast so that we get the right interpretation of table sizes
+			 * exceeding INT_MAX pages.
+			 */
+			te->dataLength = (BlockNumber) tbinfo->relpages;
+			te->dataLength += (BlockNumber) tbinfo->toastpages;
+
+			/*
+			 * If pgoff_t is only 32 bits wide, the above refinement is useless,
+			 * and instead we'd better worry about integer overflow.  Clamp to
+			 * INT_MAX if the correct result exceeds that.
+			 */
+			if (sizeof(te->dataLength) == 4 &&
+				(tbinfo->relpages < 0 || tbinfo->toastpages < 0 ||
+				 te->dataLength < 0))
+				te->dataLength = INT_MAX;
+		}
+		else
+		{
+			BlockNumber current_chunk_start = 0;
+			PQExpBuffer chunk_desc = createPQExpBuffer();
+
+			pg_log_warning("CHUNKING: toc for chunked relpages [%u]", (BlockNumber) tbinfo->relpages);
+
+			while (current_chunk_start < (BlockNumber) tbinfo->relpages) /* TODO: add hysteresis here, maybe < 1.1 * huge_table_chunk_pages */
+			{
+				TableDataInfo *chunk_tdinfo = (TableDataInfo *) pg_malloc(sizeof(TableDataInfo));
+
+				memcpy(chunk_tdinfo, tdinfo, sizeof(TableDataInfo));
+				AssignDumpId(&chunk_tdinfo->dobj);
+				//addObjectDependency(&chunk_tdinfo->dobj, tbinfo->dobj.dumpId); /* do we need this here */
+				chunk_tdinfo->chunking = true;
+				chunk_tdinfo->startPage = current_chunk_start;
+				chunk_tdinfo->endPage = current_chunk_start + dopt->huge_table_chunk_pages - 1;
+
+				pg_log_warning("CHUNKING: toc for pages [%u:%u]", chunk_tdinfo->startPage, chunk_tdinfo->endPage);
+
+				current_chunk_start += dopt->huge_table_chunk_pages;
+				if (current_chunk_start >= (BlockNumber) tbinfo->relpages)
+					chunk_tdinfo->endPage = UINT32_MAX;	/* last chunk is for "all the rest" */
+
+				printfPQExpBuffer(chunk_desc, "TABLE DATA (pages %u:%u)", chunk_tdinfo->startPage, chunk_tdinfo->endPage);
+
+				te = ArchiveEntry(fout, chunk_tdinfo->dobj.catId, chunk_tdinfo->dobj.dumpId,
+								  ARCHIVE_OPTS(.tag = tbinfo->dobj.name,
+											   .namespace = tbinfo->dobj.namespace->dobj.name,
+											   .owner = tbinfo->rolname,
+											   .description = chunk_desc->data,
+											   .section = SECTION_DATA,
+											   .createStmt = tdDefn,
+											   .copyStmt = copyStmt,
+											   .deps = &(tbinfo->dobj.dumpId),
+											   .nDeps = 1,
+											   .dumpFn = dumpFn,
+											   .dumpArg = chunk_tdinfo));
+
+				te->dataLength = dopt->huge_table_chunk_pages;
+				/* let's assume toast pages distribute evenly among chunks */
+				te->dataLength += (off_t) dopt->huge_table_chunk_pages * tbinfo->toastpages / tbinfo->relpages;
+			}
+
+			destroyPQExpBuffer(chunk_desc);
+		}
 	}
 
 	destroyPQExpBuffer(copyBuf);
@@ -3077,6 +3159,9 @@ makeTableDataInfo(DumpOptions *dopt, TableInfo *tbinfo)
 	tdinfo->dobj.namespace = tbinfo->dobj.namespace;
 	tdinfo->tdtable = tbinfo;
 	tdinfo->filtercond = NULL;	/* might get set later */
+	tdinfo->chunking = false;	/* defaults */
+	tdinfo->startPage = 0;
+	tdinfo->endPage = UINT32_MAX;
 	addObjectDependency(&tdinfo->dobj, tbinfo->dobj.dumpId);
 
 	/* A TableDataInfo contains data, of course */
diff --git a/src/bin/pg_dump/pg_dump.h b/src/bin/pg_dump/pg_dump.h
index 72a00e1bc20..30e8160ea66 100644
--- a/src/bin/pg_dump/pg_dump.h
+++ b/src/bin/pg_dump/pg_dump.h
@@ -16,6 +16,7 @@
 #include "pg_backup.h"
 
 #include "catalog/pg_publication_d.h"
+#include "storage/block.h"
 
 #define oidcmp(x,y) ( ((x) < (y) ? -1 : ((x) > (y)) ? 1 : 0) )
 
@@ -413,6 +414,12 @@ typedef struct _tableDataInfo
 	DumpableObject dobj;
 	TableInfo  *tdtable;		/* link to table to dump */
 	char	   *filtercond;		/* WHERE condition to limit rows dumped */
+	bool		chunking;
+	BlockNumber startPage;		/* starting table page */
+	BlockNumber endPage;		/* ending table page for page-range dump,
+								 * usually startPage+huge_table_chunk_pages
+								 * but we may want to do some small hysteresis to avoid single-page chunks
+								 */
 } TableDataInfo;
 
 typedef struct _indxInfo
-- 
2.51.2.1041.gc1ab5b90ca-goog
dumptest=# create table t1(id serial primary key, data text);
CREATE TABLE
dumptest=# insert into t1(data) values('a'), ('b'), ('c');
INSERT 0 3
dumptest=# create table t2(id serial primary key, data char(1800));
CREATE TABLE
dumptest=# insert into t2(data) select i from generate_series(1,30) g(i);
INSERT 0 30
dumptest=# select ctid from t2;
ctid
-------
(0,1)
(0,2)
(0,3)
(0,4)
(1,1)
(1,2)
(1,3)
(1,4)
(2,1)
(2,2)
(2,3)
(2,4)
(3,1)
(3,2)
(3,3)
(3,4)
(4,1)
(4,2)
(4,3)
(4,4)
(5,1)
(5,2)
(5,3)
(5,4)
(6,1)
(6,2)
(6,3)
(6,4)
(7,1)
(7,2)
(30 rows)
----------------------------
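The page counts that the chunking decision is based on can be checked before dumping (this step is not in the original README; it assumes relpages has been refreshed, e.g. by VACUUM or autovacuum):

    psql -h /var/run/postgresql -p 5432 dumptest \
         -c "vacuum analyze t1, t2" \
         -c "select relname, relpages from pg_class where relname in ('t1', 't2')"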
hannuk@hannuk007:~/work5/postgres-git/src/bin/pg_dump$ ./pg_dump -h /var/run/postgresql -p 5432 --format=directory --huge-table-chunk-pages=2 -f ../dumptest dumptest
pg_dump: warning: CHUNKING: set dopt.huge_table_chunk_pages to [2]
pg_dump: warning: CHUNKING: toc for simple relpages [1]
pg_dump: warning: CHUNKING: toc for chunked relpages [8]
pg_dump: warning: CHUNKING: toc for pages [0:1]
pg_dump: warning: CHUNKING: toc for pages [2:3]
pg_dump: warning: CHUNKING: toc for pages [4:5]
pg_dump: warning: CHUNKING: toc for pages [6:7]
pg_dump: warning: CHUNKING: data query: COPY public.t1 (id, data) TO stdout;
pg_dump: warning: CHUNKING: pages [0:1]
pg_dump: warning: CHUNKING: data query: COPY (SELECT id, data FROM public.t2 WHERE ctid BETWEEN '(0,1)' AND '(1,32000)') TO stdout;
pg_dump: warning: CHUNKING: pages [2:3]
pg_dump: warning: CHUNKING: data query: COPY (SELECT id, data FROM public.t2 WHERE ctid BETWEEN '(2,1)' AND '(3,32000)') TO stdout;
pg_dump: warning: CHUNKING: pages [4:5]
pg_dump: warning: CHUNKING: data query: COPY (SELECT id, data FROM public.t2 WHERE ctid BETWEEN '(4,1)' AND '(5,32000)') TO stdout;
pg_dump: warning: CHUNKING: pages [6:4294967295]
pg_dump: warning: CHUNKING: data query: COPY (SELECT id, data FROM public.t2 WHERE ctid BETWEEN '(6,1)' AND '(4294967295,32000)') TO stdout;
hannuk@hannuk007:~/work5/postgres-git/src/bin/pg_dump$ ls -l ../dumptest/
total 28
-rw-r--r-- 1 hannuk primarygroup 37 Nov 11 15:28 4012.dat.gz
-rw-r--r-- 1 hannuk primarygroup 100 Nov 11 15:28 4021.dat.gz
-rw-r--r-- 1 hannuk primarygroup 108 Nov 11 15:28 4022.dat.gz
-rw-r--r-- 1 hannuk primarygroup 110 Nov 11 15:28 4023.dat.gz
-rw-r--r-- 1 hannuk primarygroup 94 Nov 11 15:28 4024.dat.gz
-rw-r--r-- 1 hannuk primarygroup 4790 Nov 11 15:28 toc.dat
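Each chunk lands in its own data file, so the contents of a single chunk can be spot-checked directly (not part of the original README; per the TOC listing below, 4021.dat.gz is the pages 0:1 chunk of t2):

    zcat ../dumptest/4021.dat.gz | head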
hannuk@hannuk007:~/work5/postgres-git/src/bin/pg_dump$ ./pg_restore --list ../dumptest
;
; Archive created at 2025-11-11 15:28:40 CET
; dbname: dumptest
; TOC Entries: 21
; Compression: gzip
; Dump Version: 1.16-0
; Format: DIRECTORY
; Integer: 4 bytes
; Offset: 8 bytes
; Dumped from database version: 16.4 (Debian 16.4-3+build4)
; Dumped by pg_dump version: 19devel
;
;
; Selected TOC Entries:
;
218; 1259 1255969 TABLE public t1 hannuk
217; 1259 1255968 SEQUENCE public t1_id_seq hannuk
4019; 0 0 SEQUENCE OWNED BY public t1_id_seq hannuk
216; 1259 1255960 TABLE public t2 hannuk
215; 1259 1255959 SEQUENCE public t2_id_seq hannuk
4020; 0 0 SEQUENCE OWNED BY public t2_id_seq hannuk
3861; 2604 1255972 DEFAULT public t1 id hannuk
3860; 2604 1255963 DEFAULT public t2 id hannuk
4012; 0 1255969 TABLE DATA public t1 hannuk
4021; 0 1255960 TABLE DATA (pages 0:1) public t2 hannuk
4022; 0 1255960 TABLE DATA (pages 2:3) public t2 hannuk
4023; 0 1255960 TABLE DATA (pages 4:5) public t2 hannuk
4024; 0 1255960 TABLE DATA (pages 6:4294967295) public t2 hannuk
4025; 0 0 SEQUENCE SET public t1_id_seq hannuk
4026; 0 0 SEQUENCE SET public t2_id_seq hannuk
3865; 2606 1255976 CONSTRAINT public t1 t1_pkey hannuk
3863; 2606 1255967 CONSTRAINT public t2 t2_pkey hannuk
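One way to double-check the chunked dump (not shown in the original README) is to restore it into a scratch database and compare against the source; for example, assuming a throwaway database named dumptest_restore:

    createdb -h /var/run/postgresql -p 5432 dumptest_restore
    ./pg_restore -h /var/run/postgresql -p 5432 --jobs=4 -d dumptest_restore ../dumptest
    psql -h /var/run/postgresql -p 5432 dumptest -c "select count(*), md5(string_agg(data, ',' order by id)) from t2"
    psql -h /var/run/postgresql -p 5432 dumptest_restore -c "select count(*), md5(string_agg(data, ',' order by id)) from t2"

Both queries should return the same count and checksum.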
hannuk@hannuk007:~/work5/postgres-git/src/bin/pg_dump$ ./pg_restore ../dumptest -f ../dumptest/dump.sql
[Attachment: dump.sql (application/sql) -- the plain-text dump produced by the last step]