Fixing all the warnings
On Wed, Jan 21, 2026 at 2:05 PM Hannu Krosing <[email protected]> wrote:
>
> Please find the latest patch attached, which incorporates the feedback
> received.
>
> * changed flag name to --max-table-segment-pages
> * added check for amname = "heap"
> * switched to using
>   pg_relation_size(c.oid)/current_setting('block_size')::int when
>   --max-table-segment-pages is set
> * added option_parse_uint32(...) to be used for the full range of page numbers
> * the COPY SELECTs now use <=, BETWEEN, or >= depending on the segment
>   position
>
> * added documentation
>
> * TESTS:
>   * added a simple chunked dump and restore test
>   * added a WARNING with row count and table data hash to the source and
>     chunked-restore databases
>
> I left in the boolean that indicates whether this is a full table or a
> chunk (it was named chunking, now is_segment).
>
> An alternative would be to use an expression like (td->startPage != 0
> || td->endPage != InvalidBlockNumber) wherever td->is_segment is
> needed.
>
> If you insist on not having a separate structure member, we could turn
> this into something like
>
> #define is_segment(td) ((td)->startPage != 0 || (td)->endPage != InvalidBlockNumber)
>
> and then use is_segment(td) instead of td->is_segment where needed.
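For illustration, the per-segment COPY statements that the patched dumpTableData_copy() builds come in three shapes, depending on whether the segment is the first, a middle, or the last one. The table name and page numbers below are hypothetical; offset 32000 is simply larger than any line pointer number that fits on an 8kB heap page, and there is no (*,0) tuple, so the bounds are inclusive per page:

    -- hypothetical table public.t dumped with --max-table-segment-pages=1000
    -- first segment: pages 0..999
    COPY (SELECT * FROM public.t WHERE ctid <= '(999,32000)') TO stdout;
    -- middle segment: pages 1000..1999
    COPY (SELECT * FROM public.t WHERE ctid BETWEEN '(1000,1)' AND '(1999,32000)') TO stdout;
    -- last segment: pages 2000 up to the current end of the table
    COPY (SELECT * FROM public.t WHERE ctid >= '(2000,1)') TO stdout;

On PostgreSQL 14 and later such ctid range conditions can be satisfied by a TID range scan, so each parallel worker reads only its own block range rather than the whole table.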
From 9a77781df2140cb5676a79eea6b32934672ee36f Mon Sep 17 00:00:00 2001
From: Hannu Krosing <[email protected]>
Date: Thu, 22 Jan 2026 17:56:34 +0100
Subject: [PATCH v10] * changed flag name to max-table-segment-pages
 * added check for amname = "heap"
 * added simple chunked dump and restore test
 * switched to using pg_relation_size(c.oid)/current_setting('block_size')::int
   when --max-table-segment-pages is set
 * added documentation
 * added option_parse_uint32(...) to be used for the full range of page numbers
 * .. and fixed the warnings
 * TESTS: added a WARNING with count and table data hash to the source and
   chunked restore database
---
 doc/src/sgml/ref/pg_dump.sgml             |  24 +++
 src/bin/pg_dump/pg_backup.h               |   2 +
 src/bin/pg_dump/pg_backup_archiver.c      |   2 +
 src/bin/pg_dump/pg_dump.c                 | 170 +++++++++++++++++-----
 src/bin/pg_dump/pg_dump.h                 |   7 +
 src/bin/pg_dump/t/004_pg_dump_parallel.pl |  52 +++++++
 src/fe_utils/option_utils.c               |  57 ++++++++
 src/include/fe_utils/option_utils.h       |   3 +
 8 files changed, 282 insertions(+), 35 deletions(-)

diff --git a/doc/src/sgml/ref/pg_dump.sgml b/doc/src/sgml/ref/pg_dump.sgml
index 688e23c0e90..1811c67d141 100644
--- a/doc/src/sgml/ref/pg_dump.sgml
+++ b/doc/src/sgml/ref/pg_dump.sgml
@@ -1088,6 +1088,30 @@ PostgreSQL documentation
      </listitem>
     </varlistentry>
 
+    <varlistentry>
+     <term><option>--max-table-segment-pages=<replaceable class="parameter">npages</replaceable></option></term>
+     <listitem>
+      <para>
+       Dump table data in segments, based on the number of pages in the main relation.
+       If the number of data pages in the relation is more than <replaceable class="parameter">npages</replaceable>,
+       the data is split into segments of that many pages.
+       Individual segments can be dumped in parallel.
+      </para>
+
+      <note>
+       <para>
+        The option <option>--max-table-segment-pages</option> applies only to pages
+        of the main heap; if the table has a large TOASTed part, this has to be
+        taken into account when deciding on the number of pages to use.
+        In the extreme case a single 8kB heap page can hold ~200 TOAST pointers,
+        each corresponding to 1GB of data.  If that data is also incompressible,
+        a single-page segment can dump as a 200GB file.
+       </para>
+      </note>
+
+     </listitem>
+    </varlistentry>
+
     <varlistentry>
      <term><option>--no-comments</option></term>
      <listitem>
diff --git a/src/bin/pg_dump/pg_backup.h b/src/bin/pg_dump/pg_backup.h
index d9041dad720..b63ae05d895 100644
--- a/src/bin/pg_dump/pg_backup.h
+++ b/src/bin/pg_dump/pg_backup.h
@@ -27,6 +27,7 @@
 #include "common/file_utils.h"
 #include "fe_utils/simple_list.h"
 #include "libpq-fe.h"
+#include "storage/block.h"
 
 
 typedef enum trivalue
@@ -178,6 +179,7 @@ typedef struct _dumpOptions
     bool        aclsSkip;
     const char *lockWaitTimeout;
     int         dump_inserts;   /* 0 = COPY, otherwise rows per INSERT */
+    BlockNumber max_table_segment_pages;    /* chunk when relpages is above this */
 
     /* flags for various command-line long options */
     int         disable_dollar_quoting;
diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c
index 4a63f7392ae..ed1913d66bc 100644
--- a/src/bin/pg_dump/pg_backup_archiver.c
+++ b/src/bin/pg_dump/pg_backup_archiver.c
@@ -44,6 +44,7 @@
 #include "pg_backup_archiver.h"
 #include "pg_backup_db.h"
 #include "pg_backup_utils.h"
+#include "storage/block.h"
 
 #define TEXT_DUMP_HEADER "--\n-- PostgreSQL database dump\n--\n\n"
 #define TEXT_DUMPALL_HEADER "--\n-- PostgreSQL database cluster dump\n--\n\n"
@@ -154,6 +155,7 @@ InitDumpOptions(DumpOptions *opts)
     opts->dumpSchema = true;
     opts->dumpData = true;
     opts->dumpStatistics = false;
+    opts->max_table_segment_pages = InvalidBlockNumber;
 }
 
 /*
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index 687dc98e46d..ca7e9c5eeba 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -539,6 +539,7 @@ main(int argc, char **argv)
         {"exclude-extension", required_argument, NULL, 17},
         {"sequence-data", no_argument, &dopt.sequence_data, 1},
         {"restrict-key", required_argument, NULL, 25},
+        {"max-table-segment-pages", required_argument, NULL, 26},
 
         {NULL, 0, NULL, 0}
     };
@@ -803,6 +804,13 @@ main(int argc, char **argv)
                 dopt.restrict_key = pg_strdup(optarg);
                 break;
 
+            case 26:
+                if (!option_parse_uint32(optarg, "--max-table-segment-pages", 1, MaxBlockNumber,
+                                         &dopt.max_table_segment_pages))
+                    exit_nicely(1);
+                pg_log_warning("CHUNKING: set dopt.max_table_segment_pages to [%u]", (BlockNumber) dopt.max_table_segment_pages);
+                break;
+
             default:
                 /* getopt_long already emitted a complaint */
                 pg_log_error_hint("Try \"%s --help\" for more information.", progname);
@@ -1372,6 +1380,9 @@ help(const char *progname)
     printf(_("  --extra-float-digits=NUM     override default setting for extra_float_digits\n"));
     printf(_("  --filter=FILENAME            include or exclude objects and data from dump\n"
              "                               based on expressions in FILENAME\n"));
+    printf(_("  --max-table-segment-pages=NUMPAGES\n"
+             "                               number of main table pages above which data is\n"
+             "                               copied out in chunks; also determines the chunk size\n"));
     printf(_("  --if-exists                  use IF EXISTS when dropping objects\n"));
     printf(_("  --include-foreign-data=PATTERN\n"
              "                               include data of foreign tables on foreign\n"
@@ -2412,7 +2423,7 @@ dumpTableData_copy(Archive *fout, const void *dcontext)
      * a filter condition was specified.  For other cases a simple COPY
      * suffices.
      */
-    if (tdinfo->filtercond || tbinfo->relkind == RELKIND_FOREIGN_TABLE)
+    if (tdinfo->filtercond || tdinfo->is_segment || tbinfo->relkind == RELKIND_FOREIGN_TABLE)
     {
         /* Temporary allows to access to foreign tables to dump data */
         if (tbinfo->relkind == RELKIND_FOREIGN_TABLE)
@@ -2428,9 +2439,25 @@ dumpTableData_copy(Archive *fout, const void *dcontext)
         else
             appendPQExpBufferStr(q, "* ");
 
-        appendPQExpBuffer(q, "FROM %s %s) TO stdout;",
+        appendPQExpBuffer(q, "FROM %s %s",
                           fmtQualifiedDumpable(tbinfo),
                           tdinfo->filtercond ? tdinfo->filtercond : "");
+        if (tdinfo->is_segment)
+        {
+            appendPQExpBufferStr(q, tdinfo->filtercond ? " AND " : " WHERE ");
+            if (tdinfo->startPage == 0)
+                appendPQExpBuffer(q, "ctid <= '(%u,32000)'",    /* there is no (*,0) tuple */
+                                  tdinfo->endPage);
+            else if (tdinfo->endPage != InvalidBlockNumber)
+                appendPQExpBuffer(q, "ctid BETWEEN '(%u,1)' AND '(%u,32000)'",  /* there is no (*,0) tuple */
+                                  tdinfo->startPage, tdinfo->endPage);
+            else
+                appendPQExpBuffer(q, "ctid >= '(%u,1)'",    /* there is no (*,0) tuple */
+                                  tdinfo->startPage);
+            pg_log_warning("CHUNKING: pages [%u:%u]", tdinfo->startPage, tdinfo->endPage);
+        }
+
+        appendPQExpBufferStr(q, ") TO stdout;");
     }
     else
     {
@@ -2438,6 +2465,9 @@ dumpTableData_copy(Archive *fout, const void *dcontext)
                           fmtQualifiedDumpable(tbinfo), column_list);
     }
+
+    pg_log_warning("CHUNKING: data query: %s", q->data);
+
     res = ExecuteSqlQuery(fout, q->data, PGRES_COPY_OUT);
     PQclear(res);
     destroyPQExpBuffer(clistBuf);
@@ -2933,41 +2963,101 @@ dumpTableData(Archive *fout, const TableDataInfo *tdinfo)
     {
         TocEntry   *te;
 
-        te = ArchiveEntry(fout, tdinfo->dobj.catId, tdinfo->dobj.dumpId,
-                          ARCHIVE_OPTS(.tag = tbinfo->dobj.name,
-                                       .namespace = tbinfo->dobj.namespace->dobj.name,
-                                       .owner = tbinfo->rolname,
-                                       .description = "TABLE DATA",
-                                       .section = SECTION_DATA,
-                                       .createStmt = tdDefn,
-                                       .copyStmt = copyStmt,
-                                       .deps = &(tbinfo->dobj.dumpId),
-                                       .nDeps = 1,
-                                       .dumpFn = dumpFn,
-                                       .dumpArg = tdinfo));
-
-        /*
-         * Set the TocEntry's dataLength in case we are doing a parallel dump
-         * and want to order dump jobs by table size.  We choose to measure
-         * dataLength in table pages (including TOAST pages) during dump, so
-         * no scaling is needed.
-         *
-         * However, relpages is declared as "integer" in pg_class, and hence
-         * also in TableInfo, but it's really BlockNumber a/k/a unsigned int.
-         * Cast so that we get the right interpretation of table sizes
-         * exceeding INT_MAX pages.
+        /* data chunking works off relpages, which is computed exactly using
+         * pg_relation_size() when --max-table-segment-pages is set
+         *
+         * We also don't chunk if the table access method is not "heap".
+         * TODO: we may add chunking for other access methods later, maybe
+         * based on primary key ranges
         */
-        te->dataLength = (BlockNumber) tbinfo->relpages;
-        te->dataLength += (BlockNumber) tbinfo->toastpages;
+        if ((BlockNumber) tbinfo->relpages <= dopt->max_table_segment_pages ||
+            strcmp(tbinfo->amname, "heap") != 0)
+        {
+            te = ArchiveEntry(fout, tdinfo->dobj.catId, tdinfo->dobj.dumpId,
+                              ARCHIVE_OPTS(.tag = tbinfo->dobj.name,
+                                           .namespace = tbinfo->dobj.namespace->dobj.name,
+                                           .owner = tbinfo->rolname,
+                                           .description = "TABLE DATA",
+                                           .section = SECTION_DATA,
+                                           .createStmt = tdDefn,
+                                           .copyStmt = copyStmt,
+                                           .deps = &(tbinfo->dobj.dumpId),
+                                           .nDeps = 1,
+                                           .dumpFn = dumpFn,
+                                           .dumpArg = tdinfo));
 
-        /*
-         * If pgoff_t is only 32 bits wide, the above refinement is useless,
-         * and instead we'd better worry about integer overflow.  Clamp to
-         * INT_MAX if the correct result exceeds that.
-         */
+            /*
+             * Set the TocEntry's dataLength in case we are doing a parallel dump
+             * and want to order dump jobs by table size.  We choose to measure
+             * dataLength in table pages (including TOAST pages) during dump, so
+             * no scaling is needed.
+             *
+             * However, relpages is declared as "integer" in pg_class, and hence
+             * also in TableInfo, but it's really BlockNumber a/k/a unsigned int.
+             * Cast so that we get the right interpretation of table sizes
+             * exceeding INT_MAX pages.
+             */
+            te->dataLength = (BlockNumber) tbinfo->relpages;
+            te->dataLength += (BlockNumber) tbinfo->toastpages;
+        }
+        else
+        {
+            BlockNumber current_chunk_start = 0;
+            PQExpBuffer chunk_desc = createPQExpBuffer();
+
+            pg_log_warning("CHUNKING: toc for chunked relpages [%u]", (BlockNumber) tbinfo->relpages);
+
+            while (current_chunk_start < (BlockNumber) tbinfo->relpages)
+            {
+                TableDataInfo *chunk_tdinfo = (TableDataInfo *) pg_malloc(sizeof(TableDataInfo));
+
+                memcpy(chunk_tdinfo, tdinfo, sizeof(TableDataInfo));
+                AssignDumpId(&chunk_tdinfo->dobj);
+                /* addObjectDependency(&chunk_tdinfo->dobj, tbinfo->dobj.dumpId); -- do we need this here? */
+                chunk_tdinfo->is_segment = true;
+                chunk_tdinfo->startPage = current_chunk_start;
+                chunk_tdinfo->endPage = current_chunk_start + dopt->max_table_segment_pages - 1;
+
+                pg_log_warning("CHUNKING: toc for pages [%u:%u]", chunk_tdinfo->startPage, chunk_tdinfo->endPage);
+
+                current_chunk_start += dopt->max_table_segment_pages;
+                if (current_chunk_start >= (BlockNumber) tbinfo->relpages)
+                    chunk_tdinfo->endPage = InvalidBlockNumber; /* last chunk is for "all the rest" */
+
+                printfPQExpBuffer(chunk_desc, "TABLE DATA (pages %u:%u)", chunk_tdinfo->startPage, chunk_tdinfo->endPage);
+
+                te = ArchiveEntry(fout, chunk_tdinfo->dobj.catId, chunk_tdinfo->dobj.dumpId,
+                                  ARCHIVE_OPTS(.tag = tbinfo->dobj.name,
+                                               .namespace = tbinfo->dobj.namespace->dobj.name,
+                                               .owner = tbinfo->rolname,
+                                               .description = chunk_desc->data,
+                                               .section = SECTION_DATA,
+                                               .createStmt = tdDefn,
+                                               .copyStmt = copyStmt,
+                                               .deps = &(tbinfo->dobj.dumpId),
+                                               .nDeps = 1,
+                                               .dumpFn = dumpFn,
+                                               .dumpArg = chunk_tdinfo));
+
+                if (chunk_tdinfo->endPage == InvalidBlockNumber)
+                    te->dataLength = (BlockNumber) tbinfo->relpages - chunk_tdinfo->startPage;
+                else
+                    te->dataLength = dopt->max_table_segment_pages;
+                /* assume TOAST pages are distributed evenly among the chunks */
+                if (tbinfo->relpages)
+                    te->dataLength += te->dataLength * tbinfo->toastpages / tbinfo->relpages;
+            }
+
+            destroyPQExpBuffer(chunk_desc);
+        }
+        /*
+         * If pgoff_t is only 32 bits wide, the above refinement is useless,
+         * and instead we'd better worry about integer overflow.  Clamp to
+         * INT_MAX if the correct result exceeds that.
+         */
         if (sizeof(te->dataLength) == 4 &&
             (tbinfo->relpages < 0 || tbinfo->toastpages < 0 ||
-             te->dataLength < 0))
+            te->dataLength < 0))
             te->dataLength = INT_MAX;
     }
@@ -3092,6 +3182,9 @@ makeTableDataInfo(DumpOptions *dopt, TableInfo *tbinfo)
     tdinfo->dobj.namespace = tbinfo->dobj.namespace;
     tdinfo->tdtable = tbinfo;
     tdinfo->filtercond = NULL;  /* might get set later */
+    tdinfo->is_segment = false; /* we could use (tdinfo->startPage != 0 || tdinfo->endPage != InvalidBlockNumber) */
+    tdinfo->startPage = 0;
+    tdinfo->endPage = InvalidBlockNumber;
     addObjectDependency(&tdinfo->dobj, tbinfo->dobj.dumpId);
 
     /* A TableDataInfo contains data, of course */
@@ -7254,8 +7347,15 @@ getTables(Archive *fout, int *numTables)
                          "c.relnamespace, c.relkind, c.reltype, "
                          "c.relowner, "
                          "c.relchecks, "
-                         "c.relhasindex, c.relhasrules, c.relpages, "
-                         "c.reltuples, c.relallvisible, ");
+                         "c.relhasindex, c.relhasrules, ");
+
+    /* fetch the current relation size if chunking is requested */
+    if (dopt->max_table_segment_pages != InvalidBlockNumber)
+        appendPQExpBufferStr(query, "pg_relation_size(c.oid)/current_setting('block_size')::int AS relpages, ");
+    else
+        appendPQExpBufferStr(query, "c.relpages, ");
+
+    appendPQExpBufferStr(query, "c.reltuples, c.relallvisible, ");
 
     if (fout->remoteVersion >= 180000)
         appendPQExpBufferStr(query, "c.relallfrozen, ");
diff --git a/src/bin/pg_dump/pg_dump.h b/src/bin/pg_dump/pg_dump.h
index 4c4b14e5fc7..e362253d4d5 100644
--- a/src/bin/pg_dump/pg_dump.h
+++ b/src/bin/pg_dump/pg_dump.h
@@ -16,6 +16,7 @@
 
 #include "pg_backup.h"
 #include "catalog/pg_publication_d.h"
+#include "storage/block.h"
 
 #define oidcmp(x,y) ( ((x) < (y) ? -1 : ((x) > (y)) ? 1 : 0) )
 
@@ -413,6 +414,12 @@ typedef struct _tableDataInfo
     DumpableObject dobj;
     TableInfo  *tdtable;        /* link to table to dump */
     char       *filtercond;     /* WHERE condition to limit rows dumped */
+    bool        is_segment;     /* true if this is a data segment.
+                                 * we could use (tdinfo->startPage != 0 ||
+                                 * tdinfo->endPage != InvalidBlockNumber) */
+    BlockNumber startPage;      /* starting table page */
+    BlockNumber endPage;        /* ending table page for a page-range dump,
+                                 * normally startPage + max_table_segment_pages - 1 */
 } TableDataInfo;
 
 typedef struct _indxInfo
diff --git a/src/bin/pg_dump/t/004_pg_dump_parallel.pl b/src/bin/pg_dump/t/004_pg_dump_parallel.pl
index 738f34b1c1b..88af25d2889 100644
--- a/src/bin/pg_dump/t/004_pg_dump_parallel.pl
+++ b/src/bin/pg_dump/t/004_pg_dump_parallel.pl
@@ -11,6 +11,7 @@ use Test::More;
 my $dbname1 = 'regression_src';
 my $dbname2 = 'regression_dest1';
 my $dbname3 = 'regression_dest2';
+my $dbname4 = 'regression_dest3';
 
 my $node = PostgreSQL::Test::Cluster->new('main');
 $node->init;
@@ -21,6 +22,7 @@ my $backupdir = $node->backup_dir;
 $node->run_log([ 'createdb', $dbname1 ]);
 $node->run_log([ 'createdb', $dbname2 ]);
 $node->run_log([ 'createdb', $dbname3 ]);
+$node->run_log([ 'createdb', $dbname4 ]);
 
 $node->safe_psql(
 	$dbname1,
@@ -44,6 +46,18 @@ create table tht_p1 partition of tht for values with (modulus 3, remainder 0);
 create table tht_p2 partition of tht for values with (modulus 3, remainder 1);
 create table tht_p3 partition of tht for values with (modulus 3, remainder 2);
 insert into tht select (x%10)::text::digit, x from generate_series(1,1000) x;
+
+-- raise a warning so the row count and data hash can be checked in the .log
+DO \$\$
+DECLARE
+  thash_rec RECORD;
+BEGIN
+  SELECT 'tplain', count(*), sum(hashtext(t::text)) as tablehash
+    INTO thash_rec
+    FROM tplain AS t;
+  RAISE WARNING 'thash: %', thash_rec;
+END;
+\$\$;
 });
 
 $node->command_ok(
@@ -87,4 +101,42 @@ $node->command_ok(
 	],
 	'parallel restore as inserts');
 
+$node->command_ok(
+	[
+		'pg_dump',
+		'--format' => 'directory',
+		'--max-table-segment-pages' => 2,
+		'--no-sync',
+		'--jobs' => 2,
+		'--file' => "$backupdir/dump3",
+		$node->connstr($dbname1),
+	],
+	'parallel dump with chunks of two heap pages');
+
+$node->command_ok(
+	[
+		'pg_restore', '--verbose',
+		'--dbname' => $node->connstr($dbname4),
+		'--jobs' => 3,
+		"$backupdir/dump3",
+	],
+	'parallel restore with chunks of two heap pages');
+
+$node->safe_psql(
+	$dbname4,
+	qq{
+
+-- raise a warning so the row count and data hash can be checked in the .log
+DO \$\$
+DECLARE
+  thash_rec RECORD;
+BEGIN
+  SELECT 'tplain', count(*), sum(hashtext(t::text)) as tablehash
+    INTO thash_rec
+    FROM tplain AS t;
+  RAISE WARNING 'thash after parallel chunked restore: %', thash_rec;
+END;
+\$\$;
+	});
+
 done_testing();
diff --git a/src/fe_utils/option_utils.c b/src/fe_utils/option_utils.c
index cc483ae176c..958f47711aa 100644
--- a/src/fe_utils/option_utils.c
+++ b/src/fe_utils/option_utils.c
@@ -83,6 +83,63 @@ option_parse_int(const char *optarg, const char *optname,
 	return true;
 }
 
+/*
+ * option_parse_uint32
+ *
+ * Parse an unsigned 32-bit integer value for an option.  If the parsing is
+ * successful, returns true and stores the result in *result if that's given;
+ * if parsing fails, returns false.
+ */
+bool
+option_parse_uint32(const char *optarg, const char *optname,
+                    uint64 min_range, uint64 max_range,
+                    uint32 *result)
+{
+    const char *chkptr;
+    char       *endptr;
+    uint64      val64;
+
+    /* check there is no minus sign in the value, because strtoull()
+     * would silently wrap negative numbers around to huge unsigned values */
+    for (chkptr = optarg; *chkptr != '\0'; chkptr++)
+        if (*chkptr == '-')
+        {
+            pg_log_error("value \"%s\" for option %s cannot be negative",
+                         optarg, optname);
+            return false;
+        }
+
+    errno = 0;
+    val64 = strtoull(optarg, &endptr, 10);
+
+    /*
+     * Skip any trailing whitespace; if anything but whitespace remains before
+     * the terminating character, fail.
+     */
+    while (*endptr != '\0' && isspace((unsigned char) *endptr))
+        endptr++;
+
+    if (*endptr != '\0')
+    {
+        pg_log_error("invalid value \"%s\" for option %s",
+                     optarg, optname);
+        return false;
+    }
+
+    if (errno == ERANGE || val64 < min_range || val64 > max_range)
+    {
+        pg_log_error("%s must be in range %llu..%llu",
+                     optname,
+                     (unsigned long long) min_range,
+                     (unsigned long long) max_range);
+        return false;
+    }
+
+    if (result)
+        *result = (uint32) val64;
+    return true;
+}
+
 /*
  * Provide strictly harmonized handling of the --sync-method option.
  */
diff --git a/src/include/fe_utils/option_utils.h b/src/include/fe_utils/option_utils.h
index 0db6e3b6e91..268590a18bd 100644
--- a/src/include/fe_utils/option_utils.h
+++ b/src/include/fe_utils/option_utils.h
@@ -22,6 +22,9 @@ extern void handle_help_version_opts(int argc, char *argv[],
 extern bool option_parse_int(const char *optarg, const char *optname,
                              int min_range, int max_range,
                              int *result);
+extern bool option_parse_uint32(const char *optarg, const char *optname,
+                                uint64 min_range, uint64 max_range,
+                                uint32 *result);
 extern bool parse_sync_method(const char *optarg,
                               DataDirSyncMethod *sync_method);
 
-- 
2.43.0
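For reference, the exact page count that getTables() switches to when --max-table-segment-pages is given, and the consistency check the new TAP test logs, can both be tried interactively. The queries below are only rough stand-alone equivalents of what the patch embeds in its catalog query and in the test's DO block; the pg_am join mirrors the amname = "heap" restriction, and tplain is the table created by the test:

    -- exact page count instead of the planner's relpages estimate
    select c.oid::regclass as table_name,
           pg_relation_size(c.oid) / current_setting('block_size')::int as exact_pages,
           c.relpages as estimated_pages
      from pg_class c
           join pg_am am on am.oid = c.relam
     where c.relkind = 'r' and am.amname = 'heap';

    -- row count and data hash, as logged for the source and the chunk-restored database
    select count(*), sum(hashtext(t::text)) as tablehash from tplain as t;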
