Hello, > Are you not getting something informative on your display and/or sending stderr to somewher else?
- in case -X fetch no informative output - in case -X stream following: pg_basebackup: could not wait for child process: No child processes Now after re-test directly from command-line it works. It fails just when launched from a script fired by crm_mon -d -E my-script Regards, Andrej 2016-04-16 1:18 GMT+02:00 Jerry Sievers <gsiever...@comcast.net>: > Andrej Vanek <andrej.vanek...@gmail.com> writes: > > > Hello, > > > > I tried to run pg_basebackup. Return value is 1. > > > > How to find out its reason? > > (I suspect that some wal after backup is missing- but how to find > out the real reason? How to fix it?) > > Well there are more than 80 cases of exit 1 in a couple .c files and > most/all are preceeded unsurprisingly with an fprintf(stderr. > > Are you not getting something informative on your display and/or sending > stderr to somewher else? > > $ echo $* > ./src/bin/pg_basebackup/pg_basebackup.c > ./src/backend/replication/basebackup.c > > $ grep -h -B 4 -i 'exit.*(1' $* | egrep -hi 'fprintf\(stderr|exit' > fprintf(stderr, _("%s: directory name too > long\n"), progname); > exit(1); > fprintf(stderr, _("%s: multiple \"=\" > signs in tablespace mapping\n"), progname); > exit(1); > fprintf(stderr, > exit(1); > fprintf(stderr, _("%s: old directory is not an absolute > path in tablespace mapping: %s\n"), > exit(1); > fprintf(stderr, _("%s: new directory is not an absolute > path in tablespace mapping: %s\n"), > exit(1); > fprintf(stderr, _("%s: could not read from > ready pipe: %s\n"), > exit(1); > fprintf(stderr, > exit(1); > fprintf(stderr, > disconnect_and_exit(1); > fprintf(stderr, > disconnect_and_exit(1); > exit(1); > fprintf(stderr, > disconnect_and_exit(1); > fprintf(stderr, _("%s: could not create background > process: %s\n"), > disconnect_and_exit(1); > fprintf(stderr, _("%s: could not create background thread: > %s\n"), > disconnect_and_exit(1); > fprintf(stderr, > disconnect_and_exit(1); > fprintf(stderr, > disconnect_and_exit(1); > fprintf(stderr, _("%s: could not access directory > \"%s\": %s\n"), > disconnect_and_exit(1); > fprintf(stderr, > exit(1); > fprintf(stderr, > exit(1); > fprintf(stderr, _("%s: transfer rate must be greater than > zero\n"), > exit(1); > fprintf(stderr, > exit(1); > fprintf(stderr, > exit(1); > fprintf(stderr, > exit(1); > fprintf(stderr, > disconnect_and_exit(1); > fprintf(stderr, _("%s: could not write to file > \"%s\": %s\n"), > disconnect_and_exit(1); > fprintf(stderr, > disconnect_and_exit(1); > fprintf(stderr, > disconnect_and_exit(1); > fprintf(stderr, > disconnect_and_exit(1); > fprintf(stderr, > disconnect_and_exit(1); > fprintf(stderr, _("%s: could not create file > \"%s\": %s\n"), > disconnect_and_exit(1); > fprintf(stderr, _("%s: could not get COPY data stream: > %s"), > disconnect_and_exit(1); > fprintf(stderr, > disconnect_and_exit(1); > fprintf(stderr, > disconnect_and_exit(1); > fprintf(stderr, _("%s: could not read COPY data: > %s"), > disconnect_and_exit(1); > fprintf(stderr, _("%s: could not get COPY data stream: > %s"), > disconnect_and_exit(1); > fprintf(stderr, _("%s: could not read COPY data: > %s"), > disconnect_and_exit(1); > fprintf(stderr, _("%s: invalid tar block > header size: %d\n"), > disconnect_and_exit(1); > fprintf(stderr, > > disconnect_and_exit(1); > fprintf(stderr, > disconnect_and_exit(1); > fprintf(stderr, > disconnect_and_exit(1); > fprintf(stderr, _("%s: could not create > file \"%s\": %s\n"), > disconnect_and_exit(1); > fprintf(stderr, _("%s: could not write to > file \"%s\": %s\n"), > disconnect_and_exit(1); > fprintf(stderr, > disconnect_and_exit(1); > fprintf(stderr, _("%s: out of memory\n"), progname); > exit(1); > fprintf(stderr, _("%s: out of memory\n"), progname); > disconnect_and_exit(1); > fprintf(stderr, _("%s: out of memory\n"), progname); > disconnect_and_exit(1); > fprintf(stderr, _("%s: out of memory\n"), progname); > disconnect_and_exit(1); > fprintf(stderr, _("%s: could not create file \"%s\": > %s\n"), progname, filename, strerror(errno)); > disconnect_and_exit(1); > fprintf(stderr, > disconnect_and_exit(1); > exit(1); > fprintf(stderr, _("%s: incompatible server version %s\n"), > disconnect_and_exit(1); > disconnect_and_exit(1); > disconnect_and_exit(1); > fprintf(stderr, _("%s: could not send replication command > \"%s\": %s"), > disconnect_and_exit(1); > fprintf(stderr, _("%s: could not initiate base backup: > %s"), > disconnect_and_exit(1); > fprintf(stderr, > disconnect_and_exit(1); > fprintf(stderr, _("%s: could not get backup header: %s"), > disconnect_and_exit(1); > fprintf(stderr, _("%s: no data returned from server\n"), > progname); > disconnect_and_exit(1); > fprintf(stderr, > disconnect_and_exit(1); > fprintf(stderr, > disconnect_and_exit(1); > fprintf(stderr, > disconnect_and_exit(1); > fprintf(stderr, _("%s: final receive failed: %s"), > disconnect_and_exit(1); > fprintf(stderr, > disconnect_and_exit(1); > fprintf(stderr, _("%s: could not wait for child > process: %s\n"), > disconnect_and_exit(1); > fprintf(stderr, _("%s: child %d died, expected > %d\n"), > disconnect_and_exit(1); > if (!WIFEXITED(status)) > fprintf(stderr, _("%s: child process did not exit > normally\n"), > disconnect_and_exit(1); > if (WEXITSTATUS(status) != 0) > fprintf(stderr, _("%s: child process exited with > error %d\n"), > progname, WEXITSTATUS(status)); > disconnect_and_exit(1); > fprintf(stderr, > disconnect_and_exit(1); > fprintf(stderr, _("%s: could not wait for child > thread: %s\n"), > disconnect_and_exit(1); > fprintf(stderr, _("%s: could not get child thread > exit status: %s\n"), > disconnect_and_exit(1); > fprintf(stderr, _("%s: child thread exited with > error %u\n"), > disconnect_and_exit(1); > fprintf(stderr, > exit(1); > fprintf(stderr, > exit(1); > fprintf(stderr, > exit(1); > fprintf(stderr, > exit(1); > fprintf(stderr, _("%s: invalid > compression level \"%s\"\n"), > exit(1); > fprintf(stderr, _("%s: invalid > checkpoint argument \"%s\", must be \"fast\" or \"spread\"\n"), > exit(1); > fprintf(stderr, _("%s: invalid > status interval \"%s\"\n"), > exit(1); > fprintf(stderr, _("Try \"%s --help\" for > more information.\n"), > exit(1); > fprintf(stderr, _("Try \"%s --help\" for more > information.\n"), > exit(1); > fprintf(stderr, _("%s: no target directory specified\n"), > progname); > fprintf(stderr, _("Try \"%s --help\" for more > information.\n"), > exit(1); > fprintf(stderr, _("Try \"%s --help\" for more > information.\n"), > exit(1); > fprintf(stderr, _("Try \"%s --help\" for more > information.\n"), > exit(1); > fprintf(stderr, _("Try \"%s --help\" for more > information.\n"), > exit(1); > fprintf(stderr, _("%s: transaction log directory > location must be " > fprintf(stderr, _("Try \"%s --help\" for more > information.\n"), > exit(1); > fprintf(stderr, > exit(1); > fprintf(stderr, _("%s: could not create symbolic > link \"%s\": %s\n"), > exit(1); > fprintf(stderr, _("%s: symlinks are not supported on this > platform\n")); > exit(1); > > > > > > thanks, Andrej > > --------------details: > > environment: CentOS 6.7, postgres 9.5.1 > > ( PostgreSQL 9.5.1 on x86_64-pc-linux-gnu, compiled by gcc (GCC) 4.4.7 > 20120313 (Red Hat 4.4.7-16), 64-bit) > > > > I tried 2 forms of pg_basebackup (-X fetch and -X stream). Both were > issued from a script: > > # su - postgres -c "/usr/pgsql-9.5/bin/pg_basebackup -h ${DB_MASTER_IP} > -D ${GEO_STDBY_DATA} -U pgreplic -P -v -X fetch" 2>${LOG_FILE}.stderr >> > ${LOG_FILE} > > # echo $? > > 1 <--------------pg_basebackup failed! > > # cat log.stderr > > # cat /var/log/cluster/geo_repair.log.err > > transaction log start point: 0/E3000028 on timeline 1 > > WARNING: skipping special file "./pg_hba.conf" > > WARNING: skipping special file "./pg_hba.conf.save" > > transaction log end point: 0/E30000F8 > > pg_basebackup: base backup completed <------------------no > reason for pg_basebackup failure! > > # cp /tmp/pg_hba.conf /tmp/postgresql.conf /pg_data/ > > # su - postgres -c "/usr/pgsql-9.5/bin/pg_ctl -D /pg_data/ start" > > # tail /pg_data/pg_log/postgresql-Fri.log > > `pg_xlog/0000000100000000000000E2' -> > `../backups/arc/0000000100000000000000E2' > > 2016-04-15 23:15:10 CEST:pgreplic@[unknown]:[10667] WARNING: skipping > special file "./pg_hba.conf" > > 2016-04-15 23:15:10 CEST:pgreplic@[unknown]:[10667] WARNING: skipping > special file "./pg_hba.conf.save" <---------------recorded in > pg_log on master node and > > copied by pg_basebackup (note time difference between two servers) > > 2016-04-15 23:15:02 CEST:@:[23321] LOG: database system was > interrupted; last known up at 2016-04-15 23:15:10 CEST > > 2016-04-15 23:15:02 CEST:postgres@postgres:[23329] FATAL: the database > system is starting up > > 2016-04-15 23:15:03 CEST:@:[23321] LOG: entering standby mode > > 2016-04-15 23:15:03 CEST:@:[23321] LOG: database system was not > properly shut down; automatic recovery in progress <---------something > missing from pg_basebackup > > 2016-04-15 23:15:03 CEST:@:[23321] LOG: redo starts at 0/E3000028 > > 2016-04-15 23:15:03 CEST:@:[23321] LOG: consistent recovery state > reached at 0/E4000000 > > 2016-04-15 23:15:03 CEST:@:[23295] LOG: database system is ready to > accept read only connections > > 2016-04-15 23:15:03 CEST:@:[23356] LOG: started streaming WAL from > primary at 0/E4000000 on timeline 1 > > -------second trial > > # su - postgres -c "/usr/pgsql-9.5/bin/pg_basebackup -h ${DB_MASTER_IP} > -D ${GEO_STDBY_DATA} -U pgreplic -P -v -X stream" > > # echo $? > > 1 > > # cat /var/log/cluster/geo_repair.log.err > > transaction log start point: 0/E5000028 on timeline 1 > > pg_basebackup: starting background WAL receiver > > WARNING: skipping special file "./pg_hba.conf" > > WARNING: skipping special file "./pg_hba.conf.save" > > transaction log end point: 0/E50000F8 > > pg_basebackup: waiting for background process to finish streaming ... > > pg_basebackup: could not wait for child process: No child processes > <----what does this mean? I think it failed to start process to fetching > wal logs created > > during backup: but neither on master node neither on pg_basebackup > output here is any information about reason.. (max_wal_senders on master is > 10: I see no reason to > > fail). > > > > postgres logs: > > `pg_xlog/0000000100000000000000E4' -> > `../backups/arc/0000000100000000000000E4' > > 2016-04-15 23:35:09 CEST:pgreplic@[unknown]:[29035] WARNING: skipping > special file "./pg_hba.conf" > > 2016-04-15 23:35:09 CEST:pgreplic@[unknown]:[29035] WARNING: skipping > special file "./pg_hba.conf.save" > > 2016-04-15 23:35:01 CEST:@:[28926] LOG: database system was > interrupted; last known up at 2016-04-15 23:35:09 CEST > > 2016-04-15 23:35:01 CEST:postgres@postgres:[28938] FATAL: the database > system is starting up > > 2016-04-15 23:35:02 CEST:@:[28926] LOG: entering standby mode > > 2016-04-15 23:35:02 CEST:@:[28926] LOG: database system was not > properly shut down; automatic recovery in progress <------------this means > something missing from > > pg_basebackup > > 2016-04-15 23:35:02 CEST:@:[28926] LOG: redo starts at 0/E5000028 > > 2016-04-15 23:35:02 CEST:@:[28926] LOG: consistent recovery state > reached at 0/E6000000 > > 2016-04-15 23:35:02 CEST:@:[28904] LOG: database system is ready to > accept read only connections > > 2016-04-15 23:35:02 CEST:@:[28989] LOG: started streaming WAL from > primary at 0/E6000000 on timeline 1 > > > > postgres params on master node: > > log_line_prefix = '%t:%u@%d:[%p] ' > > logging_collector = on > > wal_buffers = 16MB > > max_wal_size = 200MB > > log_temp_files = 1MB > > max_connections = 170 > > shared_buffers = 512MB > > effective_cache_size = 1500MB > > work_mem = 48MB > > log_lock_waits = on > > log_min_duration_statement = 10000 > > shared_preload_libraries = 'pg_stat_statements' > > include '/var/lib/pgsql/tmp/rep_mode.conf' # added by pgsql RA > > wal_level = hot_standby > > archive_mode = on > > max_wal_senders = 10 > > hot_standby = on > > wal_keep_segments = 128 > > archive_command = '/opt/postgres/dbconf/archive_command.sh %p %f' > > wal_receiver_status_interval = 2 > > max_standby_streaming_delay = -1 > > max_standby_archive_delay = -1 > > restart_after_crash = off > > hot_standby_feedback = on > > > > -- > Jerry Sievers > Postgres DBA/Development Consulting > e: postgres.consult...@comcast.net > p: 312.241.7800 >