George B. et al., --Is it normal to top-post on this list? I am following your example, but other lists I am on prefer bottom-posting. --I attach the complete code of the andmsg program, as it is quite short (some bits removed for brevity and I have omitted my headers and startup function aninit() as they are probably irrelevant to the problem). The idea I had was to have each node in the main program send a SHUTDOWN_ANDMSG when it is ready to shut down; andmsg counts these and does not begin its own shutdown until the number received matches the number of nodes in the main program. Debugging shows that the number it is counting is correct. It then sends a message back to the main application so each node there waits to shut down until they are all ready to shut down. I developed this protocol with tiny test programs where just having everybody stop when finished led to similar MPI console messages. In my initial posting I did not show the code that counts these messages; that is why your matchup seems wrong. Also, I have since added code to the main nodes to do an MPI_Iprobe before stopping and report any unread messages. There are none. --I cannot really post the main program, as it is about 40,000 lines of C code, recently updated for parallel processing with MPI. When I have time I will try to make a short version for further testing. Thanks, George Reeke
On Mon, 2016-10-10 at 21:37 -0400, George Bosilca wrote: > George, > > > There is too much information missing from your example. If I try to > run the code on the top assuming the process is is_host(NC.node), I > have on NC.commd 3 communications (ignore the others): > > > rc = MPI_Send(&ival, 1, MPI_INT, NC.dmsgid, > SHUTDOWN_ANDMSG, NC.commd); > MPI_Recv(&ival, 1, MPI_INT, NC.dmsgid, CLOSING_ANDMSG, NC.commd, > MPI_STATUS_IGNORE); > rc = MPI_Send(&ival, 1, MPI_INT, NC.dmsgid, SHUTDOWN_ANDMSG, > NC.commd); } > > > > On the andmsg I can only see 2 matching communications: > > > rc = MPI_Send(&num2stop, 1, MPI_INT, NC.hostid, CLOSING_ANDMSG, > NC.commd); > rc = MPI_Recv(&sdmsg, 1, MPI_INT, NC.hostid, MPI_ANY_TAG, NC.commd, > MPI_STATUS_IGNORE); > > > > So either there is a pending send (which is treated as an eager by > OMPI because it is of length 4 bytes), or there is something missing > on the code example. Can you post a more complete example ? > > > Thanks, > George. > >
/* (c) Copyright 2016, The Rockefeller University */ /* $Id: andmsg.c 1 2009-12-29 23:04:48Z $ */ /*********************************************************************** * andmsg.c * * * * EDITED VERSION FOR POSTING * * * * This is the main program spawned by a parallel program using the * * MPI library to receive "out-of-stream" messages and act upon them. * * * * This program should receive (as ASCII strings) two command-line * * arguments from aninit in the parent (spawning) process: * * (1) The number of SHUTDOWN_ANDMSG messages needed to quit * * (2) Any debug codes relevant to this process * *----------------------------------------------------------------------* * V1A, 07/06/16, GNR - New program * * ==>, 08/10/16, GNR - Last mod before committing to svn repository * ***********************************************************************/ #define MAIN #include <stddef.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <unistd.h> #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> #include "sysdef.h" #include "mpitools.h" #include "swap.h" #define ANDDBG int main(int argc, char *argv[]) { #ifdef ANDDBG FILE *andlog; /* Debug log file */ #endif struct { /* Struct for receiving messages */ /* This inner struct is used by the main application to * encode information for error messages going to andmsg. */ struct ErrMsg Err; char msg[MAX_EMSG_LENGTH]; } emsg; MPI_Status mstat; int msrc, mtag; int sdmsg; int num2stop; /* Startup */ /* This routine makes inquiries of the MPI library to fill * in the communicators, numbers of nodes, etc. in the NC * common structure. The header mpitools.h defines this * structure. 
*/ aninit(0, 0, 0); MPI_Comm_get_parent(&NC.commd); /* Interpret command-line arguments */ if (argc > 1) NC.debug = atoi(argv[2]); if (argc > 0) num2stop = atoi(argv[1]); if (NC.debug & DBG_START) { /* ssprintf is my own version of sprintf with some mods */ fputs(ssprintf(NULL,"andmsg started w/num2stop = %d, " "NC.debug = 0x%x\n", num2stop, NC.debug), stderr); fflush(stderr); } #ifdef ANDDBG andlog = fopen("/var/tmp/andmsg.debug.log", "w"); #endif /* Loop waiting for messages */ while (1) { /* Terminate if all nodes have finished */ if (num2stop <= 0) { int rc; #ifdef ANDDBG fputs("Andmsg got num2stop == 0\n", andlog); fflush(andlog); #endif /* Without this last ack, kept getting uninterpretable errors * when some node called MPI_Finalize. These errors did not * happen if all nodes sat on a DBG_NOXIT and were stepped * through MPI_Finalize() individually. -GNR */ rc = MPI_Send(&num2stop, 1, MPI_INT, NC.hostid, CLOSING_ANDMSG, NC.commd); #ifdef ANDDBG fputs(ssprintf(NULL, "Andmsg sent CLOSING, rc = %d\n", rc), andlog); fflush(andlog); #endif rc = MPI_Recv(&sdmsg, 1, MPI_INT, NC.hostid, MPI_ANY_TAG, NC.commd, MPI_STATUS_IGNORE); #ifdef ANDDBG fputs(ssprintf(NULL, "Andmsg recvd 2nd CLOSING, rc = %d\n", rc), andlog); fflush(andlog); #endif sleep(1); rc = MPI_Comm_disconnect(&NC.commd); #ifdef ANDDBG fputs(ssprintf(NULL, "Andmsg disconnected, rc = %d\n", rc), andlog); fflush(andlog); #endif rc = MPI_Finalize(); #ifdef ANDDBG fputs(ssprintf(NULL, "Andmsg finalized, rc = %d\n", rc), andlog); fflush(andlog); fclose(andlog); #endif exit(0); } /* Block until a message arrives */ MPI_Probe(MPI_ANY_SOURCE, MPI_ANY_TAG, NC.commd, &mstat); msrc = mstat.MPI_SOURCE, mtag = mstat.MPI_TAG; switch (mstat.MPI_TAG) { /* Handle debug output */ case DEBUG_MSG: MPI_Recv(emsg.msg, MAX_EMSG_LENGTH, MPI_UNSIGNED_CHAR, msrc, mtag, NC.commd, &mstat); fputs(ssprintf(NULL, "DBG MSG from Node %d: ", mstat.MPI_SOURCE), stderr); emsg.msg[MAX_EMSG_LENGTH-1] = '\0'; /* JIC */ fputs(emsg.msg, 
stderr); fputs("\n", stderr); fflush(stderr); break; /* Handle terminal messages */ case INIT_ABORT_MSG: /* Code for this case was removed here from posting version * for brevity. This switch case is not exercised in the problem that was posted. */ /* Clean termination, just shut down */ case SHUTDOWN_ANDMSG: /* Clear the message or the probe keeps returning */ MPI_Recv(&sdmsg, 1, MPI_INT, msrc, mtag, NC.commd, MPI_STATUS_IGNORE); num2stop -= 1; break; /* Unidentified tag, read and print message anyway */ default: MPI_Recv(emsg.msg, MAX_EMSG_LENGTH, MPI_UNSIGNED_CHAR, msrc, mtag, NC.commd, &mstat); fputs(ssprintf(NULL, "UNIDENTIFIED MSG from Node %d, Tag %d IGNORED\n", mstat.MPI_SOURCE, mstat.MPI_TAG), stderr); fflush(stderr); break; } /* End msgtype switch */ } /* End main wait loop */ } /* End andmsg() */
_______________________________________________ users mailing list users@lists.open-mpi.org https://rfd.newmexicoconsortium.org/mailman/listinfo/users