Hi George,

Thank you for attending the meeting in Kyoto. As we discussed
there, my colleague has run into a datatype problem.

See attached create_resized.c. It creates a datatype with an
LB marker using MPI_Type_create_struct and MPI_Type_create_resized.
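
The relevant part of the attached program is (excerpt):

  struct structure {
      double not_transfered;
      double transfered_1;
      double transfered_2;
  };
  ...
  /* disps[0] = 8 (transfered_1), disps[1] = 16 (transfered_2) */
  MPI_Type_create_struct(2, blocklens, disps, types, &temp_type);
  MPI_Type_create_resized(temp_type, 0, sizeof(data[0]), &struct_type);
  MPI_Type_commit(&struct_type);

The resized type has lb = 0 but its first byte of actual data is at
offset 8 (true_lb = 8), i.e. an LB marker in front of the data.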

The expected contents of the output file (received_data) are:
--------------------------------
0: t1 = 0.1, t2 = 0.2
1: t1 = 1.1, t2 = 1.2
2: t1 = 2.1, t2 = 2.2
3: t1 = 3.1, t2 = 3.2
4: t1 = 4.1, t2 = 4.2
... snip ...
1995: t1 = 1995.1, t2 = 1995.2
1996: t1 = 1996.1, t2 = 1996.2
1997: t1 = 1997.1, t2 = 1997.2
1998: t1 = 1998.1, t2 = 1998.2
1999: t1 = 1999.1, t2 = 1999.2
--------------------------------

But if you run the program many times with multiple BTL modules,
each configured with a small eager_limit and a small max_send_size,
on some runs you'll see:
--------------------------------
0: t1 = 0.1, t2 = 0.2
1: t1 = 1.1, t2 = 1.2
2: t1 = 2.1, t2 = 2.2
3: t1 = 3.1, t2 = 3.2
4: t1 = 4.1, t2 = 4.2
... snip ...
470: t1 = 470.1, t2 = 470.2
471: t1 = 471.1, t2 = 471.2
472: t1 = 472.1, t2 = 472.2
473: t1 = 473.1, t2 = 473.2
474: t1 = 474.1, t2 = 0        <-- broken!
475: t1 = 0, t2 = 475.1
476: t1 = 0, t2 = 476.1
477: t1 = 0, t2 = 477.1
... snip ...
1995: t1 = 0, t2 = 1995.1
1996: t1 = 0, t2 = 1996.1
1997: t1 = 0, t2 = 1997.1
1998: t1 = 0, t2 = 1998.1
1999: t1 = 0, t2 = 1999.1
--------------------------------

The array index at which the data starts to break (474 in the case
above) may change from run to run.
The same result appears on both trunk and v1.8.3.

You can reproduce this with the following options if you have
multiple IB HCAs.

  -n 2
  --mca btl self,openib
  --mca btl_openib_eager_limit 256
  --mca btl_openib_max_send_size 384
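
For example, assuming the reproducer binary is named create_resized
(just an illustrative name), the full command line would be:

  mpirun -n 2 \
         --mca btl self,openib \
         --mca btl_openib_eager_limit 256 \
         --mca btl_openib_max_send_size 384 \
         ./create_resized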

Or, if you don't have multiple NICs, you can use the following options.

  -n 2
  --host localhost
  --mca btl self,sm,vader
  --mca btl_vader_exclusivity 65536
  --mca btl_vader_eager_limit 256
  --mca btl_vader_max_send_size 384
  --mca btl_sm_exclusivity 65536
  --mca btl_sm_eager_limit 256
  --mca btl_sm_max_send_size 384
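
Again assuming the binary is named create_resized, the corresponding
single-node command line would be:

  mpirun -n 2 --host localhost \
         --mca btl self,sm,vader \
         --mca btl_vader_exclusivity 65536 \
         --mca btl_vader_eager_limit 256 \
         --mca btl_vader_max_send_size 384 \
         --mca btl_sm_exclusivity 65536 \
         --mca btl_sm_eager_limit 256 \
         --mca btl_sm_max_send_size 384 \
         ./create_resized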

My colleague found that the OPAL convertor on the receiving process
seems to add the LB value twice when computing the receive-buffer
write offset for fragments that arrive out of order.
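
As a concrete illustration of our reading (this is how we interpret
the symptom, not a verified analysis of the convertor code): the
struct is 24 bytes and true_lb is 8, so element 475's t1 should be
written at byte 475 * 24 + 8 of the receive buffer. If the LB is
applied a second time, it is written at 475 * 24 + 16 instead, which
is element 475's t2 slot -- exactly the 8-byte shift visible in the
broken output above.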

He created the patch below. Our program works fine with this patch,
but we don't know whether it is the correct fix. Could you take a
look at this issue?

Index: opal/datatype/opal_convertor.c
===================================================================
--- opal/datatype/opal_convertor.c      (revision 32807)
+++ opal/datatype/opal_convertor.c      (working copy)
@@ -362,11 +362,11 @@
     if( OPAL_LIKELY(0 == count) ) {
         pStack[1].type     = pElems->elem.common.type;
         pStack[1].count    = pElems->elem.count;
-        pStack[1].disp     = pElems->elem.disp;
+        pStack[1].disp     = 0;
     } else {
         pStack[1].type  = OPAL_DATATYPE_UINT1;
         pStack[1].count = pData->size - count;
-        pStack[1].disp  = pData->true_lb + count;
+        pStack[1].disp  = count;
     }
     pStack[1].index    = 0;  /* useless */


Best regards,
Takahiro Kawashima,
MPI development team,
Fujitsu
/* create_resized.c -- run with 2 processes (np=2) */

#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

struct structure {
    double not_transfered;
    double transfered_1;
    double transfered_2;
};

int main(int argc, char *argv[])
{
    int i, n = 2000, myrank;
    struct structure *data;
    MPI_Datatype struct_type, temp_type;
    MPI_Datatype types[2] = {MPI_DOUBLE, MPI_DOUBLE};
    int blocklens[2] = {1, 1};
    MPI_Aint disps[3];

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

    data = malloc(sizeof(data[0]) * n);

    if (myrank == 0) {
        for (i = 0; i < n; i++) {
            data[i].transfered_1 = i + 0.1;
            data[i].transfered_2 = i + 0.2;
        }
    }
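
    /* Describe only transfered_1 (offset 8) and transfered_2 (offset 16);
     * not_transfered is skipped.  After MPI_Type_create_resized with
     * lb = 0 and extent = sizeof(data[0]) = 24, the committed type has
     * lb = 0 but true_lb = 8, i.e. an LB marker in front of the data. */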

    MPI_Get_address(&data[0].transfered_1, &disps[0]);
    MPI_Get_address(&data[0].transfered_2, &disps[1]);
    MPI_Get_address(&data[0], &disps[2]);
    disps[1] -= disps[2]; /* 16 */
    disps[0] -= disps[2]; /*  8 */
    MPI_Type_create_struct(2, blocklens, disps, types, &temp_type);
    MPI_Type_create_resized(temp_type, 0, sizeof(data[0]), &struct_type);
    MPI_Type_commit(&struct_type);

    if (myrank == 0) {
        MPI_Send(data, n, struct_type, 1, 0, MPI_COMM_WORLD);
    } else if (myrank == 1) {
        MPI_Recv(data, n, struct_type, 0, 0, MPI_COMM_WORLD,
                 MPI_STATUS_IGNORE);
    }

    MPI_Type_free(&temp_type);
    MPI_Type_free(&struct_type);

    if (myrank == 1) {
        FILE *fp;
        fp = fopen("received_data", "w");
        for (i = 0; i < n; i++) {
            fprintf(fp, "%d: t1 = %g, t2 = %g\n",
                    i, data[i].transfered_1, data[i].transfered_2);
        }
        fclose(fp);
    }

    free(data);
    MPI_Finalize();

    return 0;
}