Index: configure.ac
===================================================================
--- configure.ac	(revision 16431)
+++ configure.ac	(working copy)
@@ -816,6 +816,24 @@

 OMPI_CHECK_BROKEN_QSORT

+AC_CACHE_CHECK([if word-sized integers must be word-size aligned],
+    [ompi_cv_c_word_size_align],
+    [AC_LANG_PUSH(C)
+     AC_RUN_IFELSE([AC_LANG_PROGRAM([dnl
+#include <stdlib.h>], [[    long data[2] = {0, 0};
+    long *lp;
+    int *ip;
+    ip = (int*) data;
+    ip++;
+    lp = (long*) ip;
+    return lp[0]; ]])],
+        [ompi_cv_c_word_size_align=no],
+        [ompi_cv_c_word_size_align=yes],
+        [ompi_cv_c_word_size_align=yes])])
+AS_IF([test $ompi_cv_c_word_size_align = yes], [results=1], [results=0])
+AC_DEFINE_UNQUOTED([OMPI_ALIGN_WORD_SIZE_INTEGERS], [$results],
+    [set to 1 if word-size integers must be aligned to word-size padding to prevent bus errors])
+
 # all: SYSV semaphores
 # all: SYSV shared memory
 # all: size of FD_SET
Index: ompi/mca/btl/gm/btl_gm.c
===================================================================
--- ompi/mca/btl/gm/btl_gm.c	(revision 16431)
+++ ompi/mca/btl/gm/btl_gm.c	(working copy)
@@ -716,7 +716,7 @@
     /* post the put descriptor */
     gm_put(gm_btl->port,
         des->des_src->seg_addr.pval,
-        des->des_dst->seg_addr.lval,
+        des->des_dst->seg_addr.pval,
         des->des_src->seg_len,
         GM_LOW_PRIORITY,
         endpoint->endpoint_addr.node_id,
@@ -758,7 +758,7 @@
     /* post the put descriptor */
     gm_put(gm_btl->port,
         des->des_src->seg_addr.pval,
-        des->des_dst->seg_addr.lval,
+        des->des_dst->seg_addr.pval,
         des->des_src->seg_len,
         GM_LOW_PRIORITY,
         endpoint->endpoint_addr.node_id,
@@ -861,7 +861,7 @@

     /* post get put descriptor */
     gm_get(gm_btl->port,
-        des->des_dst->seg_addr.lval,
+        des->des_dst->seg_addr.pval,
         des->des_src->seg_addr.pval,
         des->des_src->seg_len,
         GM_LOW_PRIORITY,
@@ -904,7 +904,7 @@

     /* post get put descriptor */
     gm_get(gm_btl->port,
-        des->des_dst->seg_addr.lval,
+        des->des_dst->seg_addr.pval,
         des->des_src->seg_addr.pval,
         des->des_src->seg_len,
         GM_LOW_PRIORITY,
Index: ompi/mca/pml/ob1/pml_ob1_recvfrag.c
===================================================================
--- ompi/mca/pml/ob1/pml_ob1_recvfrag.c	(revision 16431)
+++ ompi/mca/pml/ob1/pml_ob1_recvfrag.c	(working copy)
@@ -122,8 +122,7 @@
                 MCA_PML_OB1_ACK_HDR_NTOH(hdr->hdr_ack);
             }
 #endif
-            sendreq = (mca_pml_ob1_send_request_t*)
-                hdr->hdr_ack.hdr_src_req.pval;
+            sendreq = (mca_pml_ob1_send_request_t*)hdr->hdr_ack.hdr_src_req.pval;
             sendreq->req_recv = hdr->hdr_ack.hdr_dst_req;
             sendreq->req_rdma_offset = (size_t)hdr->hdr_ack.hdr_rdma_offset;
             if(OPAL_THREAD_ADD32(&sendreq->req_state, 1) == 2 &&
@@ -144,8 +143,7 @@
                 MCA_PML_OB1_FRAG_HDR_NTOH(hdr->hdr_frag);
             }
 #endif
-            recvreq = (mca_pml_ob1_recv_request_t*)
-                hdr->hdr_frag.hdr_dst_req.pval;
+            recvreq = (mca_pml_ob1_recv_request_t*)hdr->hdr_frag.hdr_dst_req.pval;
             mca_pml_ob1_recv_request_progress(recvreq,btl,segments,des->des_dst_cnt);
             break;
         }
@@ -158,8 +156,7 @@
                we remember if we ever change the bml. */
             assert(0 == (hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NBO));
 #endif
-            sendreq = (mca_pml_ob1_send_request_t*)
-                hdr->hdr_rdma.hdr_req.pval;
+            sendreq = (mca_pml_ob1_send_request_t*)hdr->hdr_rdma.hdr_req.pval;
             mca_pml_ob1_send_request_put(sendreq,btl,&hdr->hdr_rdma);
             break;
         }
@@ -171,8 +168,7 @@
                 MCA_PML_OB1_FIN_HDR_NTOH(hdr->hdr_fin);
             }
 #endif
-            rdma = (mca_btl_base_descriptor_t*)
-                hdr->hdr_fin.hdr_des.pval;
+            rdma = (mca_btl_base_descriptor_t*)hdr->hdr_fin.hdr_des.pval;
             rdma->des_cbfunc(btl, NULL, rdma, OMPI_SUCCESS);
             break;
         }
Index: ompi/datatype/position.c
===================================================================
--- ompi/datatype/position.c	(revision 16431)
+++ ompi/datatype/position.c	(working copy)
@@ -53,7 +53,7 @@
                                              size_t* SPACE )
 {
     uint32_t _copy_count = *(COUNT);
-	size_t _copy_blength;
+    size_t _copy_blength;
     ddt_elem_desc_t* _elem = &((ELEM)->elem);

     _copy_blength =  ompi_ddt_basicDatatypes[_elem->common.type]->size;
@@ -109,7 +109,7 @@
     size_t iov_len_local;
     ptrdiff_t extent = pConvertor->pDesc->ub - pConvertor->pDesc->lb;

-    DUMP( "ompi_convertor_generic_simple_pack( %p, &%ld )\n", (void*)pConvertor, (long)*position );
+    DUMP( "ompi_convertor_generic_simple_position( %p, &%ld )\n", (void*)pConvertor, (long)*position );

     /* We dont want to have to parse the datatype multiple times. What we are interested in
      * here is to compute the number of completed datatypes that we can move forward, update
Index: ompi/datatype/datatype_pack.c
===================================================================
--- ompi/datatype/datatype_pack.c	(revision 16431)
+++ ompi/datatype/datatype_pack.c	(working copy)
@@ -71,7 +71,7 @@
         if( (size_t)iov[iov_count].iov_len > length )
             iov[iov_count].iov_len = length;
         if( iov[iov_count].iov_base == NULL ) {
-            iov[iov_count].iov_base = source_base;
+            iov[iov_count].iov_base = (IOVBASE_TYPE *) source_base;
             COMPUTE_CSUM( iov[iov_count].iov_base, iov[iov_count].iov_len, pConv );
         } else {
             /* contiguous data just memcpy the smallest data in the user buffer */
@@ -133,7 +133,7 @@
             if( (uint32_t)pStack->count < ((*out_size) - iov_count) ) {
                 pStack[1].count = pData->size - (pConv->bConverted % pData->size);
                 for( index = iov_count; i < pConv->count; i++, index++ ) {
-                    iov[index].iov_base = user_memory;
+                    iov[index].iov_base = (IOVBASE_TYPE *) user_memory;
                     iov[index].iov_len = pStack[1].count;
                     pStack[0].disp += extent;
                     total_bytes_converted += pStack[1].count;
@@ -156,13 +156,13 @@
                 for( index = iov_count; (i < pConv->count) && (index < (*out_size));
                      i++, index++ ) {
                     if( max_allowed < pData->size ) {
-                        iov[index].iov_base = user_memory;
+                        iov[index].iov_base = (IOVBASE_TYPE *) user_memory;
                         iov[index].iov_len = max_allowed;
                         max_allowed = 0;
                         COMPUTE_CSUM( iov[index].iov_base, iov[index].iov_len, pConv );
                         break;
                     } else {
-                        iov[index].iov_base = user_memory;
+                        iov[index].iov_base = (IOVBASE_TYPE *) user_memory;
                         iov[index].iov_len = pData->size;
                         user_memory += extent;
                         COMPUTE_CSUM( iov[index].iov_base, (size_t)iov[index].iov_len, pConv );
@@ -249,7 +249,6 @@
     dt_stack_t* pStack;       /* pointer to the position on the stack */
     uint32_t pos_desc;        /* actual position in the description of the derived datatype */
     uint32_t count_desc;      /* the number of items already done in the actual pos_desc */
-    uint16_t type;            /* type at current position */
     size_t total_packed = 0;  /* total amount packed this time */
     dt_elem_desc_t* description;
     dt_elem_desc_t* pElem;
@@ -295,7 +294,6 @@
                     UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc );
                     continue;
                 }
-                type = pElem->elem.common.type;
                 goto complete_loop;
             }
             if( DT_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */
@@ -356,23 +354,6 @@
     *max_data = total_packed;
     *out_size = iov_count;

-#if 0
-    if( pConvertor->flags & CONVERTOR_WITH_CHECKSUM ) {
-        uint32_t ui1 = 0, ui2 = 0, csum = OPAL_CSUM_ZERO;
-        /**
-         * Check the checksum correctness.
-         */
-        for( iov_count = 0; iov_count < (*out_size); iov_count++ ) {
-            csum += opal_uicsum_partial( iov[iov_count].iov_base, iov[iov_count].iov_len,
-                                         &ui1, &ui2 );
-        }
-        if( csum != pConvertor->checksum ) {
-            opal_output( 0, "error in the pack function the checksum does not match\n"
-                         "(%d != %d)\n", csum, pConvertor->checksum );
-        }
-    }
-#endif
-
     if( pConvertor->bConverted == pConvertor->local_size ) {
         pConvertor->flags |= CONVERTOR_COMPLETED;
         return 1;
Index: ompi/datatype/datatype_unpack.c
===================================================================
--- ompi/datatype/datatype_unpack.c	(revision 16431)
+++ ompi/datatype/datatype_unpack.c	(working copy)
@@ -294,9 +294,9 @@
     uint32_t i, length, count_desc = 1;
     size_t data_length = ompi_ddt_basicDatatypes[pElem->elem.common.type]->size;

-    DO_DEBUG( opal_output( 0, "unpack partial data start %d end %d data_length %lu user %p\n"
+    DO_DEBUG( opal_output( 0, "unpack partial data start %lu end %lu data_length %lu user %p\n"
                            "\tbConverted %lu total_length %lu count %d\n",
-                           start_position, end_position, (unsigned long)data_length, *user_buffer,
+                           (unsigned long)start_position, (unsigned long)end_position, (unsigned long)data_length, *user_buffer,
                            (unsigned long)pConvertor->bConverted, (unsigned long)pConvertor->local_size, pConvertor->count ); );

     /* Find a byte that is not used in the partial buffer */
@@ -440,7 +440,7 @@
             if( DT_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */
                 DO_DEBUG( opal_output( 0, "unpack end_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n",
                                        (int)pStack->count, pConvertor->stack_pos, pos_desc,
-                                      (long)pStack->disp, (unsigned long)iov_len_local ); );
+                                       (long)pStack->disp, (unsigned long)iov_len_local ); );
                 if( --(pStack->count) == 0 ) { /* end of loop */
                     if( pConvertor->stack_pos == 0 ) {
                         /* Force the conversion to stop by lowering the number of iovecs. */
Index: ompi/datatype/datatype_pack.h
===================================================================
--- ompi/datatype/datatype_pack.h	(revision 16431)
+++ ompi/datatype/datatype_pack.h	(working copy)
@@ -21,7 +21,7 @@
                                          size_t* SPACE )
 {
     uint32_t _copy_count = *(COUNT);
-	size_t _copy_blength;
+    size_t _copy_blength;
     ddt_elem_desc_t* _elem = &((ELEM)->elem);
     char* _source = (*SOURCE) + _elem->disp;

@@ -36,8 +36,8 @@
         /* the extent and the size of the basic datatype are equals */
         OMPI_DDT_SAFEGUARD_POINTER( _source, _copy_blength, (CONVERTOR)->pBaseBuf,
                                     (CONVERTOR)->pDesc, (CONVERTOR)->count );
-        DO_DEBUG( opal_output( 0, "pack 1. memcpy( %p, %p, %ld ) => space %lu\n",
-                               *(DESTINATION), _source, _copy_blength, (unsigned long)(*(SPACE)) ); );
+        DO_DEBUG( opal_output( 0, "pack 1. memcpy( %p, %p, %lu ) => space %lu\n",
+                               *(DESTINATION), _source, (unsigned long)_copy_blength, (unsigned long)(*(SPACE)) ); );
         MEMCPY_CSUM( *(DESTINATION), _source, _copy_blength, (CONVERTOR) );
         _source        += _copy_blength;
         *(DESTINATION) += _copy_blength;
@@ -46,8 +46,8 @@
         for( _i = 0; _i < _copy_count; _i++ ) {
             OMPI_DDT_SAFEGUARD_POINTER( _source, _copy_blength, (CONVERTOR)->pBaseBuf,
                                         (CONVERTOR)->pDesc, (CONVERTOR)->count );
-            DO_DEBUG( opal_output( 0, "pack 2. memcpy( %p, %p, %ld ) => space %lu\n",
-                                   *(DESTINATION), _source, _copy_blength, (unsigned long)(*(SPACE) - (_i * _copy_blength)) ); );
+            DO_DEBUG( opal_output( 0, "pack 2. memcpy( %p, %p, %lu ) => space %lu\n",
+                                   *(DESTINATION), _source, (unsigned long)_copy_blength, (unsigned long)(*(SPACE) - (_i * _copy_blength)) ); );
             MEMCPY_CSUM( *(DESTINATION), _source, _copy_blength, (CONVERTOR) );
             *(DESTINATION) += _copy_blength;
             _source        += _elem->extent;
@@ -77,8 +77,8 @@
     for( _i = 0; _i < _copy_loops; _i++ ) {
         OMPI_DDT_SAFEGUARD_POINTER( _source, _end_loop->size, (CONVERTOR)->pBaseBuf,
                                     (CONVERTOR)->pDesc, (CONVERTOR)->count );
-        DO_DEBUG( opal_output( 0, "pack 3. memcpy( %p, %p, %ld ) => space %ld\n",
-                               *(DESTINATION), _source, _end_loop->size, *(SPACE) - _i * _end_loop->size ); );
+        DO_DEBUG( opal_output( 0, "pack 3. memcpy( %p, %p, %lu ) => space %lu\n",
+                               *(DESTINATION), _source, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i * _end_loop->size) ); );
         MEMCPY_CSUM( *(DESTINATION), _source, _end_loop->size, (CONVERTOR) );
         *(DESTINATION) += _end_loop->size;
         _source        += _loop->extent;
Index: ompi/datatype/dt_add.c
===================================================================
--- ompi/datatype/dt_add.c	(revision 16431)
+++ ompi/datatype/dt_add.c	(working copy)
@@ -87,37 +87,48 @@
      */
     if( extent == -1 ) extent = (pdtAdd->ub - pdtAdd->lb);

+    /* handle special cases for DT_LB and DT_UB and their duplicates */
+    if( DT_LB == pdtAdd->id ) {
+        pdtBase->bdt_used |= (((uint64_t)1) << DT_LB);
+        if( pdtBase->flags & DT_FLAG_USER_LB ) {
+            pdtBase->lb = LMIN( pdtBase->lb, disp );
+        } else {
+            pdtBase->lb = disp;
+            pdtBase->flags |= DT_FLAG_USER_LB;
+        }
+        if( (pdtBase->ub - pdtBase->lb) != (ptrdiff_t)pdtBase->size ) {
+            pdtBase->flags &= ~DT_FLAG_NO_GAPS;
+        }
+        return OMPI_SUCCESS;
+    } else if( DT_UB == pdtAdd->id ) {
+        pdtBase->bdt_used |= (((uint64_t)1) << DT_UB);
+        if( pdtBase->flags & DT_FLAG_USER_UB ) {
+            pdtBase->ub = LMAX( pdtBase->ub, disp );
+        } else {
+            pdtBase->ub = disp;
+            pdtBase->flags |= DT_FLAG_USER_UB;
+        }
+        if( (pdtBase->ub - pdtBase->lb) != (ptrdiff_t)pdtBase->size ) {
+            pdtBase->flags &= ~DT_FLAG_NO_GAPS;
+        }
+        return OMPI_SUCCESS;
+    }
     if( pdtAdd->flags & DT_FLAG_PREDEFINED ) { /* add a basic datatype */
-        /* handle special cases for DT_LB and DT_UB */
-        if( pdtAdd == ompi_ddt_basicDatatypes[DT_LB] ) {
-            pdtBase->bdt_used |= (((uint64_t)1) << DT_LB);
-            if( pdtBase->flags & DT_FLAG_USER_LB ) {
-                pdtBase->lb = LMIN( pdtBase->lb, disp );
+        place_needed = (extent == (ptrdiff_t)pdtAdd->size ? 1 : 3);
+    } else {
+        place_needed = pdtAdd->desc.used;
+        if( count != 1 ) {
+            if( place_needed < (MAX_DT_COMPONENT_COUNT - 2) ) {
+                place_needed += 2;  /* for the loop markers */
             } else {
-                pdtBase->lb = disp;
-                pdtBase->flags |= DT_FLAG_USER_LB;
+                /* The data-type contains too many elements: adding the two loop
+                 * markers would reach MAX_DT_COMPONENT_COUNT. Report it and fail.
+                 */
+                opal_output( 0, "Too many elements in the datatype. The limit is %u\n",
+                             MAX_DT_COMPONENT_COUNT );
+                return OMPI_ERROR;
             }
-            if( (pdtBase->ub - pdtBase->lb) != (ptrdiff_t)pdtBase->size ) {
-                pdtBase->flags &= ~DT_FLAG_NO_GAPS;
-            }
-            return OMPI_SUCCESS;
-        } else if( pdtAdd == ompi_ddt_basicDatatypes[DT_UB] ) {
-            pdtBase->bdt_used |= (((uint64_t)1) << DT_UB);
-            if( pdtBase->flags & DT_FLAG_USER_UB ) {
-                pdtBase->ub = LMAX( pdtBase->ub, disp );
-            } else {
-                pdtBase->ub = disp;
-                pdtBase->flags |= DT_FLAG_USER_UB;
-            }
-            if( (pdtBase->ub - pdtBase->lb) != (ptrdiff_t)pdtBase->size ) {
-                pdtBase->flags &= ~DT_FLAG_NO_GAPS;
-            }
-            return OMPI_SUCCESS;
         }
-        place_needed = (extent == (ptrdiff_t)pdtAdd->size ? 1 : 3);
-    } else {
-        place_needed = pdtAdd->desc.used;
-        if( count != 1 ) place_needed += 2;  /* for the loop markers */
     }

     /*
@@ -167,26 +178,20 @@
         ub = LMAX( pdtBase->ub, ub );
     }
     /* While the true_lb and true_ub have to be ordered to have the true_lb lower
-     * than the true_ub, the ub and lb does not have to be ordered. They should be
+     * than the true_ub, the ub and lb do not have to be ordered. They should be
      * as the user define them.
      */
     pdtBase->lb = lb;
     pdtBase->ub = ub;

-    if( 0 == pdtBase->nbElems ) old_true_ub = disp;
-    else                        old_true_ub = pdtBase->true_ub;
-    pdtBase->true_lb = LMIN( true_lb, pdtBase->true_lb );
-    pdtBase->true_ub = LMAX( true_ub, pdtBase->true_ub );
-
     /* compute the new memory alignement */
     pdtBase->align = IMAX( pdtBase->align, pdtAdd->align );
-    pdtBase->size += count * pdtAdd->size;

     /* Now that we have the new ub and the alignment we should update the ub to match
-     * the new alignement. We have to add an epsilon that is the least nonnegative increment
-     * needed to roung the extent to the next multiple of the alignment. This rule
-     * apply only if there is user specified upper bound as stated in the MPI
-     * standard MPI 1.2 page 71.
+     * the new alignment. We have to add an epsilon that is the least nonnegative
+     * increment needed to round the extent to the next multiple of the alignment.
+     * This rule applies only if there is no user-specified upper bound, as stated
+     * in the MPI standard MPI 1.2 page 71.
      */
     if( !(pdtBase->flags & DT_FLAG_USER_UB) ) {
         epsilon = (pdtBase->ub - pdtBase->lb) % pdtBase->align;
@@ -194,19 +199,33 @@
             pdtBase->ub += (pdtBase->align - epsilon);
         }
     }
+    /* now we know it contains some data */
+    pdtBase->flags |= DT_FLAG_DATA;

     /*
-     * the count == 0 is LEGAL only for MPI_UB and MPI_LB. I accept it just as a nice way to set
-     * the soft UB for a data (without using a real UB marker). This approach can be used to
-     * create the subarray and darray datatype. However from the MPI level this function
-     * should never be called directly with a count set to 0.
-     * Adding a data-type with a size zero is legal but does not have to go through all the
-     * stuff below.
+     * the count == 0 is LEGAL only for MPI_UB and MPI_LB. Therefore we support it
+     * here in the upper part of this function. As an extension, the count set to
+     * zero can be used to reset the alignment of the data, but not for changing
+     * the true_lb and true_ub.
      */
     if( (0 == count) || (0 == pdtAdd->size) ) {
         return OMPI_SUCCESS;
     }

+    /* Now, once we know everything is fine and there are some bytes in
+     * the data-type we can update the size, true_lb and true_ub.
+     */
+    pdtBase->size += count * pdtAdd->size;
+    if( 0 == pdtBase->nbElems ) old_true_ub = disp;
+    else                        old_true_ub = pdtBase->true_ub;
+    if( 0 != pdtBase->size ) {
+        pdtBase->true_lb = LMIN( true_lb, pdtBase->true_lb );
+        pdtBase->true_ub = LMAX( true_ub, pdtBase->true_ub );
+    } else {
+        pdtBase->true_lb = true_lb;
+        pdtBase->true_ub = true_ub;
+    }
+
     pdtBase->bdt_used |= pdtAdd->bdt_used;
     newLength = pdtBase->desc.used + place_needed;
     if( newLength > pdtBase->desc.length ) {
Index: ompi/datatype/datatype_unpack.h
===================================================================
--- ompi/datatype/datatype_unpack.h	(revision 16431)
+++ ompi/datatype/datatype_unpack.h	(working copy)
@@ -21,7 +21,7 @@
                                            size_t* SPACE )               /* the space in the destination buffer */
 {
     uint32_t _copy_count = *(COUNT);
-	size_t _copy_blength;
+    size_t _copy_blength;
     ddt_elem_desc_t* _elem = &((ELEM)->elem);
     char* _destination = (*DESTINATION) + _elem->disp;

@@ -36,8 +36,8 @@
         /* the extent and the size of the basic datatype are equals */
         OMPI_DDT_SAFEGUARD_POINTER( _destination, _copy_blength, (CONVERTOR)->pBaseBuf,
                                     (CONVERTOR)->pDesc, (CONVERTOR)->count );
-        DO_DEBUG( opal_output( 0, "unpack 1. memcpy( %p, %p, %ld ) => space %lu\n",
-                               _destination, *(SOURCE), _copy_blength, (unsigned long)(*(SPACE)) ); );
+        DO_DEBUG( opal_output( 0, "unpack 1. memcpy( %p, %p, %lu ) => space %lu\n",
+                               _destination, *(SOURCE), (unsigned long)_copy_blength, (unsigned long)(*(SPACE)) ); );
         MEMCPY_CSUM( _destination, *(SOURCE), _copy_blength, (CONVERTOR) );
         *(SOURCE)    += _copy_blength;
         _destination += _copy_blength;
@@ -46,8 +46,8 @@
         for( _i = 0; _i < _copy_count; _i++ ) {
             OMPI_DDT_SAFEGUARD_POINTER( _destination, _copy_blength, (CONVERTOR)->pBaseBuf,
                                         (CONVERTOR)->pDesc, (CONVERTOR)->count );
-            DO_DEBUG( opal_output( 0, "unpack 2. memcpy( %p, %p, %ld ) => space %lu\n",
-                                   _destination, *(SOURCE), _copy_blength, (unsigned long)(*(SPACE) - (_i * _copy_blength)) ); );
+            DO_DEBUG( opal_output( 0, "unpack 2. memcpy( %p, %p, %lu ) => space %lu\n",
+                                   _destination, *(SOURCE), (unsigned long)_copy_blength, (unsigned long)(*(SPACE) - (_i * _copy_blength)) ); );
             MEMCPY_CSUM( _destination, *(SOURCE), _copy_blength, (CONVERTOR) );
             *(SOURCE)    += _copy_blength;
             _destination += _elem->extent;
@@ -77,8 +77,8 @@
     for( _i = 0; _i < _copy_loops; _i++ ) {
         OMPI_DDT_SAFEGUARD_POINTER( _destination, _end_loop->size, (CONVERTOR)->pBaseBuf,
                                     (CONVERTOR)->pDesc, (CONVERTOR)->count );
-        DO_DEBUG( opal_output( 0, "unpack 3. memcpy( %p, %p, %ld ) => space %ld\n",
-                               _destination, *(SOURCE), _end_loop->size, *(SPACE) - _i * _end_loop->size ); );
+        DO_DEBUG( opal_output( 0, "unpack 3. memcpy( %p, %p, %lu ) => space %lu\n",
+                               _destination, *(SOURCE), (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i * _end_loop->size) ); );
         MEMCPY_CSUM( _destination, *(SOURCE), _end_loop->size, (CONVERTOR) );
         *(SOURCE)    += _end_loop->size;
         _destination += _loop->extent;
Index: ompi/datatype/dt_sndrcv.c
===================================================================
--- ompi/datatype/dt_sndrcv.c	(revision 16431)
+++ ompi/datatype/dt_sndrcv.c	(working copy)
@@ -47,14 +47,10 @@
    uint32_t iov_count;
    size_t max_data;

-   /* First check if we really have something to do */
-   if (0 == rcount) {
-       if (0 == scount) {
-           return MPI_SUCCESS;
-       } else {
-           return MPI_ERR_TRUNCATE;
-       }
-   }
+    /* First check if we really have something to do */
+    if (0 == rcount) {
+        return ((0 == scount) ? MPI_SUCCESS : MPI_ERR_TRUNCATE);
+    }

    /* If same datatypes used, just copy. */
    if (sdtype == rdtype) {
Index: ompi/datatype/dt_copy.c
===================================================================
--- ompi/datatype/dt_copy.c	(revision 16431)
+++ ompi/datatype/dt_copy.c	(working copy)
@@ -60,8 +60,8 @@
         OMPI_DDT_SAFEGUARD_POINTER( _source, _copy_blength, (SOURCE_BASE),
                                     (DATATYPE), (TOTAL_COUNT) );
         /* the extent and the size of the basic datatype are equals */
-        DO_DEBUG( opal_output( 0, "copy 1. memcpy( %p, %p, %ld ) => space %lu\n",
-                               _destination, _source, _copy_blength, (unsigned long)(*(SPACE)) ); );
+        DO_DEBUG( opal_output( 0, "copy 1. memcpy( %p, %p, %lu ) => space %lu\n",
+                               _destination, _source, (unsigned long)_copy_blength, (unsigned long)(*(SPACE)) ); );
         MEMCPY( _destination, _source, _copy_blength );
         _source      += _copy_blength;
         _destination += _copy_blength;
@@ -70,8 +70,8 @@
         for( _i = 0; _i < _copy_count; _i++ ) {
             OMPI_DDT_SAFEGUARD_POINTER( _source, _copy_blength, (SOURCE_BASE),
                                         (DATATYPE), (TOTAL_COUNT) );
-            DO_DEBUG( opal_output( 0, "copy 2. memcpy( %p, %p, %ld ) => space %lu\n",
-                                   _destination, _source, _copy_blength, (unsigned long)(*(SPACE) - (_i * _copy_blength)) ); );
+            DO_DEBUG( opal_output( 0, "copy 2. memcpy( %p, %p, %lu ) => space %lu\n",
+                                   _destination, _source, (unsigned long)_copy_blength, (unsigned long)(*(SPACE) - (_i * _copy_blength)) ); );
             MEMCPY( _destination, _source, _copy_blength );
             _source      += _elem->extent;
             _destination += _elem->extent;
@@ -106,8 +106,8 @@
         for( _i = 0; _i < _copy_loops; _i++ ) {
             OMPI_DDT_SAFEGUARD_POINTER( _source, _end_loop->size, (SOURCE_BASE),
                                         (DATATYPE), (TOTAL_COUNT) );
-            DO_DEBUG( opal_output( 0, "copy 3. memcpy( %p, %p, %ld ) => space %ld\n",
-                                   _destination, _source, _end_loop->size, *(SPACE) - _i * _end_loop->size ); );
+            DO_DEBUG( opal_output( 0, "copy 3. memcpy( %p, %p, %lu ) => space %lu\n",
+                                   _destination, _source, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i * _end_loop->size) ); );
             MEMCPY( _destination, _source, _end_loop->size );
             _source      += _loop->extent;
             _destination += _loop->extent;
Index: ompi/datatype/dt_create_dup.c
===================================================================
--- ompi/datatype/dt_create_dup.c	(revision 16431)
+++ ompi/datatype/dt_create_dup.c	(working copy)
@@ -3,7 +3,7 @@
  * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
  *                         University Research and Technology
  *                         Corporation.  All rights reserved.
- * Copyright (c) 2004-2006 The University of Tennessee and The University
+ * Copyright (c) 2004-2007 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
  * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
@@ -60,7 +60,8 @@
             memcpy( pdt->opt_desc.desc, oldType->opt_desc.desc, desc_length * sizeof(dt_elem_desc_t) );
         }
     }
-    pdt->id  = 0;
+    pdt->id  = oldType->id;  /* preserve the default id. This allows us to
+                              * copy predefined types. */
     pdt->args = NULL;
     *newType = pdt;
     return OMPI_SUCCESS;
Index: ompi/datatype/dt_module.c
===================================================================
--- ompi/datatype/dt_module.c	(revision 16431)
+++ ompi/datatype/dt_module.c	(working copy)
@@ -3,7 +3,7 @@
  * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
  *                         University Research and Technology
  *                         Corporation.  All rights reserved.
- * Copyright (c) 2004-2006 The University of Tennessee and The University
+ * Copyright (c) 2004-2007 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
  * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
@@ -127,7 +127,7 @@
 OMPI_DECLSPEC ompi_datatype_t ompi_mpi_wchar = INIT_BASIC_DATA( wchar_t, OMPI_ALIGNMENT_WCHAR, WCHAR, DT_FLAG_DATA_C );
 #else
 OMPI_DECLSPEC ompi_datatype_t ompi_mpi_wchar = INIT_UNAVAILABLE_DATA( WCHAR );
-#endif  /* FTMPI_HAVE_WCHAR_T */
+#endif  /* OMPI_ALIGNMENT_WCHAR */

 OMPI_DECLSPEC ompi_datatype_t ompi_mpi_cxx_bool = INIT_BASIC_DATA( bool, OMPI_ALIGNMENT_CXX_BOOL, CXX_BOOL, DT_FLAG_DATA_CPP );
 OMPI_DECLSPEC ompi_datatype_t ompi_mpi_logic = INIT_BASIC_FORTRAN_TYPE( DT_LOGIC, LOGIC, OMPI_SIZEOF_FORTRAN_LOGICAL, OMPI_ALIGNMENT_FORTRAN_LOGICAL, 0 );
@@ -411,7 +411,7 @@

         datatype->desc.length       = 1;
         datatype->desc.used         = 1;
-        /* By default the optimized descritption is the same as the default
+        /* By default the optimized description is the same as the default
          * description for predefined datatypes.
          */
         datatype->opt_desc          = datatype->desc;
@@ -639,6 +639,10 @@
             ompi_ddt_number_of_predefined_data = (ompi_mpi_##name).d_f_to_c_index; \
     }

+    /*
+     * This MUST match the order of ompi/include/mpif-common.h
+     * Any change will break binary compatibility of Fortran programs.
+     */
     MOOG(datatype_null);
     MOOG(byte);
     MOOG(packed);
@@ -769,7 +773,7 @@
 static int _dump_data_flags( unsigned short usflags, char* ptr, size_t length )
 {
     if( length < 21 ) return 0;
-    sprintf( ptr, "-----------[---][---]" );  /* set everything to - */
+    snprintf( ptr, 21, "-----------[---][---]" );  /* set everything to - */
     if( usflags & DT_FLAG_DESTROYED )                ptr[0]  = 'd';
     if( usflags & DT_FLAG_COMMITED )                 ptr[1]  = 'c';
     if( usflags & DT_FLAG_CONTIGUOUS )               ptr[2]  = 'C';
@@ -811,13 +815,13 @@
 static int __dump_data_desc( dt_elem_desc_t* pDesc, int nbElems, char* ptr, size_t length )
 {
     int i;
-    size_t index = 0;
+    int32_t index = 0;

     for( i = 0; i < nbElems; i++ ) {
         index += _dump_data_flags( pDesc->elem.common.flags, ptr + index, length );
-        if( length <= index ) break;
+        if( length <= (size_t)index ) break;
         index += snprintf( ptr + index, length - index, "%15s ", ompi_ddt_basicDatatypes[pDesc->elem.common.type]->name );
-        if( length <= index ) break;
+        if( length <= (size_t)index ) break;
         if( DT_LOOP == pDesc->elem.common.type )
             index += snprintf( ptr + index, length - index, "%d times the next %d elements extent %d\n",
                                (int)pDesc->loop.loops, (int)pDesc->loop.items,
@@ -832,7 +836,7 @@
                                (int)pDesc->elem.extent, (long)(pDesc->elem.count * ompi_ddt_basicDatatypes[pDesc->elem.common.type]->size) );
         pDesc++;

-        if( length <= index ) break;
+        if( length <= (size_t)index ) break;
     }
     return index;
 }
@@ -840,7 +844,7 @@
 static inline int __dt_contain_basic_datatypes( const ompi_datatype_t* pData, char* ptr, size_t length )
 {
     int i;
-    size_t index = 0;
+    int32_t index = 0;
     uint64_t mask = 1;

     if( pData->flags & DT_FLAG_USER_LB ) index += snprintf( ptr, length - index, "lb " );
@@ -849,7 +853,7 @@
         if( pData->bdt_used & mask )
             index += snprintf( ptr + index, length - index, "%s ", ompi_ddt_basicDatatypes[i]->name );
         mask <<= 1;
-        if( length <= index ) break;
+        if( length <= (size_t)index ) break;
     }
     return index;
 }
Index: ompi/datatype/dt_destroy.c
===================================================================
--- ompi/datatype/dt_destroy.c	(revision 16431)
+++ ompi/datatype/dt_destroy.c	(working copy)
@@ -3,7 +3,7 @@
  * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
  *                         University Research and Technology
  *                         Corporation.  All rights reserved.
- * Copyright (c) 2004-2006 The University of Tennessee and The University
+ * Copyright (c) 2004-2007 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
  * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
@@ -25,7 +25,7 @@
 {
     ompi_datatype_t* pData = *dt;

-    if( pData->flags & DT_FLAG_PREDEFINED )
+    if( (pData->flags & DT_FLAG_PREDEFINED) && (pData->super.obj_reference_count <= 1) )
         return OMPI_ERROR;

     OBJ_RELEASE( pData );
Index: ompi/datatype/datatype.h
===================================================================
--- ompi/datatype/datatype.h	(revision 16431)
+++ ompi/datatype/datatype.h	(working copy)
@@ -89,9 +89,16 @@

 typedef union dt_elem_desc dt_elem_desc_t;

+/**
+ * The number of supported entries in the data-type definition and the
+ * associated type.
+ */
+#define MAX_DT_COMPONENT_COUNT UINT_MAX
+typedef uint32_t opal_ddt_count_t;
+
 typedef struct dt_type_desc {
-    uint32_t          length;  /* the maximum number of elements in the description array */
-    uint32_t          used;    /* the number of used elements in the description array */
+    opal_ddt_count_t  length;  /* the maximum number of elements in the description array */
+    opal_ddt_count_t  used;    /* the number of used elements in the description array */
     dt_elem_desc_t*   desc;
 } dt_type_desc_t;

@@ -129,11 +136,11 @@
 OMPI_DECLSPEC OBJ_CLASS_DECLARATION( ompi_datatype_t );

 int ompi_ddt_register_params(void);
-int32_t ompi_ddt_init( void );
-int32_t ompi_ddt_finalize( void );
+OMPI_DECLSPEC int32_t ompi_ddt_init( void );
+OMPI_DECLSPEC int32_t ompi_ddt_finalize( void );
 ompi_datatype_t* ompi_ddt_create( int32_t expectedSize );
-int32_t ompi_ddt_commit( ompi_datatype_t** );
-int32_t ompi_ddt_destroy( ompi_datatype_t** );
+OMPI_DECLSPEC int32_t ompi_ddt_commit( ompi_datatype_t** );
+OMPI_DECLSPEC int32_t ompi_ddt_destroy( ompi_datatype_t** );
 static inline int32_t ompi_ddt_is_committed( const ompi_datatype_t* type )
 { return ((type->flags & DT_FLAG_COMMITED) == DT_FLAG_COMMITED); }
 static inline int32_t ompi_ddt_is_overlapped( const ompi_datatype_t* type )
Index: ompi/datatype/convertor.c
===================================================================
--- ompi/datatype/convertor.c	(revision 16431)
+++ ompi/datatype/convertor.c	(working copy)
@@ -499,7 +499,7 @@
 {
     /* Here I should check that the data is not overlapping */

-    convertor->flags      |= CONVERTOR_RECV;
+    convertor->flags |= CONVERTOR_RECV;

     OMPI_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf );

@@ -535,7 +535,7 @@
                                  int32_t count,
                                  const void* pUserBuf )
 {
-    convertor->flags            |= CONVERTOR_SEND;
+    convertor->flags |= CONVERTOR_SEND;

     OMPI_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf );

Index: ompi/datatype/dt_args.c
===================================================================
--- ompi/datatype/dt_args.c	(revision 16431)
+++ ompi/datatype/dt_args.c	(working copy)
@@ -1,9 +1,9 @@
 /* -*- Mode: C; c-basic-offset:4 ; -*- */
 /*
- * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
+ * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
  *                         University Research and Technology
  *                         Corporation.  All rights reserved.
- * Copyright (c) 2004-2006 The University of Tennessee and The University
+ * Copyright (c) 2004-2007 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
  * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
@@ -45,6 +45,20 @@
 } ompi_ddt_args_t;

 /**
+ * Round VALUE up, in place, to the next multiple of PWROF2. Works fine
+ * only for power of 2 alignments.
+ */
+#define ALIGN_INT_TO( VALUE, PWROF2 )                           \
+    do {                                                        \
+        int _align = (int)((PWROF2) - 1);                       \
+        (VALUE) = (((int)(VALUE) + _align) & (~_align));        \
+    } while(0)
+
+/** Assert that VALUE is a multiple of PWROF2 (a power of 2). */
+#define CHECK_ALIGN_TO( VALUE, PWROF2 )                 \
+    assert( 0 == ((VALUE) & ((PWROF2) - 1)) )
+
+/**
  * Some architecture require that 64 bits pointers (to pointers) has to
  * be 64 bits aligned. As in the ompi_ddt_args_t structure we have 2 such
  * pointers and one to an array of ints, if we start by setting the 64
@@ -76,6 +90,7 @@
         pArgs->ref_count = 1;                                           \
         pArgs->total_pack_size = (4 + (IC)) * sizeof(int) +             \
             (AC) * sizeof(MPI_Aint) + (DC) * sizeof(int);               \
+        ALIGN_INT_TO( pArgs->total_pack_size, sizeof(MPI_Aint) );       \
         (PDATA)->args = (void*)pArgs;					\
         (PDATA)->packed_description = NULL;                             \
     } while(0)
@@ -207,6 +222,10 @@
              */
             OBJ_RETAIN( d[pos] );
             pArgs->total_pack_size += ((ompi_ddt_args_t*)d[pos]->args)->total_pack_size;
+            /* as total_pack_size is always aligned to MPI_Aint size their sum
+             * will be aligned to MPI_Aint size as well.
+             */
+            CHECK_ALIGN_TO( pArgs->total_pack_size, sizeof(MPI_Aint) );
         }
     }
     return MPI_SUCCESS;
@@ -375,14 +394,14 @@
     if( datatype->flags & DT_FLAG_PREDEFINED ) {
         return sizeof(int) * 2;
     }
+    assert( NULL != (ompi_ddt_args_t*)datatype->args );
     return ((ompi_ddt_args_t*)datatype->args)->total_pack_size;
 }

 static inline int __ompi_ddt_pack_description( ompi_datatype_t* datatype,
                                                void** packed_buffer, int* next_index )
 {
-    int* position = (int*)*packed_buffer;
-    int local_index = 0, i;
+    int i, *position = (int*)*packed_buffer;
     ompi_ddt_args_t* args = (ompi_ddt_args_t*)datatype->args;
     char* next_packed = (char*)*packed_buffer;

@@ -393,23 +412,37 @@
     }
     /* For duplicated datatype we don't have to store all the information */
     if( MPI_COMBINER_DUP == args->create_type ) {
-        position[local_index++] = args->create_type;
-        position[local_index++] = args->d[0]->id;
+        position[0] = args->create_type;
+        position[1] = args->d[0]->id;
         return OMPI_SUCCESS;
     }
-    position[local_index++] = args->create_type;
-    position[local_index++] = args->ci;
-    position[local_index++] = args->ca;
-    position[local_index++] = args->cd;
-    memcpy( &(position[local_index]), args->i, sizeof(int) * args->ci );
-    next_packed += ( 4 + args->ci) * sizeof(int);
-    local_index += args->ci;
+    position[0] = args->create_type;
+    position[1] = args->ci;
+    position[2] = args->ca;
+    position[3] = args->cd;
+    next_packed += (4 * sizeof(int));
+    /* So far there are 4 integers in the array, so we're still 64 bits aligned
+     * if we suppose that the original buffer was 64 bits aligned.
+     *
+     * In order to solve issues with the Sparc 64 which requires 64 bits pointers
+     * to be correctly aligned, we have to start adding the data in a smart way,
+     * just to keep everything as aligned as possible. Therefore, the first
+     * array we have to copy is the array of displacements, followed by the
+     * array of datatypes (both of them might be arrays of pointers) and then
+     * finally the array of counts.
+     */
     if( 0 < args->ca ) {
-        memcpy( &(position[local_index]), args->a, sizeof(MPI_Aint) * args->ca );
+        memcpy( next_packed, args->a, sizeof(MPI_Aint) * args->ca );
         next_packed += sizeof(MPI_Aint) * args->ca;
     }
     position = (int*)next_packed;
     next_packed += sizeof(int) * args->cd;
+
+    /* copy the array of counts (32 bits aligned) */
+    memcpy( next_packed, args->i, sizeof(int) * args->ci );
+    next_packed += args->ci * sizeof(int);
+
+    /* copy the rest of the data */
     for( i = 0; i < args->cd; i++ ) {
         ompi_datatype_t* temp_data = args->d[i];
         if( temp_data->flags & DT_FLAG_PREDEFINED ) {
@@ -436,6 +469,8 @@
     if( NULL == datatype->packed_description ) {
         if( datatype->flags & DT_FLAG_PREDEFINED ) {
             datatype->packed_description = malloc( 2 * sizeof(int) );
+        } else if( NULL == args ) {
+            return OMPI_ERROR;
         } else {
             datatype->packed_description = malloc( args->total_pack_size );
         }
@@ -448,86 +483,114 @@

 static ompi_datatype_t*
 __ompi_ddt_create_from_packed_description( void** packed_buffer,
-                                           struct ompi_proc_t* remote_processor )
+                                           const struct ompi_proc_t* remote_processor )
 {
-    int* position = (int*)*packed_buffer;
+    int* position;
     ompi_datatype_t* datatype = NULL;
     ompi_datatype_t** array_of_datatype;
     MPI_Aint* array_of_disp;
     int* array_of_length;
-    int number_of_length, number_of_disp, number_of_datatype;
+    int number_of_length, number_of_disp, number_of_datatype, data_id;
     int create_type, i;
-    char* next_buffer = (char*)*packed_buffer;
+    char* next_buffer;
+    bool free_array_of_disp = false;
+
 #if OMPI_ENABLE_HETEROGENEOUS_SUPPORT
     bool need_swap = false;

-    if ((remote_processor->proc_arch & OMPI_ARCH_ISBIGENDIAN) != 
-        (ompi_proc_local()->proc_arch & OMPI_ARCH_ISBIGENDIAN)) {
-         need_swap = true;
+    if( (remote_processor->proc_arch ^ ompi_proc_local()->proc_arch) &
+        OMPI_ARCH_ISBIGENDIAN ) {
+        need_swap = true;
     }
 #endif

+    next_buffer = (char*)*packed_buffer;
+    position = (int*)next_buffer;
+
+    create_type = position[0];
 #if OMPI_ENABLE_HETEROGENEOUS_SUPPORT
     if (need_swap) {
-        create_type = opal_swap_bytes4(position[0]);
-    } else 
+        create_type = opal_swap_bytes4(create_type);
+    }
 #endif
-    {
-        create_type = position[0];
-    }
     if( MPI_COMBINER_DUP == create_type ) {
         /* there we have a simple predefined datatype */
+        data_id = position[1];
 #if OMPI_ENABLE_HETEROGENEOUS_SUPPORT
         if (need_swap) {
-            position[1] = opal_swap_bytes4(position[1]);
+            data_id = opal_swap_bytes4(data_id);
         }
 #endif
-        assert( position[1] < DT_MAX_PREDEFINED );
+        assert( data_id < DT_MAX_PREDEFINED );
         *packed_buffer = position + 2;
-        return (ompi_datatype_t*)ompi_ddt_basicDatatypes[position[1]];
+        return (ompi_datatype_t*)ompi_ddt_basicDatatypes[data_id];
     }
+
+    number_of_length   = position[1];
+    number_of_disp     = position[2];
+    number_of_datatype = position[3];
 #if OMPI_ENABLE_HETEROGENEOUS_SUPPORT
     if (need_swap) {
-        number_of_length   = opal_swap_bytes4(position[1]);
-        number_of_disp     = opal_swap_bytes4(position[2]);
-        number_of_datatype = opal_swap_bytes4(position[3]);
-    } else
+        number_of_length   = opal_swap_bytes4(number_of_length);
+        number_of_disp     = opal_swap_bytes4(number_of_disp);
+        number_of_datatype = opal_swap_bytes4(number_of_datatype);
+    }
 #endif
-    {
-        number_of_length   = position[1];
-        number_of_disp     = position[2];
-        number_of_datatype = position[3];
-    }
     array_of_datatype = (ompi_datatype_t**)malloc( sizeof(ompi_datatype_t*) *
                                                    number_of_datatype );
-#if OMPI_ENABLE_HETEROGENEOUS_SUPPORT
-    if (need_swap) {
-        position[4] = opal_swap_bytes4(position[4]);
-    }
-#endif
-    array_of_length    = &(position[4]);
-    next_buffer += (4 + number_of_length) * sizeof(int);
-    array_of_disp      = (MPI_Aint*)next_buffer;
-    next_buffer += number_of_disp * sizeof(MPI_Aint);
-    position = (int*)next_buffer;
-    next_buffer += number_of_datatype * sizeof(int);
+    next_buffer += (4 * sizeof(int));  /* move after the header */
+
+    array_of_disp   = (MPI_Aint*)next_buffer;
+    next_buffer    += number_of_disp * sizeof(MPI_Aint);
+    /* the other datatypes */
+    position        = (int*)next_buffer;
+    next_buffer    += number_of_datatype * sizeof(int);
+    /* the array of lengths (32 bits aligned) */
+    array_of_length = (int*)next_buffer;
+    next_buffer    += (number_of_length * sizeof(int));
+
     for( i = 0; i < number_of_datatype; i++ ) {
+        data_id = position[i];
 #if OMPI_ENABLE_HETEROGENEOUS_SUPPORT
         if (need_swap) {
-            position[i] = opal_swap_bytes4(position[i]);
+            data_id = opal_swap_bytes4(data_id);
         }
 #endif
-        if( position[i] < DT_MAX_PREDEFINED ) {
-            assert( position[i] < DT_MAX_PREDEFINED );
-            array_of_datatype[i] = (ompi_datatype_t*)ompi_ddt_basicDatatypes[position[i]];
-        } else {
-            array_of_datatype[i] =
-                __ompi_ddt_create_from_packed_description( (void**)&next_buffer,
-                                                           remote_processor );
-            if( NULL == array_of_datatype[i] )
-                goto cleanup_and_exit;
+        if( data_id < DT_MAX_PREDEFINED ) {
+            array_of_datatype[i] = (ompi_datatype_t*)ompi_ddt_basicDatatypes[data_id];
+            continue;
         }
+        array_of_datatype[i] =
+            __ompi_ddt_create_from_packed_description( (void**)&next_buffer,
+                                                       remote_processor );
+        if( NULL == array_of_datatype[i] ) {
+            /* don't cleanup more than required. We can now modify these
+             * values as we already know we have failed to rebuild the
+             * datatype.
+             */
+            array_of_datatype[i] = (ompi_datatype_t*)ompi_ddt_basicDatatypes[DT_BYTE];
+            number_of_datatype = i;
+            goto cleanup_and_exit;
+        }
     }
+
+#if OMPI_ALIGN_WORD_SIZE_INTEGERS
+    /**
+     * Some architectures really don't like having unaligned accesses.
+     * We'll be int aligned, because any sane system will require that.
+     * But we might not be long aligned, and some architectures will
+     * complain if a long is accessed on int alignment. On those
+     * architectures, copy the displacements into an aligned buffer first.
+     */
+    if( 0 != number_of_disp ) {
+        const void* packed_disp = array_of_disp;  /* still points into the packed buffer */
+        array_of_disp = malloc(sizeof(MPI_Aint) * number_of_disp);
+        if( NULL == array_of_disp ) goto cleanup_and_exit;  /* nothing of ours to free yet */
+        free_array_of_disp = true;
+        memcpy(array_of_disp, packed_disp, sizeof(MPI_Aint) * number_of_disp);
+    }
+#endif
+
 #if OMPI_ENABLE_HETEROGENEOUS_SUPPORT
     if (need_swap) {
         for (i = 0 ; i < number_of_length ; ++i) {
@@ -537,7 +600,7 @@
 #if SIZEOF_PTRDIFF_T == 4
             array_of_disp[i] = opal_swap_bytes4(array_of_disp[i]);
 #elif SIZEOF_PTRDIFF_T == 8
-            array_of_disp[i] = opal_swap_bytes8(array_of_disp[i]);
+            array_of_disp[i] = (MPI_Aint)opal_swap_bytes8(array_of_disp[i]);
 #else
 #error "Unknown size of ptrdiff_t"
 #endif
@@ -553,6 +616,7 @@
             OBJ_RELEASE(array_of_datatype[i]);
         }
     }
+    if (free_array_of_disp) free(array_of_disp);
     free( array_of_datatype );
     return datatype;
 }
Index: ompi/datatype/convertor.h
===================================================================
--- ompi/datatype/convertor.h	(revision 16431)
+++ ompi/datatype/convertor.h	(working copy)
@@ -63,7 +63,7 @@
 struct ompi_convertor_master_t;

 typedef struct dt_stack {
-    int16_t   index;    /**< index in the element description */
+    int32_t   index;    /**< index in the element description */
     int16_t   type;     /**< the type used for the last pack/unpack (original or DT_BYTE) */
     size_t    count;    /**< number of times we still have to do it */
     ptrdiff_t disp;     /**< actual displacement depending on the count field */
@@ -204,6 +204,7 @@
     convertor->remoteArch = pSrcConv->remoteArch;
     convertor->flags      = (pSrcConv->flags | flags);
     convertor->master     = pSrcConv->master;
+
     return ompi_convertor_prepare_for_send( convertor, datatype, count, pUserBuf );
 }

@@ -261,8 +262,7 @@

     if( !(convertor->flags & CONVERTOR_WITH_CHECKSUM) &&
         (convertor->flags & DT_FLAG_NO_GAPS) &&
-        ((convertor->flags & CONVERTOR_SEND) ||
-         (convertor->flags & CONVERTOR_HOMOGENEOUS)) ) {
+        (convertor->flags & (CONVERTOR_SEND | CONVERTOR_HOMOGENEOUS)) ) {
         /* Contiguous and no checkpoint and no homogeneous unpack */
         convertor->bConverted = *position;
         return OMPI_SUCCESS;
Index: ompi/datatype/dt_optimize.c
===================================================================
--- ompi/datatype/dt_optimize.c	(revision 16431)
+++ ompi/datatype/dt_optimize.c	(working copy)
@@ -279,8 +279,8 @@
         dt_elem_desc_t* pElem = pData->desc.desc;

         index = GET_FIRST_NON_LOOP( pElem );
-        assert( pData->desc.desc[index].elem.common.flags & DT_FLAG_DATA );
-        first_elem_disp = pData->desc.desc[index].elem.disp;
+        assert( pElem[index].elem.common.flags & DT_FLAG_DATA );
+        first_elem_disp = pElem[index].elem.disp;
     }

     /* let's add a fake element at the end just to avoid useless comparaisons
Index: ompi/datatype/datatype_prototypes.h
===================================================================
--- ompi/datatype/datatype_prototypes.h	(revision 16431)
+++ ompi/datatype/datatype_prototypes.h	(working copy)
@@ -15,51 +15,51 @@

 #include "ompi_config.h"

-OMPI_DECLSPEC int32_t
+int32_t
 ompi_pack_homogeneous_contig( ompi_convertor_t* pConv,
                           struct iovec* iov, uint32_t* out_size,
                           size_t* max_data );
-OMPI_DECLSPEC int32_t
+int32_t
 ompi_pack_homogeneous_contig_checksum( ompi_convertor_t* pConv,
                                    struct iovec* iov, uint32_t* out_size,
                                    size_t* max_data );
-OMPI_DECLSPEC int32_t
+int32_t
 ompi_pack_homogeneous_contig_with_gaps( ompi_convertor_t* pConv,
                                     struct iovec* iov, uint32_t* out_size,
                                     size_t* max_data );
-OMPI_DECLSPEC int32_t
+int32_t
 ompi_pack_homogeneous_contig_with_gaps_checksum( ompi_convertor_t* pConv,
                                              struct iovec* iov, uint32_t* out_size,
                                              size_t* max_data );
-OMPI_DECLSPEC int32_t
+int32_t
 ompi_generic_simple_pack( ompi_convertor_t* pConvertor,
                           struct iovec* iov, uint32_t* out_size,
                           size_t* max_data );
-OMPI_DECLSPEC int32_t
+int32_t
 ompi_generic_simple_pack_checksum( ompi_convertor_t* pConvertor,
                                    struct iovec* iov, uint32_t* out_size,
                                    size_t* max_data );
-OMPI_DECLSPEC int32_t
+int32_t
 ompi_unpack_general( ompi_convertor_t* pConvertor,
                      struct iovec* iov, uint32_t* out_size,
                      size_t* max_data );
-OMPI_DECLSPEC int32_t
+int32_t
 ompi_unpack_general_checksum( ompi_convertor_t* pConvertor,
                               struct iovec* iov, uint32_t* out_size,
                               size_t* max_data );
-OMPI_DECLSPEC int32_t
+int32_t
 ompi_unpack_homogeneous_contig( ompi_convertor_t* pConv,
                                 struct iovec* iov, uint32_t* out_size,
                                 size_t* max_data );
-OMPI_DECLSPEC int32_t
+int32_t
 ompi_unpack_homogeneous_contig_checksum( ompi_convertor_t* pConv,
                                          struct iovec* iov, uint32_t* out_size,
                                          size_t* max_data );
-OMPI_DECLSPEC int32_t
+int32_t
 ompi_generic_simple_unpack( ompi_convertor_t* pConvertor,
                             struct iovec* iov, uint32_t* out_size,
                             size_t* max_data );
-OMPI_DECLSPEC int32_t
+int32_t
 ompi_generic_simple_unpack_checksum( ompi_convertor_t* pConvertor,
                                      struct iovec* iov, uint32_t* out_size,
                                      size_t* max_data );
