Hi, opal_atomic_lifo implementation suffers from ABA problem. Here is the code for opal_atomic_lifo_pop:
1 do { 2 item = lifo->opal_lifo_head; 3 if( opal_atomic_cmpset_ptr( &(lifo->opal_lifo_head), 4 item, 5 (void*)item->opal_list_next ) ) 6 break; 7 /* Do some kind of pause to release the bus */ 8 } while( 1 ); 9 if( item == &(lifo->opal_lifo_ghost) ) return NULL; 10 item->opal_list_next = NULL; 11 return item; If the following happens: Thread1: Thread2: 1 executes 2 2 executes 1-11 and acquire item 3 enters 3 but preempted before cmpxchg NOTE: the third parameter passed to cmpset is NULL because item is in use by thread2 4 executes lifo_push(item) 5 successfully executes cmpxchg since the old value is equal to current value (ABA problem) but places NULL into lifo->opal_lifo_head! Included patch seems to be fixing this problem, but I am not really sure if this is the right whay to solve this kind of problems. diff --git a/opal/class/opal_atomic_lifo.h b/opal/class/opal_atomic_lifo.h index caf35b1..4e8148c 100644 --- a/opal/class/opal_atomic_lifo.h +++ b/opal/class/opal_atomic_lifo.h @@ -71,8 +71,10 @@ static inline opal_list_item_t* opal_atomic_lifo_push( opal_atomic_lifo_t* lifo, item->opal_list_next = lifo->opal_lifo_head; if( opal_atomic_cmpset_ptr( &(lifo->opal_lifo_head), (void*)item->opal_list_next, - item ) ) + item ) ) { + opal_atomic_cmpset_32((volatile int32_t*)&item->item_free, 1, 0); return (opal_list_item_t*)item->opal_list_next; + } /* DO some kind of pause to release the bus */ } while( 1 ); #else @@ -89,14 +91,17 @@ static inline opal_list_item_t* opal_atomic_lifo_pop( opal_atomic_lifo_t* lifo ) { opal_list_item_t* item; #if OMPI_HAVE_THREAD_SUPPORT - do { - item = lifo->opal_lifo_head; + while((item = lifo->opal_lifo_head) != &(lifo->opal_lifo_ghost)) + { + if(!opal_atomic_cmpset_32((volatile int32_t*)&item->item_free, 0, 1)) + continue; if( opal_atomic_cmpset_ptr( &(lifo->opal_lifo_head), item, (void*)item->opal_list_next ) ) break; + opal_atomic_cmpset_32((volatile int32_t*)&item->item_free, 1, 0); /* Do some kind of pause to release the bus */ - } while( 1 ); + } #else item = lifo->opal_lifo_head; lifo->opal_lifo_head = (opal_list_item_t*)item->opal_list_next; diff --git a/opal/class/opal_list.c b/opal/class/opal_list.c index c8a5568..715715e 100644 --- a/opal/class/opal_list.c +++ b/opal/class/opal_list.c @@ -55,6 +55,7 @@ OBJ_CLASS_INSTANCE( static void opal_list_item_construct(opal_list_item_t *item) { item->opal_list_next = item->opal_list_prev = NULL; + item->item_free = 1; #if OMPI_ENABLE_DEBUG item->opal_list_item_refcount = 0; item->opal_list_item_belong_to = NULL; diff --git a/opal/class/opal_list.h b/opal/class/opal_list.h index 83fa57b..3a45f4e 100644 --- a/opal/class/opal_list.h +++ b/opal/class/opal_list.h @@ -102,6 +102,7 @@ struct opal_list_item_t /**< Pointer to next list item */ volatile struct opal_list_item_t *opal_list_prev; /**< Pointer to previous list item */ + int32_t item_free; #if OMPI_ENABLE_DEBUG /** Atomic reference count for debugging */ -- Gleb.