Hi,
opal_atomic_lifo implementation suffers from ABA problem.
Here is the code for opal_atomic_lifo_pop:
1 do {
2 item = lifo->opal_lifo_head;
3 if( opal_atomic_cmpset_ptr( &(lifo->opal_lifo_head),
4 item,
5 (void*)item->opal_list_next ) )
6 break;
7 /* Do some kind of pause to release the bus */
8 } while( 1 );
9 if( item == &(lifo->opal_lifo_ghost) ) return NULL;
10 item->opal_list_next = NULL;
11 return item;
If the following happens:
Thread1: Thread2:
1 executes 2
2 executes 1-11 and acquire
item
3 enters 3 but preempted before cmpxchg
NOTE: the third parameter passed to cmpset is
NULL because item is in use by thread2
4 executes lifo_push(item)
5 successfully executes cmpxchg since the old
value is equal to current value (ABA problem)
but places NULL into lifo->opal_lifo_head!
Included patch seems to be fixing this problem, but I am not really sure
if this is the right whay to solve this kind of problems.
diff --git a/opal/class/opal_atomic_lifo.h b/opal/class/opal_atomic_lifo.h
index caf35b1..4e8148c 100644
--- a/opal/class/opal_atomic_lifo.h
+++ b/opal/class/opal_atomic_lifo.h
@@ -71,8 +71,10 @@ static inline opal_list_item_t* opal_atomic_lifo_push(
opal_atomic_lifo_t* lifo,
item->opal_list_next = lifo->opal_lifo_head;
if( opal_atomic_cmpset_ptr( &(lifo->opal_lifo_head),
(void*)item->opal_list_next,
- item ) )
+ item ) ) {
+ opal_atomic_cmpset_32((volatile int32_t*)&item->item_free, 1, 0);
return (opal_list_item_t*)item->opal_list_next;
+ }
/* DO some kind of pause to release the bus */
} while( 1 );
#else
@@ -89,14 +91,17 @@ static inline opal_list_item_t* opal_atomic_lifo_pop(
opal_atomic_lifo_t* lifo )
{
opal_list_item_t* item;
#if OMPI_HAVE_THREAD_SUPPORT
- do {
- item = lifo->opal_lifo_head;
+ while((item = lifo->opal_lifo_head) != &(lifo->opal_lifo_ghost))
+ {
+ if(!opal_atomic_cmpset_32((volatile int32_t*)&item->item_free, 0, 1))
+ continue;
if( opal_atomic_cmpset_ptr( &(lifo->opal_lifo_head),
item,
(void*)item->opal_list_next ) )
break;
+ opal_atomic_cmpset_32((volatile int32_t*)&item->item_free, 1, 0);
/* Do some kind of pause to release the bus */
- } while( 1 );
+ }
#else
item = lifo->opal_lifo_head;
lifo->opal_lifo_head = (opal_list_item_t*)item->opal_list_next;
diff --git a/opal/class/opal_list.c b/opal/class/opal_list.c
index c8a5568..715715e 100644
--- a/opal/class/opal_list.c
+++ b/opal/class/opal_list.c
@@ -55,6 +55,7 @@ OBJ_CLASS_INSTANCE(
static void opal_list_item_construct(opal_list_item_t *item)
{
item->opal_list_next = item->opal_list_prev = NULL;
+ item->item_free = 1;
#if OMPI_ENABLE_DEBUG
item->opal_list_item_refcount = 0;
item->opal_list_item_belong_to = NULL;
diff --git a/opal/class/opal_list.h b/opal/class/opal_list.h
index 83fa57b..3a45f4e 100644
--- a/opal/class/opal_list.h
+++ b/opal/class/opal_list.h
@@ -102,6 +102,7 @@ struct opal_list_item_t
/**< Pointer to next list item */
volatile struct opal_list_item_t *opal_list_prev;
/**< Pointer to previous list item */
+ int32_t item_free;
#if OMPI_ENABLE_DEBUG
/** Atomic reference count for debugging */
--
Gleb.