When DEBUG_SPINLOCK is turned off, the size of a spinlock_t drops to
4 bytes. As a result, runqueue_lock (a spinlock_t) and tasklist_lock
(an rwlock_t) are allocated on the same cache line, which produces a great
deal of contention for that single line. The symptoms of the problem were that
the machine (dual 667 MHz CPUs) would sometimes "hang" for several minutes
during boot, and occasionally during shutdown as well. After looking at several
dumps of the machine in this state, I noticed it always involved one process
doing a wait(2) for a zombie child which had exited.
The parent task was always in the following loop in the release() function:
do {
        spin_lock_irq(&runqueue_lock);
        has_cpu = p->has_cpu;
        spin_unlock_irq(&runqueue_lock);
} while (has_cpu);
while the child was in various stages of disassociate_ctty(), but was always
trying to lock / unlock the tasklist_lock.
The cause of the problem is that the parent's ldl_l / stl_c sequence for the
runqueue_lock was constantly interfering with the child's ldl_l / stl_c
sequence for the tasklist_lock: since the two locks share a cache line, each
CPU's store to its own lock falls within the other CPU's load-locked range,
so the other CPU's stl_c fails and it has to start over, and neither side
makes progress.
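For what it's worth, the cache-line interaction can be reproduced in user
space. The sketch below is my own illustration (the names, the 60-byte pad
and the iteration count are made up) and only shows ordinary false sharing
between two independent locks; on Alpha the effect is worse because the
interfering write actually clears the ldl_l reservation, so the stl_c fails
outright. The shape is the same, though: two threads that each use only their
own lock still fight over a single cache line, and uncommenting the pad makes
the loop run dramatically faster.

/*
 * User-space false-sharing sketch (illustration only, not kernel code).
 * Each thread takes and releases its own lock ten million times; the
 * only interaction between the threads is the shared cache line.
 * Build with: cc -O2 -pthread falseshare.c
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct locks {
        atomic_int a;           /* stands in for runqueue_lock */
        /* char pad[60]; */     /* uncomment to split the cache line */
        atomic_int b;           /* stands in for tasklist_lock */
};

static struct locks shared;

static void *hammer(void *arg)
{
        atomic_int *lock = arg;
        int i, expected;

        for (i = 0; i < 10 * 1000 * 1000; i++) {
                expected = 0;
                /* The CAS succeeds right away (nobody else touches this
                 * lock), but it still has to pull the cache line over in
                 * exclusive state, bouncing it between the two CPUs. */
                while (!atomic_compare_exchange_weak(lock, &expected, 1))
                        expected = 0;
                atomic_store(lock, 0);
        }
        return NULL;
}

int main(void)
{
        pthread_t t1, t2;

        pthread_create(&t1, NULL, hammer, &shared.a);
        pthread_create(&t2, NULL, hammer, &shared.b);
        pthread_join(t1, NULL);
        pthread_join(t2, NULL);
        puts("done");
        return 0;
}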
A quick workaround for this particular instance would be simply to ensure
these locks do not land on the same cache line by changing their declaration
in sched.c as follows:
--- /sw/linux-2.3.99-pre9/kernel/sched.c Tue May 2 19:49:58 2000
+++ sched.c Fri Jun 9 12:51:54 2000
@@ -60,8 +60,12 @@
* The run-queue lock locks the parts that actually access
* and change the run-queues, and have to be interrupt-safe.
*/
-spinlock_t runqueue_lock = SPIN_LOCK_UNLOCKED; /* second */
-rwlock_t tasklist_lock = RW_LOCK_UNLOCKED; /* third */
+/*
+ * These can be hot locks so put them on their own cache line, we don't
+ * want these in the same cacheline.
+ */
+__cacheline_aligned spinlock_t runqueue_lock = SPIN_LOCK_UNLOCKED;  /* second */
+__cacheline_aligned rwlock_t tasklist_lock = RW_LOCK_UNLOCKED;      /* third */
static LIST_HEAD(runqueue_head);
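As far as I can tell, __cacheline_aligned boils down to an alignment attribute
keyed to the architecture's cache line size; in plain C the effect is roughly
the sketch below (the 64-byte figure and the names here are my assumptions,
not taken from the kernel headers):

/*
 * Rough user-space equivalent of what __cacheline_aligned achieves
 * (illustration only; the real macro uses the architecture's line size).
 */
#include <stdio.h>

#define ASSUMED_CACHE_BYTES 64
#define my_cacheline_aligned __attribute__((__aligned__(ASSUMED_CACHE_BYTES)))

/* Each lock now starts on its own cache line, so spinning on one of
 * them no longer touches the line that holds the other. */
static int lock_a my_cacheline_aligned;
static int lock_b my_cacheline_aligned;

int main(void)
{
        printf("lock_a at %p, lock_b at %p\n", (void *)&lock_a, (void *)&lock_b);
        return 0;
}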
You could argue these two locks warrant being on their own cache lines just
to avoid "ping-ponging" a single cache line around the system, but this does
not prevent the same issue from occurring for a different set of locks.
Another approach is to modify the spin_lock and read_lock / write_lock macros
so that they wait a short, effectively random period of time when the store
conditional fails. The patch appended below shows one possible implementation.
With this code we back off after a failed store conditional by waiting for the
cycle counter to advance a certain amount. The logic does not really affect
the mainline path, since it only runs when the store conditional fails, as
opposed to when the lock is simply unavailable. To account for varying cycle
frequencies, the backoff is increased whenever the previous waiting period
was insufficient.
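To make the intent clearer, here is a rough user-space rendering of the same
idea in C11 atomics (my own sketch, not the patch; the names, the delay-loop
mechanism and the SPINLOCK_BACKOFF_MAX cap are assumptions). C has no direct
analogue of a failed stl_c, so this version backs off whenever an acquire
attempt fails, whereas the patch is more precise and only delays on the
store-conditional failure path. A fixed 2^backoff-iteration delay loop stands
in for the patch's wait for a given bit of the cycle counter to flip, which
gives a delay anywhere from 0 to 2^backoff cycles.

/*
 * C11 sketch of a spinlock with increasing backoff (illustration only).
 * On each failed acquire we burn roughly 2^backoff iterations before
 * retrying, then bump the exponent so the next wait is twice as long,
 * much like the rpcc-based delay in the patch.
 */
#include <stdatomic.h>

#define SPINLOCK_BACKOFF_INIT 12        /* mirrors SPINLOCK_BACKOFF above */
#define SPINLOCK_BACKOFF_MAX  20        /* my addition: cap the exponent */

void backoff_spin_lock(atomic_int *lock)
{
        unsigned int backoff = SPINLOCK_BACKOFF_INIT;
        volatile unsigned long i;

        for (;;) {
                /* Fast path: uncontended acquire, no delay at all. */
                if (atomic_load_explicit(lock, memory_order_relaxed) == 0 &&
                    atomic_exchange_explicit(lock, 1, memory_order_acquire) == 0)
                        return;

                /* Acquire failed: wait ~2^backoff iterations, then widen
                 * the window for the next attempt. */
                for (i = 0; i < (1UL << backoff); i++)
                        /* spin */ ;
                if (backoff < SPINLOCK_BACKOFF_MAX)
                        backoff++;
        }
}

void backoff_spin_unlock(atomic_int *lock)
{
        atomic_store_explicit(lock, 0, memory_order_release);
}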
All other ldl_l / stl_c sequences are susceptible to the same problem, but I
have held off changing those until I get some feedback on this approach. Do
you see any problems with it? Is there a better solution?
Thanks,
Pat
--
Patrick O'Rourke
[EMAIL PROTECTED]
P.S. This patch is based on linux-2.4.0-test1-ac21:
--- spinlock.h-orig Mon Jun 19 14:06:09 2000
+++ spinlock.h Mon Jun 19 14:04:05 2000
@@ -7,6 +7,7 @@
#define DEBUG_SPINLOCK 1
#define DEBUG_RWLOCK 1
+#define SPINLOCK_BACKOFF 12
/*
* Simple spin lock operations. There are two variants, one clears IRQ's
@@ -72,19 +73,29 @@
of this object file's text section so as to perfect
branch prediction. */
__asm__ __volatile__(
+ " bis $31,%3,$22\n"
"1: ldl_l %0,%1\n"
" blbs %0,2f\n"
" or %0,1,%0\n"
" stl_c %0,%1\n"
- " beq %0,2f\n"
+ " beq %0,3f\n"
" mb\n"
".subsection 2\n"
"2: ldl %0,%1\n"
" blbs %0,2b\n"
" br 1b\n"
+ "3: rpcc $23\n"
+ " srl $23,$22,$23\n"
+ "4: rpcc %0\n"
+ " srl %0,$22,%0\n"
+ " xor $23,%0,%0\n"
+ " blbc %0,4b\n"
+ " addq $22,1,$22\n"
+ " br 1b\n"
".previous"
: "=r" (tmp), "=m" (__dummy_lock(lock))
- : "m"(__dummy_lock(lock)));
+ : "m"(__dummy_lock(lock)), "i" (SPINLOCK_BACKOFF)
+ : "$22", "$23");
}
#define spin_trylock(lock) (!test_and_set_bit(0,(lock)))
@@ -108,20 +119,29 @@
long regx;
__asm__ __volatile__(
+ " bis $31,%3,$22\n"
"1: ldl_l %1,%0\n"
" bne %1,6f\n"
" or $31,1,%1\n"
" stl_c %1,%0\n"
- " beq %1,6f\n"
+ " beq %1,7f\n"
" mb\n"
".subsection 2\n"
"6: ldl %1,%0\n"
" bne %1,6b\n"
" br 1b\n"
+ "7: rpcc $23\n"
+ " srl $23,$22,$23\n"
+ "8: rpcc %1\n"
+ " srl %1,$22,%1\n"
+ " xor $23,%1,%1\n"
+ " blbc %1,8b\n"
+ " addq $22,1,$22\n"
+ " br 1b\n"
".previous"
: "=m" (__dummy_lock(lock)), "=&r" (regx)
- : "0" (__dummy_lock(lock))
- );
+ : "0" (__dummy_lock(lock)), "i" (SPINLOCK_BACKOFF)
+ : "$22", "$23");
}
static inline void read_lock(rwlock_t * lock)
@@ -129,20 +149,29 @@
long regx;
__asm__ __volatile__(
+ " bis $31,%3,$22\n"
"1: ldl_l %1,%0\n"
" blbs %1,6f\n"
" subl %1,2,%1\n"
" stl_c %1,%0\n"
- " beq %1,6f\n"
+ " beq %1,7f\n"
"4: mb\n"
".subsection 2\n"
"6: ldl %1,%0\n"
" blbs %1,6b\n"
" br 1b\n"
+ "7: rpcc $23\n"
+ " srl $23,$22,$23\n"
+ "8: rpcc %1\n"
+ " srl %1,$22,%1\n"
+ " xor $23,%1,%1\n"
+ " blbc %1,8b\n"
+ " addq $22,1,$22\n"
+ " br 1b\n"
".previous"
: "=m" (__dummy_lock(lock)), "=&r" (regx)
- : "m" (__dummy_lock(lock))
- );
+ : "m" (__dummy_lock(lock)), "i" (SPINLOCK_BACKOFF)
+ : "$22", "$23");
}
#endif /* DEBUG_RWLOCK */
@@ -156,15 +185,24 @@
{
long regx;
__asm__ __volatile__(
+ " bis $31,%3,$22\n"
"1: ldl_l %1,%0\n"
" addl %1,2,%1\n"
" stl_c %1,%0\n"
" beq %1,6f\n"
".subsection 2\n"
- "6: br 1b\n"
+ "6: rpcc $23\n"
+ " srl $23,$22,$23\n"
+ "7: rpcc %1\n"
+ " srl %1,$22,%1\n"
+ " xor $23,%1,%1\n"
+ " blbc %1,7b\n"
+ " addq $22,1,$22\n"
+ " br 1b\n"
".previous"
: "=m" (__dummy_lock(lock)), "=&r" (regx)
- : "m" (__dummy_lock(lock)));
+ : "m" (__dummy_lock(lock)), "i" (SPINLOCK_BACKOFF)
+ : "$22", "$23");
}
#endif /* _ALPHA_SPINLOCK_H */