On Sun, 8 Jul 2007, Nick Piggin wrote:

> I made some tests of the queued spinlock code using userspace test code on
> 64-bit processors. I believe the xadd based code no longer has any
> theoretical memory ordering problems.
>
> The tests were done on 3 different architectures of different speeds and
> vintages (so not really comparable between columns). I have attached the
> two programs I used to make the results. Times are total elapsed time
> divided by the number of locks (so effectively you get the time required
> for lock+unlock).
>
> The threaded results also attempt to have an unfairness count, which is
> the max number of times in a row that a lock is acquired, when all other
> threads are also executing in the loop -- the reason xadd for example is
> not always 0 there is because the other threads may not have reached the
> lock before the current thread was able to get it several times (eg. if
> an interrupt comes in, this could happen).
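For reference, the unfairness count above could be tracked roughly like this
(this is not Nick's attached threaded program, just a sketch with made-up
names, to make the metric concrete):

#define NTHREADS 4                      /* assumed thread count */

static volatile int threads_in_loop;    /* each thread increments this when it
                                           enters the lock loop, decrements on exit */
static int last_owner = -1;
static int streak;
static int max_streak;                  /* reported as the unfairness count */

/* Called by thread "tid" right after it acquires the lock. */
static void record_acquire(int tid)
{
        if (tid == last_owner && threads_in_loop == NTHREADS)
                streak++;
        else
                streak = 1;
        last_owner = tid;
        if (streak > max_streak)
                max_streak = streak;
}
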
I was trying to see how costly it is to have a double-short counter instead
of a single byte. The V-lock uses two short counters without the double-xadd
optimization, while the Z-lock uses it. (A plain-C sketch of the two acquire
paths follows the test program below.) This is a dual Opteron 252, but the
test is single-threaded only (multi-threading really has no meaning for this
test):

inc-lock in cache takes 7.28ns
xadd-lock in cache takes 8.93ns
vadd-lock in cache takes 10.35ns
zadd-lock in cache takes 8.43ns
inc-lock out of cache takes 87.85ns
xadd-lock out of cache takes 88.15ns
vadd-lock out of cache takes 89.38ns
zadd-lock out of cache takes 88.10ns

In this box, the V-lock pays for its unconditional lfence. If I remove the
lfence from the V-lock (not actually possible, since the lock needs it),
I get:

inc-lock in cache takes 7.28ns
xadd-lock in cache takes 8.93ns
vadd-lock in cache takes 7.67ns
zadd-lock in cache takes 8.44ns
inc-lock out of cache takes 87.87ns
xadd-lock out of cache takes 88.15ns
vadd-lock out of cache takes 88.24ns
zadd-lock out of cache takes 88.12ns

So in this box, and in this test, the double-short Z-lock seems faster than
the double-byte xadd lock. I have no idea why, since it uses two more
instructions and an extra register.


- Davide


#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h>

struct page {
        short lock[2];
        unsigned int next;
};

#define NR_PAGES (2*1024*1024)

static struct page pages[NR_PAGES];

#define LOCK_INIT(l) (l)[0] = 1

static inline void lock(short *lock)
{
        __asm__ __volatile__ ("1:\n\t"
                              "lock ; decb %0\n\t"
                              "jns 2f\n\t"
                              "3:\n\t"
                              "rep ; nop\n\t"
                              "cmpb $0,%0\n\t"
                              "jle 3b\n\t"
                              "jmp 1b\n\t"
                              "2:\n\t"
                              : "+m" (*lock) : : "memory");
}

static inline void unlock(short *lock)
{
        __asm__ __volatile__ ("movb $1,%0\n\t"
                              : "+m" (*lock) : : "memory");
}

#define XLOCK_INIT(l) (l)[0] = 0

static inline void xlock(short *lock)
{
        short i = 0x0100;

        __asm__ __volatile__ ("lock ; xaddw %%ax, %1\n\t"
                              "1:\n\t"
                              "cmpb %%ah, %%al\n\t"
                              "je 2f\n\t"
                              "rep ; nop\n\t"
                              "movb %1, %%al\n\t"
                              "lfence\n\t"
                              "jmp 1b\n\t"
                              "2:\n\t"
                              : "+a" (i), "+m" (*lock) : : "memory");
}

static inline void xunlock(short *lock)
{
        __asm__ __volatile__ ("incb %0\n\t"
                              : "+m" (*lock) : : "memory");
}

#define VLOCK_INIT(l) (l)[0] = 0, (l)[1] = 0

static inline void vlock(short *lock)
{
        __asm__ __volatile__ ("lock ; xaddw %%ax, %0\n\t"
                              "1:\n\t"
                              "cmpw %%ax, %1\n\t"
                              "je 2f\n\t"
                              "rep ; nop\n\t"
                              "jmp 1b\n\t"
                              "2:\n\t"
                              "lfence\n\t"
                              : "+m" (lock[1])
                              : "m" (lock[0]), "a" (1)
                              : "memory");
}

static inline void vunlock(short *lock)
{
        __asm__ __volatile__ ("incw %0\n\t"
                              : "+m" (lock[0]) : : "memory");
}

#define ZLOCK_INIT(l) (l)[0] = 0, (l)[1] = 0

static inline void zlock(short *lock)
{
        __asm__ __volatile__ ("lock ; xaddl %%eax, %0\n\t"
                              "mov %%eax, %%ebx\n\t"
                              "shr $16, %%ebx\n\t"
                              "1:\n\t"
                              "cmpw %%ax, %%bx\n\t"
                              "je 2f\n\t"
                              "rep ; nop\n\t"
                              "movw %1, %%bx\n\t"
                              "lfence\n\t"
                              "jmp 1b\n\t"
                              "2:\n\t"
                              : "+m" (*(int *) lock)
                              : "m" (lock[0]), "a" (0x10000)
                              : "ebx", "memory");
}

static inline void zunlock(short *lock)
{
        __asm__ __volatile__ ("incw %0\n\t"
                              : "+m" (lock[0]) : : "memory");
}

static int xlock_is_locked(short *lock)
{
        short tmp = *lock;
        char *x = (char *)&tmp;

        return (*x != *(x+1));
}

#define ITERS (16*1024*1024)
#define IN_ITERS (ITERS*5)

int main(void)
{
        int nr_pages;
        int nr;
        int i;
        struct page *p;
        struct timeval start, end;
        unsigned long long usec;
        unsigned int tmp;

        nr_pages = 10;
        srandom(10);
        p = &pages[0];
        i = 0;
        nr = 0;
        while (nr < nr_pages-1) {
                unsigned int n;

                n = random() % NR_PAGES;
                while (p == &pages[n] || pages[n].next)
                        n = (n+1) % NR_PAGES;
                p->next = n;
                p = &pages[n];
                nr++;
        }
        p->next = 0;
        for (i = 0; i < NR_PAGES; i++)
                LOCK_INIT(pages[i].lock);
        gettimeofday(&start, NULL);
        p = &pages[0];
        for (i = 0; i < IN_ITERS; i++) {
                lock(&p->lock[0]);
                tmp = p->next;
                unlock(&p->lock[0]);
                p = &pages[tmp];
        }
        gettimeofday(&end, NULL);
        usec = end.tv_usec + 1000000*(end.tv_sec - start.tv_sec) - start.tv_usec;
        printf("inc-lock in cache takes %0.2lfns\n", (double)usec * 1000 / IN_ITERS);

        for (i = 0; i < NR_PAGES; i++)
                XLOCK_INIT(pages[i].lock);
        gettimeofday(&start, NULL);
        p = &pages[0];
        for (i = 0; i < IN_ITERS; i++) {
                xlock(&p->lock[0]);
                tmp = p->next;
                xunlock(&p->lock[0]);
                p = &pages[tmp];
        }
        gettimeofday(&end, NULL);
        usec = end.tv_usec + 1000000*(end.tv_sec - start.tv_sec) - start.tv_usec;
        printf("xadd-lock in cache takes %0.2lfns\n", (double)usec * 1000 / IN_ITERS);

        for (i = 0; i < NR_PAGES; i++)
                VLOCK_INIT(pages[i].lock);
        gettimeofday(&start, NULL);
        p = &pages[0];
        for (i = 0; i < IN_ITERS; i++) {
                vlock(&p->lock[0]);
                tmp = p->next;
                vunlock(&p->lock[0]);
                p = &pages[tmp];
        }
        gettimeofday(&end, NULL);
        usec = end.tv_usec + 1000000*(end.tv_sec - start.tv_sec) - start.tv_usec;
        printf("vadd-lock in cache takes %0.2lfns\n", (double)usec * 1000 / IN_ITERS);

        for (i = 0; i < NR_PAGES; i++)
                ZLOCK_INIT(pages[i].lock);
        gettimeofday(&start, NULL);
        p = &pages[0];
        for (i = 0; i < IN_ITERS; i++) {
                zlock(&p->lock[0]);
                tmp = p->next;
                zunlock(&p->lock[0]);
                p = &pages[tmp];
        }
        gettimeofday(&end, NULL);
        usec = end.tv_usec + 1000000*(end.tv_sec - start.tv_sec) - start.tv_usec;
        printf("zadd-lock in cache takes %0.2lfns\n", (double)usec * 1000 / IN_ITERS);

        for (i = 0; i < NR_PAGES; i++)
                pages[i].next = 0;
        nr_pages = NR_PAGES;
        srandom(10);
        p = &pages[0];
        i = 0;
        nr = 0;
        while (nr < nr_pages-1) {
                unsigned int n;

                n = random() % NR_PAGES;
                while (p == &pages[n] || pages[n].next)
                        n = (n+1) % NR_PAGES;
                p->next = n;
                p = &pages[n];
                nr++;
        }
        p->next = 0;
        for (i = 0; i < NR_PAGES; i++)
                LOCK_INIT(pages[i].lock);
        gettimeofday(&start, NULL);
        p = &pages[0];
        for (i = 0; i < ITERS; i++) {
                lock(&p->lock[0]);
                tmp = p->next;
                unlock(&p->lock[0]);
                p = &pages[tmp];
        }
        gettimeofday(&end, NULL);
        usec = end.tv_usec + 1000000*(end.tv_sec - start.tv_sec) - start.tv_usec;
        printf("inc-lock out of cache takes %0.2lfns\n", (double)usec * 1000 / ITERS);

        for (i = 0; i < NR_PAGES; i++)
                XLOCK_INIT(pages[i].lock);
        gettimeofday(&start, NULL);
        p = &pages[0];
        for (i = 0; i < ITERS; i++) {
                xlock(&p->lock[0]);
                tmp = p->next;
                xunlock(&p->lock[0]);
                p = &pages[tmp];
        }
        gettimeofday(&end, NULL);
        usec = end.tv_usec + 1000000*(end.tv_sec - start.tv_sec) - start.tv_usec;
        printf("xadd-lock out of cache takes %0.2lfns\n", (double)usec * 1000 / ITERS);

        for (i = 0; i < NR_PAGES; i++)
                VLOCK_INIT(pages[i].lock);
        gettimeofday(&start, NULL);
        p = &pages[0];
        for (i = 0; i < ITERS; i++) {
                vlock(&p->lock[0]);
                tmp = p->next;
                vunlock(&p->lock[0]);
                p = &pages[tmp];
        }
        gettimeofday(&end, NULL);
        usec = end.tv_usec + 1000000*(end.tv_sec - start.tv_sec) - start.tv_usec;
        printf("vadd-lock out of cache takes %0.2lfns\n", (double)usec * 1000 / ITERS);

        for (i = 0; i < NR_PAGES; i++)
                ZLOCK_INIT(pages[i].lock);
        gettimeofday(&start, NULL);
        p = &pages[0];
        for (i = 0; i < ITERS; i++) {
                zlock(&p->lock[0]);
                tmp = p->next;
                zunlock(&p->lock[0]);
                p = &pages[tmp];
        }
        gettimeofday(&end, NULL);
        usec = end.tv_usec + 1000000*(end.tv_sec - start.tv_sec) - start.tv_usec;
        printf("zadd-lock out of cache takes %0.2lfns\n", (double)usec * 1000 / ITERS);

        return 0;
}
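PS: as mentioned above, here is roughly what the V-lock and Z-lock acquire
paths do, written in plain C with GCC __sync builtins instead of inline asm.
Illustrative only: this is not the code that was timed, the struct and
function names are made up, and it assumes the same little-endian layout as
the asm versions (lock[0] is the owner counter, lock[1] is the ticket
counter).

#include <stdint.h>

struct tlock {
        volatile unsigned short owner;  /* lock[0] above: ticket now being served */
        volatile unsigned short next;   /* lock[1] above: next ticket to hand out */
} __attribute__ ((aligned (4)));        /* keep the 32-bit xadd aligned */

/* V-style: xadd only the "next" counter, then poll "owner" separately. */
static inline void vlock_c(struct tlock *l)
{
        unsigned short ticket = __sync_fetch_and_add(&l->next, 1);

        while (l->owner != ticket)
                __asm__ __volatile__ ("rep ; nop" : : : "memory");
}

/*
 * Z-style: one 32-bit xadd returns both counters at once, so the first
 * owner check comes back with the xadd itself and needs no extra load.
 */
static inline void zlock_c(struct tlock *l)
{
        uint32_t both = __sync_fetch_and_add((volatile uint32_t *) l, 0x10000);
        unsigned short ticket = both >> 16;     /* our ticket (old "next") */
        unsigned short owner = both & 0xffff;

        while (owner != ticket) {
                __asm__ __volatile__ ("rep ; nop" : : : "memory");
                owner = l->owner;               /* re-read only the owner */
        }
}

/* Unlock is the same for both: hand the lock to the next ticket. */
static inline void tunlock_c(struct tlock *l)
{
        __sync_fetch_and_add(&l->owner, 1);
}

The wide xadd is what the Z-lock spends the extra mov/shr and %ebx on; in
exchange, the first owner check does not need a separate load from the lock
word.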