Travis Vitek wrote:
Oh, yeah. That is the other thing that I did on Friday. I wrote a testcase
to compare __rw_atomic_add32() against InterlockedIncrement() on Win32.
There is a performance penalty...
I'd be curious to know if the performance penalty is due to the
function call overhead or something else.
In any case though, I think we could tweak the patch and change
the __rw_atomic_pre{de,in}crement() overloads for int and long
to call the appropriate Interlocked{De,In}crement() intrinsics
and have the other overloads use the new ones.
Farid, what do you think about this approach?
Martin
C:\Temp>t 2 && t 4 && t 8
---------- locked inc ---- atomic_add ---- 2 threads
ms 4266 4469
ms/op 0.00003178 0.00003330 -4.7586%
thr ms 18117 18437
thr ms/op 0.00013498 0.00013737 -1.7663%
---------- locked inc ---- atomic_add ---- 4 threads
ms 7969 8609
ms/op 0.00005937 0.00006414 -8.0311%
thr ms 36359 37019
thr ms/op 0.00027090 0.00027581 -1.8152%
---------- locked inc ---- atomic_add ---- 8 threads
ms 5016 5484
ms/op 0.00003737 0.00004086 -9.3301%
thr ms 60846 66130
thr ms/op 0.00045334 0.00049271 -8.6842%
C:\Temp>t 2 && t 4 && t 8
---------- locked inc ---- atomic_add ---- 2 threads
ms 2781 2906
ms/op 0.00002072 0.00002165 -4.4948%
thr ms 14961 16093
thr ms/op 0.00011147 0.00011990 -7.5663%
---------- locked inc ---- atomic_add ---- 4 threads
ms 2781 2891
ms/op 0.00002072 0.00002154 -3.9554%
thr ms 30867 31328
thr ms/op 0.00022998 0.00023341 -1.4935%
---------- locked inc ---- atomic_add ---- 8 threads
ms 2782 2890
ms/op 0.00002073 0.00002153 -3.8821%
thr ms 64318 64341
thr ms/op 0.00047921 0.00047938 -0.0358%
I will do a quick run using the string performance test after lunch.
I'll report the results on that later. I've pasted the source for the
bulk of my test below. If someone wants the entire thing, let me know
and I'll provide everything.
Travis
Martin Sebor wrote:
Subject: Re: [PATCH] Use __rw_atomic_xxx() on Windows
What's the status of this? We need to decide if we can put this
in 4.2 or defer it for 4.2.1. To put it in 4.2 we need to make
sure the new functions don't cause a performance regression in
basic_string. I.e., we need to see the before and after numbers.
Martin
Martin Sebor wrote:
One concern I have is performance. Does replacing the intrinsics with
out of line function call whose semantics the compiler has no idea
about have any impact on the runtime efficiency of the
generated code?
I would be especially interested in "real life" scenarios such as the
usage of the atomic operations in basic_string.
It would be good to see some before and after numbers. If you don't
have all the platforms to run the test post your benchmark and Travis
can help you put them together.
#include <stdio.h>
#include <stdlib.h>
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <process.h>
#include "lib.h"
#define MIN_THREADS 2
#define MAX_THREADS 16
// Time `iters` calls to the Win32 InterlockedIncrement() intrinsic on
// *val and return the elapsed wall-clock time in milliseconds
// (GetTickCount resolution, typically 10-16 ms).
unsigned long locked_inc(long* val, long iters)
{
    const unsigned long start = GetTickCount ();

    for (long i = 0; i != iters; ++i)
        InterlockedIncrement(val);

    const unsigned long stop = GetTickCount ();

    return stop - start;
}
// Time `iters` calls to the library's __rw_atomic_add32() on *val and
// return the elapsed wall-clock time in milliseconds (GetTickCount
// resolution, typically 10-16 ms).
unsigned long atomic_add(long* val, long iters)
{
    const unsigned long start = GetTickCount ();

    for (long i = 0; i != iters; ++i)
        __rw_atomic_add32(val, 1);

    const unsigned long stop = GetTickCount ();

    return stop - start;
}
// Per-thread argument/result record passed to thread_func(); one
// element per thread, filled in by run_threads() before launch.
struct thread_param {
// shared counter every thread increments (all threads point at the
// same variable)
long* variable;
// number of iterations the benchmark function performs
long iters;
// benchmark function to invoke (locked_inc or atomic_add)
unsigned long (*fun)(long*, long);
// elapsed milliseconds reported by `fun`, written by the worker thread
unsigned long result;
// thread handle used by main thread to wait for completion
HANDLE thread;
};
extern "C" {
void thread_func(void* p)
{
thread_param* param = (thread_param*)p;
param->result = (param->fun)(param->variable, param->iters);
}
} // extern "C"
// Launch `nthreads` threads, each invoking `fun` for `iters` operations
// on a single shared counter, wait for them all, and return the sum of
// the per-thread elapsed times in milliseconds. `nthreads` must not
// exceed MAX_THREADS (the caller clamps it).
unsigned long run_threads(int nthreads, unsigned long (*fun)(long*,
long), long iters)
{
    thread_param params[MAX_THREADS];
    long thread_var = 0;

    int i;
    for (i = 0; i < nthreads; ++i) {
        params[i].variable = &thread_var;
        params[i].result   = 0;
        params[i].fun      = fun;
        params[i].iters    = iters;
    }

    int n;
    for (n = 0; n < nthreads; ++n) {
        params[n].thread = (HANDLE)_beginthread(thread_func, 0,
                                                &params[n]);

        // _beginthread returns -1L on failure; record a null handle so
        // the wait loop below skips threads that never started
        if (params[n].thread == (HANDLE)-1L)
            params[n].thread = 0;
    }

    unsigned long thread_time = 0;
    for (n = 0; n < nthreads; ++n) {
        // NOTE(review): _beginthread closes its handle automatically
        // when the thread exits, so this wait can race with thread
        // termination; _beginthreadex would give a reliably waitable
        // handle -- confirm whether that matters for this benchmark
        if (params[n].thread)
            WaitForSingleObject (params[n].thread, INFINITE);

        thread_time += params[n].result;
    }

    return thread_time;
}
// Benchmark driver: compare InterlockedIncrement() against
// __rw_atomic_add32(), first single-threaded, then with argv[1]
// threads (clamped to [MIN_THREADS, MAX_THREADS]). Prints elapsed ms,
// ms/op, and the percentage slowdown of atomic_add relative to
// locked_inc.
int main(int argc, char* argv[])
{
    int nthreads = MIN_THREADS;
    if (1 < argc)
        nthreads = atoi(argv[1]);

    // cap thread count
    if (nthreads < MIN_THREADS)
        nthreads = MIN_THREADS;
    else if (MAX_THREADS < nthreads)
        nthreads = MAX_THREADS;

    // iterations per timing run
    const long ops = 0x7ffffff;

    // single-threaded baseline for both primitives
    long thread_var;

    thread_var = 0;
    unsigned long locked_inc_ms = locked_inc (&thread_var, ops);

    thread_var = 0;
    unsigned long atomic_add_ms = atomic_add (&thread_var, ops);

    printf("---------- locked inc ---- atomic_add ---- %d threads\n",
           nthreads);

    // %lu matches the unsigned long arguments; the original "%8.u"
    // both mismatched the type and carried a stray (zero) precision
    printf("ms %8lu %8lu\n", locked_inc_ms,
           atomic_add_ms);

    float locked_inc_ops_p_ms = 1.f * locked_inc_ms / ops;
    float atomic_add_ops_p_ms = 1.f * atomic_add_ms / ops;

    printf("ms/op %8.8f %8.8f %.4f%%\n",
           locked_inc_ops_p_ms, atomic_add_ops_p_ms,
           100.f * (locked_inc_ops_p_ms - atomic_add_ops_p_ms) /
           locked_inc_ops_p_ms);

    // do it with threads; report the average per-thread time
    locked_inc_ms = run_threads(nthreads, locked_inc, ops);
    atomic_add_ms = run_threads(nthreads, atomic_add, ops);

    locked_inc_ms /= nthreads;
    atomic_add_ms /= nthreads;

    printf("thr ms %8lu %8lu\n", locked_inc_ms,
           atomic_add_ms);

    locked_inc_ops_p_ms = 1.f * locked_inc_ms / ops;
    atomic_add_ops_p_ms = 1.f * atomic_add_ms / ops;

    printf("thr ms/op %8.8f %8.8f %.4f%%\n",
           locked_inc_ops_p_ms, atomic_add_ops_p_ms,
           100.f * (locked_inc_ops_p_ms - atomic_add_ops_p_ms) /
           locked_inc_ops_p_ms);

    return 0;
}