Travis Vitek wrote:
Oh, yeah. That is the other thing that I did on Friday. I wrote a testcase
to compare __rw_atomic_add32() against InterlockedIncrement() on Win32.
There is a performance penalty...
I'd be curious to know if the performance penalty is due to the
function call overhead or something else.
In any case though, I think we could tweak the patch and change
the __rw_atomic_pre{de,in}crement() overloads for int and long
to call the appropriate Interlocked{De,In}crement() intrinsics
and have the other overloads use the new ones.
Farid, what do you think about this approach?
Martin
C:\Temp>t 2 && t 4 && t 8
---------- locked inc ---- atomic_add ---- 2 threads
ms 4266 4469
ms/op 0.00003178 0.00003330 -4.7586%
thr ms 18117 18437
thr ms/op 0.00013498 0.00013737 -1.7663%
---------- locked inc ---- atomic_add ---- 4 threads
ms 7969 8609
ms/op 0.00005937 0.00006414 -8.0311%
thr ms 36359 37019
thr ms/op 0.00027090 0.00027581 -1.8152%
---------- locked inc ---- atomic_add ---- 8 threads
ms 5016 5484
ms/op 0.00003737 0.00004086 -9.3301%
thr ms 60846 66130
thr ms/op 0.00045334 0.00049271 -8.6842%
C:\Temp>t 2 && t 4 && t 8
---------- locked inc ---- atomic_add ---- 2 threads
ms 2781 2906
ms/op 0.00002072 0.00002165 -4.4948%
thr ms 14961 16093
thr ms/op 0.00011147 0.00011990 -7.5663%
---------- locked inc ---- atomic_add ---- 4 threads
ms 2781 2891
ms/op 0.00002072 0.00002154 -3.9554%
thr ms 30867 31328
thr ms/op 0.00022998 0.00023341 -1.4935%
---------- locked inc ---- atomic_add ---- 8 threads
ms 2782 2890
ms/op 0.00002073 0.00002153 -3.8821%
thr ms 64318 64341
thr ms/op 0.00047921 0.00047938 -0.0358%
I will do a quick run using the string performance test after lunch.
I'll report the results on that later. I've pasted the source for the
bulk of my test below. If someone wants the entire thing, let me know
and I'll provide everything.
Travis
Martin Sebor wrote:
Subject: Re: [PATCH] Use __rw_atomic_xxx() on Windows
What's the status of this? We need to decide if we can put this
in 4.2 or defer it for 4.2.1. To put it in 4.2 we need to make
sure the new functions don't cause a performance regression in
basic_string. I.e., we need to see the before and after numbers.
Martin
Martin Sebor wrote:
One concern I have is performance. Does replacing the intrinsics with
out of line function call whose semantics the compiler has no idea
about have any impact on the runtime efficiency of the
generated code?
I would be especially interested in "real life" scenarios such as the
usage of the atomic operations in basic_string.
It would be good to see some before and after numbers. If you don't
have all the platforms to run the test post your benchmark and Travis
can help you put them together.
#include <stdio.h>
#include <stdlib.h>
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <process.h>
#include "lib.h"
#define MIN_THREADS 2
#define MAX_THREADS 16
// Time `iters` calls to the Win32 InterlockedIncrement() intrinsic on
// *val and return the elapsed wall-clock time in milliseconds
// (GetTickCount resolution, typically 10-16 ms).
unsigned long locked_inc(long* val, long iters)
{
    const unsigned long start = GetTickCount ();

    for (long i = 0; i != iters; ++i)
        InterlockedIncrement(val);

    const unsigned long stop = GetTickCount ();

    return stop - start;
}
// Time `iters` calls to the library's __rw_atomic_add32() on *val and
// return the elapsed wall-clock time in milliseconds (GetTickCount
// resolution, typically 10-16 ms).
unsigned long atomic_add(long* val, long iters)
{
    const unsigned long start = GetTickCount ();

    for (long i = 0; i != iters; ++i)
        __rw_atomic_add32(val, 1);

    const unsigned long stop = GetTickCount ();

    return stop - start;
}
// Per-thread argument/result record passed to thread_func(); one
// element per thread, filled in by run_threads() before launch.
struct thread_param {
// shared counter every thread increments (all threads point at the
// same variable)
long* variable;
// number of iterations the benchmark function performs
long iters;
// benchmark function to invoke (locked_inc or atomic_add)
unsigned long (*fun)(long*, long);
// elapsed milliseconds reported by `fun`, written by the worker thread
unsigned long result;
// thread handle used by main thread to wait for completion
HANDLE thread;
};
extern "C" {
void thread_func(void* p)
{
thread_param* param = (thread_param*)p;
param->result = (param->fun)(param->variable, param->iters);
}
} // extern "C"
// Launch `nthreads` threads, each invoking `fun` for `iters` operations
// on a single shared counter, wait for them all, and return the sum of
// the per-thread elapsed times in milliseconds. `nthreads` must not
// exceed MAX_THREADS (the caller clamps it).
unsigned long run_threads(int nthreads, unsigned long (*fun)(long*,
long), long iters)
{
    thread_param params[MAX_THREADS];
    long thread_var = 0;

    int i;
    for (i = 0; i < nthreads; ++i) {
        params[i].variable = &thread_var;
        params[i].result   = 0;
        params[i].fun      = fun;
        params[i].iters    = iters;
    }

    int n;
    for (n = 0; n < nthreads; ++n) {
        params[n].thread = (HANDLE)_beginthread(thread_func, 0,
                                                &params[n]);

        // _beginthread returns -1L on failure; record a null handle so
        // the wait loop below skips threads that never started
        if (params[n].thread == (HANDLE)-1L)
            params[n].thread = 0;
    }

    unsigned long thread_time = 0;
    for (n = 0; n < nthreads; ++n) {
        // NOTE(review): _beginthread closes its handle automatically
        // when the thread exits, so this wait can race with thread
        // termination; _beginthreadex would give a reliably waitable
        // handle -- confirm whether that matters for this benchmark
        if (params[n].thread)
            WaitForSingleObject (params[n].thread, INFINITE);

        thread_time += params[n].result;
    }

    return thread_time;
}
// Benchmark driver: compare InterlockedIncrement() against
// __rw_atomic_add32(), first single-threaded, then with argv[1]
// threads (clamped to [MIN_THREADS, MAX_THREADS]). Prints elapsed ms,
// ms/op, and the percentage slowdown of atomic_add relative to
// locked_inc.
int main(int argc, char* argv[])
{
    int nthreads = MIN_THREADS;
    if (1 < argc)
        nthreads = atoi(argv[1]);

    // cap thread count
    if (nthreads < MIN_THREADS)
        nthreads = MIN_THREADS;
    else if (MAX_THREADS < nthreads)
        nthreads = MAX_THREADS;

    // iterations per timing run
    const long ops = 0x7ffffff;

    // single-threaded baseline for both primitives
    long thread_var;

    thread_var = 0;
    unsigned long locked_inc_ms = locked_inc (&thread_var, ops);

    thread_var = 0;
    unsigned long atomic_add_ms = atomic_add (&thread_var, ops);

    printf("---------- locked inc ---- atomic_add ---- %d threads\n",
           nthreads);

    // %lu matches the unsigned long arguments; the original "%8.u"
    // both mismatched the type and carried a stray (zero) precision
    printf("ms %8lu %8lu\n", locked_inc_ms,
           atomic_add_ms);

    float locked_inc_ops_p_ms = 1.f * locked_inc_ms / ops;
    float atomic_add_ops_p_ms = 1.f * atomic_add_ms / ops;

    printf("ms/op %8.8f %8.8f %.4f%%\n",
           locked_inc_ops_p_ms, atomic_add_ops_p_ms,
           100.f * (locked_inc_ops_p_ms - atomic_add_ops_p_ms) /
           locked_inc_ops_p_ms);

    // do it with threads; report the average per-thread time
    locked_inc_ms = run_threads(nthreads, locked_inc, ops);
    atomic_add_ms = run_threads(nthreads, atomic_add, ops);

    locked_inc_ms /= nthreads;
    atomic_add_ms /= nthreads;

    printf("thr ms %8lu %8lu\n", locked_inc_ms,
           atomic_add_ms);

    locked_inc_ops_p_ms = 1.f * locked_inc_ms / ops;
    atomic_add_ops_p_ms = 1.f * atomic_add_ms / ops;

    printf("thr ms/op %8.8f %8.8f %.4f%%\n",
           locked_inc_ops_p_ms, atomic_add_ops_p_ms,
           100.f * (locked_inc_ops_p_ms - atomic_add_ops_p_ms) /
           locked_inc_ops_p_ms);

    return 0;
}