I have attached a program that demonstrates the slowdown.  It spawns threads
that call pthread_getspecific in a tight loop.  On Linux the wall clock time
is essentially constant for any number of threads up to the number of
processors.  On OpenBSD 5.3 the wall clock time increases approximately
linearly with the number of threads.

The first argument is the number of threads.  The second argument is the
number of loop iterations.  The default iteration count gives about
0.4 seconds per active thread on my system.  My system is:

cpu0: AMD Phenom(tm) II X4 955 Processor, 3211.18 MHz
cpu1: AMD Phenom(tm) II X4 955 Processor, 3210.78 MHz
cpu2: AMD Phenom(tm) II X4 955 Processor, 3210.78 MHz
cpu3: AMD Phenom(tm) II X4 955 Processor, 3210.78 MHz

> We haven't done a lot of work to optimize performance except in
> response to specific issues. Sounds like you found one. Would you
> mind providing a test case? I just want to make sure we fix the
> right thing.

#include <assert.h>
#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

/* Set to nonzero to make each thread print a line to stdout when it starts and stops. */
const int verbose = 0;

/* Per-thread arguments, passed to the thread start routine as a void pointer. */
struct data {
  /* Thread-specific-data key that the thread sets and then reads repeatedly. */
  pthread_key_t key;
  /* Number of pthread_getspecific() calls the thread performs. */
  long count;
};

/*
 * Benchmark body executed by each thread: bind a non-null value to `key`
 * for the calling thread, then look it up `count` times, asserting that
 * every lookup returns a non-null pointer.  A non-positive `count`
 * performs no lookups.
 */
void fn(pthread_key_t key, long count)
{
  long remaining = count;

  pthread_setspecific(key, "hi!");

  /* The lookup itself is the work being timed. */
  while (remaining > 0) {
    void *value = pthread_getspecific(key);
    assert(value);
    remaining--;
  }
}

void *tstart(void *x)
{
  struct data *d = (struct data *)x;
  if (verbose)
    fputs("Starting thread\n", stdout);
  fn(d->key, d->count);
  if (verbose)
    fputs("Stopping thread\n", stdout);
  return 0;
}

/* NOTE(review): not referenced anywhere in the visible code; main() checks the thread count against its own 1..100 range. */
#define MAXTHREAD 10

/*
 * Start `nthreads` threads that each run tstart() with `key` and `count`,
 * then wait for all of them to finish.
 *
 * Every thread receives a pointer to the same struct data, which lives on
 * this function's stack; that is safe because the threads only read it and
 * all of them are joined before the function returns.
 *
 * Failures (allocation, thread creation, join, or a non-null thread return
 * value) are reported through assert(), like the rest of this program.
 */
void spawn(pthread_key_t key, int nthreads, long count)
{
  pthread_t *threads;
  struct data d;
  int i, rv;

  threads = malloc((size_t)nthreads * sizeof *threads);
  assert(threads);

  d.key = key;
  d.count = count;

  for (i = 0; i < nthreads; i++) {
    rv = pthread_create(&threads[i], NULL, tstart, &d);
    assert(!rv);
  }

  for (i = 0; i < nthreads; i++) {
    /* Start from a non-null value so the check below proves that
       pthread_join() really stored the thread's (null) return value. */
    void *rptr = "";
    rv = pthread_join(threads[i], &rptr);
    assert(!rv);
    assert(!rptr);
  }

  /* The handle array is no longer needed once every thread is joined. */
  free(threads);
}

/*
 * Parse `s` as a base-10 long.  Returns 1 and stores the value in *out when
 * the entire string is a number that fits in a long; returns 0 otherwise
 * (empty string, trailing characters, or out-of-range value).
 */
static int parse_long(const char *s, long *out)
{
  char *end;
  long value;

  errno = 0;
  value = strtol(s, &end, 10);
  if (end == s || *end != '\0' || errno == ERANGE)
    return 0;

  *out = value;
  return 1;
}

/*
 * Usage: prog [nthreads [niter]]
 *
 * Runs `nthreads` threads (default 3, allowed range 1..100) that each call
 * pthread_getspecific() `niter` times (default 20000000), and prints the
 * elapsed wall clock time with millisecond resolution.
 *
 * Returns 0 on success and 1 when an argument is invalid.  Library call
 * failures are reported through assert().
 */
int main(int argc, char *argv[])
{
  int nthreads = 3;
  long niter = 20000000;
  pthread_key_t key;
  int rv;
  struct timeval t0, t1;
  long dts;
  int dtu;
  long parsed;

  if (argc > 1) {
    if (!parse_long(argv[1], &parsed) || parsed < 1 || parsed > 100) {
      fputs("Thread count must be integer in range 1..100\n", stderr);
      return 1;
    }
    nthreads = (int)parsed;
  }
  if (argc > 2) {
    if (!parse_long(argv[2], &parsed) || parsed < 1) {
      fputs("Iteration count must be a positive integer\n", stderr);
      return 1;
    }
    niter = parsed;
  }

  rv = pthread_key_create(&key, 0);
  assert(!rv);

  /* Only thread creation, the lookup loops, and the joins are timed. */
  rv = gettimeofday(&t0, 0);
  assert(!rv);

  spawn(key, nthreads, niter);

  rv = gettimeofday(&t1, 0);
  assert(!rv);

  rv = pthread_key_delete(key);
  assert(!rv);

  /* Subtract seconds and microseconds separately, borrowing one second
     when the microsecond difference goes negative. */
  dts = (long)t1.tv_sec - (long)t0.tv_sec;
  dtu = (int)t1.tv_usec - (int)t0.tv_usec;
  if (dtu < 0) {
    dtu += 1000000;
    dts -= 1;
  }
  assert(dts >= 0);
  assert(dtu >= 0);

  /* dtu / 1000 is an int, so it is printed with %d rather than %u. */
  printf("Time (%d threads, %ld iterations) = %ld.%03d\n",
         nthreads, niter, dts, dtu / 1000);

  return 0;
}

Reply via email to