On 10/3/12 11:10 AM, Martin Sebor wrote:
On 10/03/2012 07:01 AM, Liviu Nicoara wrote:

I am gathering some more measurements along these lines but it's time
consuming. I estimate I will have some ready for review later today or
tomorrow. In the meantime could you please post your kernel, glibc and
compiler versions?

I was just thinking of a few simple loops along the lines of:

   void* thread_func (void*) {
       for (int i = 0; i < N; ++i)
           test 1: do some simple stuff inline
           test 2: call a virtual function to do the same stuff
           test 3: lock and unlock a mutex and do the same stuff

Test 1 should be the fastest and test 3 the slowest. This should
hold regardless of what "simple stuff" is (eventually, even when
it's getting numpunct::grouping() data).

That is expected; I attached test case x.cpp and results-x.txt.

I did not find it too interesting on its own, though. The difference between the cached and non-cached data is that, in the case of the cached data, copying the string involves nothing more than a bump in the reference counter, whereas in the non-cached version a string object is constructed anew and memory gets allocated for its body. Yet, in my measurements, the cached version is the one which shows the worse performance.

So, I extracted the std::string class and simplified it down and put it in another test case. That would be u.cpp and the results are results-u.txt. The results show the same performance trends although the absolute values have skewed. Will get back to this after I digest the results a bit more.

-*- mode: org -*-

* iMac, 4x Core i5 , 12S, gcc 4.5.4:

$ nice make u

16, 100000000   1m18.811s       5m12.329s       0m0.263s
8, 100000000    0m39.919s       2m36.198s       0m0.150s
4, 100000000    0m20.449s       1m20.797s       0m0.050s
2, 100000000    0m9.888s        0m19.725s       0m0.005s
1, 100000000    0m2.483s        0m2.480s        0m0.002s

$ nice make CPPOPTS="-DNO_CACHE" u

16, 100000000   0m37.418s       2m27.822s       0m0.872s
8, 100000000    0m18.844s       1m14.607s       0m0.261s
4, 100000000    0m10.165s       0m40.147s       0m0.023s
2, 100000000    0m8.652s        0m17.278s       0m0.003s
1, 100000000    0m8.482s        0m8.473s        0m0.007s


16, 100000000   1m2.770s        4m9.307s        0m0.179s
8, 100000000    0m31.890s       2m6.792s        0m0.087s
4, 100000000    0m16.427s       1m5.133s        0m0.039s
2, 100000000    0m8.497s        0m16.981s       0m0.007s
1, 100000000    0m2.291s        0m2.288s        0m0.002s


16, 100000000   0m35.838s       2m21.406s       0m0.877s
8, 100000000    0m19.007s       1m14.920s       0m0.255s
4, 100000000    0m10.099s       0m39.504s       0m0.042s
2, 100000000    0m8.599s        0m17.190s       0m0.003s
1, 100000000    0m8.986s        0m8.980s        0m0.005s

* Linux Slackware, 16x AMD Opteron, 12S, gcc 4.5.2

$ nice make u

$ nice make CPPOPTS="-DNO_CACHE" u



-*- mode: org -*-

* iMac, 4x Core i5 , 12S, gcc 4.5.4:


16, 100000000   0m7.864s        0m30.259s       0m0.035s
 8, 100000000   0m4.396s        0m17.034s       0m0.016s
 4, 100000000   0m2.729s        0m10.473s       0m0.011s
 2, 100000000   0m2.481s        0m4.929s        0m0.003s
 1, 100000000   0m2.461s        0m2.455s        0m0.002s

$ nice make CPPOPTS="-DNO_LOCK" u

16, 100000000   0m9.724s        0m37.455s       0m0.043s
 8, 100000000   0m5.559s        0m20.309s       0m0.048s
 4, 100000000   0m3.160s        0m12.213s       0m0.013s
 2, 100000000   0m2.872s        0m5.694s        0m0.004s
 1, 100000000   0m2.845s        0m2.838s        0m0.002s

$ nice make u

16, 100000000   1m3.745s        3m58.570s       0m0.351s
 8, 100000000   0m32.351s       1m55.740s       0m0.203s
 4, 100000000   0m16.852s       1m1.633s        0m0.092s
 2, 100000000   0m8.419s        0m16.699s       0m0.010s
 1, 100000000   0m4.214s        0m4.179s        0m0.005s

* Linux Slackware, 16x AMD Opteron, 12S, gcc 4.7.1


16, 100000000   0m4.382s        1m9.896s        0m0.004s
 8, 100000000   0m4.374s        0m34.904s       0m0.002s
 4, 100000000   0m4.368s        0m17.445s       0m0.002s
 2, 100000000   0m4.366s        0m8.720s        0m0.003s
 1, 100000000   0m4.355s        0m4.351s        0m0.001s

$ nice make CPPOPTS="-DNO_LOCK" u

16, 100000000   0m5.415s        1m19.833s       0m0.005s
 8, 100000000   0m4.939s        0m39.438s       0m0.003s
 4, 100000000   0m4.936s        0m19.712s       0m0.001s
 2, 100000000   0m4.930s        0m9.847s        0m0.002s
 1, 100000000   0m4.921s        0m4.917s        0m0.002s

$ nice make u

16, 100000000   1m40.769s       24m17.198s      0m0.006s
 8, 100000000   0m51.702s       6m15.400s       0m0.003s
 4, 100000000   0m26.033s       1m37.651s       0m0.002s
 2, 100000000   0m13.534s       0m25.164s       0m0.003s
 1, 100000000   0m4.964s        0m4.961s        0m0.002s
#include <iostream>
#include <locale>

#include <cstdio>
#include <cstdlib>
#include <cstring>

#include <pthread.h>
#include <unistd.h>

// Upper bound on the number of worker threads: main() sizes its thread-id
// array with it and clamps nthreads to it.
#define MAX_THREADS 128

// nloops:   number of iterations each worker thread runs in f().
// nthreads: number of worker threads main() creates.
// Both defaults can be overridden from the command line (see main()).
static long nloops = 100000000, nthreads = 16;
// Start gate: every worker spins in f() while this is true; main() clears
// it after all threads have been created.
// NOTE(review): volatile is used here as a cross-thread flag; it is not a
// synchronization primitive -- confirm this is acceptable for the benchmark.
static bool volatile pwait = true;


struct facet

#if !defined (NO_CACHE)

    // Mimic the ref-counting in std::string where the copy ctor and
    // the dtor yank the counter up and down, respectively. This is
    // what's happening when copying a cached std::string object.

    char* _C_copy () const {
        facet* self = const_cast< facet* > (this);
        __rw::__rw_atomic_preincrement (self->_C_refs, false);
        return const_cast< char* > (self->_C_data);

    void _C_discard (char*) const {
        facet* self = const_cast< facet* > (this);
        __rw::__rw_atomic_predecrement (self->_C_refs, false);


    // Mimic the construction of an std::string object, anew:
    // allocation of space, copying of content over. This is what's
    // happening when bypassing the cache and creating std::string
    // objects directly out of locale data.

    char* _C_copy () const {
        size_t n = strlen (_C_data) + 1;
        char* p = reinterpret_cast< char* > (::operator new (n));
        memcpy (p, _C_data, n);
        return p;

    void _C_discard (char* p) const {
        ::operator delete (p);

#endif // NO_CACHE


    facet (char const* s) : _C_data (s) { }

    char* get () const {
#if !defined (NO_VIRTUAL_CALL)
        return do_get ();
        return _C_copy ();

    void discard (char* p) const {
        return _C_discard (p);

    virtual char* do_get () const;


    char const* _C_data;
    unsigned long _C_refs;
    __rw::__rw_mutex _C_mutex;

/* virtual */ char* 
facet::do_get () const
    return _C_copy ();

extern "C" {

static void* 
f (void* pv)
    facet& fac = *reinterpret_cast< facet* > (pv);

    unsigned long n = 0;
    while (pwait) ;
    for (int i = 0; i < nloops; ++i) {
        char* s = fac.get ();
        n += strlen (s);
        fac.discard (s);

    return (void*)n;

} // extern "C"

main (int argc, char** argv)
    switch (argc) {
    case 3:
        nloops = atol (argv [2]);
    case 2:
        nthreads = atol (argv [1]);

    pthread_t tid [MAX_THREADS] = { 0 };

    if (nthreads > MAX_THREADS)
        nthreads = MAX_THREADS;

    printf ("%ld, %ld", nthreads, nloops);

    pthread_setconcurrency (nthreads);

    facet fac ("\3\3");
    for (int i = 0; i < nthreads; ++i) {
        if (pthread_create (tid + i, 0, f, &fac))
            exit (-1);

    sleep (1);
    pwait = false;

    for (int i = 0; i < nthreads; ++i) {
        if (tid [i])
            pthread_join (tid [i], 0);

    return 0;
#include <iostream>
#include <locale>

#include <cstdio>
#include <cstdlib>
#include <cstring>

#include <pthread.h>
#include <unistd.h>

// Upper bound on the number of worker threads: main() sizes its thread-id
// array with it and clamps nthreads to it.
#define MAX_THREADS 128

// nloops:   number of iterations each worker thread runs in f().
// nthreads: number of worker threads main() creates.
// Both defaults can be overridden from the command line (see main()).
static long nloops = 100000000, nthreads = 16;
// Start gate: every worker spins in f() while this is true; main() clears
// it after all threads have been created.
// NOTE(review): volatile is used here as a cross-thread flag; it is not a
// synchronization primitive -- confirm this is acceptable for the benchmark.
static bool volatile pwait = true;


// Stripped-down stand-in for a reference-counted string: a counter plus a
// pointer to the character data.
//
// Compile-time switches:
//   NO_LOCK          - get() skips the atomic counter updates;
//   NO_VIRTUAL_CALL  - get() reads the value directly instead of going
//                      through the virtual do_get().
//
// NOTE(review): the __rw names are library internals and are assumed to be
// made visible by the standard headers this file includes -- confirm.
struct S
{
    // Stores the pointer s without copying the string; the counter is
    // value-initialized to zero.
    S (char const* s) : refs (), value (s) { }

    // Atomically increments the counter and returns the library call's
    // result.
    unsigned long ref () {
        return __rw::__rw_atomic_preincrement (refs, false);
    }

    // Atomically decrements the counter and returns the library call's
    // result.
    unsigned long unref () {
        return __rw::__rw_atomic_predecrement (refs, false);
    }

    // Returns the stored character pointer, bracketing the access with
    // ref()/unref() unless NO_LOCK is defined.
    //
    // The result is held in a local so that unref() runs before returning;
    // placing unref() after a return statement would leave it unreachable
    // and the counter would only ever grow.
    char const* get () {

#if !defined (NO_LOCK)
        ref ();
#endif // NO_LOCK

#if !defined (NO_VIRTUAL_CALL)
        char const* const result = do_get ();
#else
        char const* const result = value;
#endif // NO_VIRTUAL_CALL

#if !defined (NO_LOCK)
        unref ();
#endif // NO_LOCK

        return result;
    }

    // Virtual hook used by get(); defined out of line.
    virtual char const* do_get () const;

    unsigned long refs;   // counter updated by ref()/unref()
    char const* value;    // character data handed out by get() (not owned)
};

/* virtual */ char const* 
S::do_get () const
    return this->value;

extern "C" {

static void* 
f (void* pv)
    S& s = *reinterpret_cast< S* > (pv);

    unsigned long n = 0;
    char const* p = 0;

    while (pwait) ;
    for (int i = 0; i < nloops; ++i) {

        p = s.get ();
        n += strlen (p);

        for (; p [0]; ++p)
            n += p [0];

    return (void*)n;

} // extern "C"

main (int argc, char** argv)
    switch (argc) {
    case 3:
        nloops = atol (argv [2]);
    case 2:
        nthreads = atol (argv [1]);

    pthread_t tid [MAX_THREADS] = { 0 };

    if (nthreads > MAX_THREADS)
        nthreads = MAX_THREADS;

    printf ("%ld, %ld", nthreads, nloops);

    pthread_setconcurrency (nthreads);

    S s ("01234567890123456789");
    for (int i = 0; i < nthreads; ++i) {
        if (pthread_create (tid + i, 0, f, &s))
            exit (-1);

    sleep (1);
    pwait = false;

    for (int i = 0; i < nthreads; ++i) {
        if (tid [i])
            pthread_join (tid [i], 0);

    return 0;

Reply via email to