On 10/3/12 11:10 AM, Martin Sebor wrote:
On 10/03/2012 07:01 AM, Liviu Nicoara wrote:

I am gathering some more measurements along these lines but it's time
consuming. I estimate I will have some ready for review later today or
tomorrow. In the meantime could you please post your kernel, glibc and
compiler versions?

I was just thinking of a few simple loops along the lines of:

   void* thread_func (void*) {
       for (int i = 0; i < N; ++i)
           test 1: do some simple stuff inline
           test 2: call a virtual function to do the same stuff
           test 3: lock and unlock a mutex and do the same stuff
   }

Test 1 should be the fastest and test 3 the slowest. This should
hold regardless of what "simple stuff" is (eventually, even when
it's getting numpunct::grouping() data).

That is expected; I attached test case x.cpp and results-x.txt.

I did not find it too interesting on its own, though. The difference between the cached and non-cached data is that in the case of the cached data the copying of the string involves nothing more than a bump in the reference counter, whereas in the non-cached version a string object is constructed anew, and memory gets allocated for its body. Yet, in my measurements, the cached version is the one which shows the worse performance.

So, I extracted the std::string class and simplified it down and put it in another test case. That would be u.cpp and the results are results-u.txt. The results show the same performance trends although the absolute values have skewed. Will get back to this after I digest the results a bit more.

Liviu
-*- mode: org -*-

* iMac, 4x Core i5 , 12S, gcc 4.5.4:

$ nice make u

16, 100000000   1m18.811s       5m12.329s       0m0.263s
8, 100000000    0m39.919s       2m36.198s       0m0.150s
4, 100000000    0m20.449s       1m20.797s       0m0.050s
2, 100000000    0m9.888s        0m19.725s       0m0.005s
1, 100000000    0m2.483s        0m2.480s        0m0.002s

$ nice make CPPOPTS="-DNO_CACHE" u

16, 100000000   0m37.418s       2m27.822s       0m0.872s
8, 100000000    0m18.844s       1m14.607s       0m0.261s
4, 100000000    0m10.165s       0m40.147s       0m0.023s
2, 100000000    0m8.652s        0m17.278s       0m0.003s
1, 100000000    0m8.482s        0m8.473s        0m0.007s

$ nice make CPPOPTS="-DNO_VIRTUAL_CALL" u

16, 100000000   1m2.770s        4m9.307s        0m0.179s
8, 100000000    0m31.890s       2m6.792s        0m0.087s
4, 100000000    0m16.427s       1m5.133s        0m0.039s
2, 100000000    0m8.497s        0m16.981s       0m0.007s
1, 100000000    0m2.291s        0m2.288s        0m0.002s

$ nice make CPPOPTS="-DNO_CACHE -DNO_VIRTUAL_CALL" u

16, 100000000   0m35.838s       2m21.406s       0m0.877s
8, 100000000    0m19.007s       1m14.920s       0m0.255s
4, 100000000    0m10.099s       0m39.504s       0m0.042s
2, 100000000    0m8.599s        0m17.190s       0m0.003s
1, 100000000    0m8.986s        0m8.980s        0m0.005s

* Linux Slackware, 16x AMD Opteron, 12S, gcc 4.5.2

$ nice make u



$ nice make CPPOPTS="-DNO_CACHE" u


$ nice make CPPOPTS="-DNO_VIRTUAL_CALL" u

$ nice make CPPOPTS="-DNO_CACHE -DNO_VIRTUAL_CALL" u

-*- mode: org -*-

* iMac, 4x Core i5 , 12S, gcc 4.5.4:

$ nice make CPPOPTS="-DNO_LOCK -DNO_VIRTUAL_CALL" u

16, 100000000   0m7.864s        0m30.259s       0m0.035s
 8, 100000000   0m4.396s        0m17.034s       0m0.016s
 4, 100000000   0m2.729s        0m10.473s       0m0.011s
 2, 100000000   0m2.481s        0m4.929s        0m0.003s
 1, 100000000   0m2.461s        0m2.455s        0m0.002s

$ nice make CPPOPTS="-DNO_LOCK" u

16, 100000000   0m9.724s        0m37.455s       0m0.043s
 8, 100000000   0m5.559s        0m20.309s       0m0.048s
 4, 100000000   0m3.160s        0m12.213s       0m0.013s
 2, 100000000   0m2.872s        0m5.694s        0m0.004s
 1, 100000000   0m2.845s        0m2.838s        0m0.002s

$ nice make u

16, 100000000   1m3.745s        3m58.570s       0m0.351s
 8, 100000000   0m32.351s       1m55.740s       0m0.203s
 4, 100000000   0m16.852s       1m1.633s        0m0.092s
 2, 100000000   0m8.419s        0m16.699s       0m0.010s
 1, 100000000   0m4.214s        0m4.179s        0m0.005s

* Linux Slackware, 16x AMD Opteron, 12S, gcc 4.7.1

$ nice make CPPOPTS="-DNO_LOCK -DNO_VIRTUAL_CALL" u

16, 100000000   0m4.382s        1m9.896s        0m0.004s
 8, 100000000   0m4.374s        0m34.904s       0m0.002s
 4, 100000000   0m4.368s        0m17.445s       0m0.002s
 2, 100000000   0m4.366s        0m8.720s        0m0.003s
 1, 100000000   0m4.355s        0m4.351s        0m0.001s

$ nice make CPPOPTS="-DNO_LOCK" u

16, 100000000   0m5.415s        1m19.833s       0m0.005s
 8, 100000000   0m4.939s        0m39.438s       0m0.003s
 4, 100000000   0m4.936s        0m19.712s       0m0.001s
 2, 100000000   0m4.930s        0m9.847s        0m0.002s
 1, 100000000   0m4.921s        0m4.917s        0m0.002s

$ nice make u

16, 100000000   1m40.769s       24m17.198s      0m0.006s
 8, 100000000   0m51.702s       6m15.400s       0m0.003s
 4, 100000000   0m26.033s       1m37.651s       0m0.002s
 2, 100000000   0m13.534s       0m25.164s       0m0.003s
 1, 100000000   0m4.964s        0m4.961s        0m0.002s
#include <iostream>
#include <locale>

#include <cstdio>
#include <cstdlib>
#include <cstring>

#include <pthread.h>
#include <unistd.h>

// Upper bound on the number of worker threads; main clamps nthreads to
// this value and uses it to size its array of thread ids.
#define MAX_THREADS 128

// nloops:   iterations executed by each worker thread (argv [2] in main).
// nthreads: number of worker threads to start (argv [1] in main).
static long nloops = 100000000, nthreads = 16;
// Start gate: worker threads spin while this is true; main sets it to
// false once all the threads have been created.
static bool volatile pwait = true;

////////////////////////////////////////////////////////////////////////

//
// Minimal stand-in for a facet that hands out a piece of string data.
//
// get () returns a "copy" of the data and discard () releases it. What
// a copy costs is selected at compile time:
//
//   default        - bump a shared counter up in get () and down in
//                    discard (); the data pointer itself is shared
//   NO_CACHE       - allocate a fresh buffer and copy the data into it
//                    in get (); free it in discard ()
//   NO_VIRTUAL_CALL- get () calls _C_copy () directly instead of going
//                    through the virtual do_get ()
//
struct facet
{
private:

#if !defined (NO_CACHE)

    //
    // Mimic the ref-counting in std::string where the copy ctor and
    // the dtor yank the counter up and down, respectively. This is
    // what's happening when copying a cached std::string object.
    //

    // Increments the shared counter and returns the shared data pointer.
    // NOTE(review): the meaning of the second (false) argument of the
    // __rw atomic helpers is not visible here -- confirm against stdcxx.
    char* _C_copy () const {
        facet* self = const_cast< facet* > (this);
        __rw::__rw_atomic_preincrement (self->_C_refs, false);
        return const_cast< char* > (self->_C_data);
    }

    // Decrements the shared counter; the pointer is shared, not owned,
    // so there is nothing to free.
    void _C_discard (char*) const {
        facet* self = const_cast< facet* > (this);
        __rw::__rw_atomic_predecrement (self->_C_refs, false);
    }

#else

    //
    // Mimic the construction of an std::string object, anew:
    // allocation of space, copying of content over. This is what's
    // happening when bypassing the cache and creating std::string
    // objects directly out of locale data.
    //

    // Returns a newly allocated, NUL-terminated copy of the data; the
    // caller must hand it back to _C_discard ().
    char* _C_copy () const {
        size_t n = strlen (_C_data) + 1;
        char* p = static_cast< char* > (::operator new (n));
        memcpy (p, _C_data, n);
        return p;
    }

    // Frees a buffer obtained from _C_copy ().
    void _C_discard (char* p) const {
        ::operator delete (p);
    }

#endif // NO_CACHE

public:

    // s must be a NUL-terminated string that outlives this object; only
    // the pointer is stored. The counter is explicitly zeroed: it was
    // previously left uninitialized, so the atomic increment/decrement
    // in the cached variant operated on an indeterminate value.
    facet (char const* s) : _C_data (s), _C_refs (0) { }

    // Returns a copy of the data, to be released with discard ().
    char* get () const {
#if !defined (NO_VIRTUAL_CALL)
        return do_get ();
#else
        return _C_copy ();
#endif // NO_VIRTUAL_CALL
    }

    // Releases a pointer previously returned by get ().
    void discard (char* p) const {
        _C_discard (p);
    }
    
protected:

    // Virtual hook behind get (); the definition in this file simply
    // forwards to _C_copy ().
    virtual char* do_get () const;

private:

    char const* _C_data;       // borrowed pointer to the string data
    unsigned long _C_refs;     // counter bumped by the cached variant
    __rw::__rw_mutex _C_mutex; // not used by any code visible in this file
};

/* virtual */ char* 
facet::do_get () const
{
    return _C_copy ();
}

extern "C" {

// Thread entry point.
//
// pv points to the facet object shared by all threads. The thread spins
// until main clears pwait, then performs nloops get ()/discard () round
// trips. The accumulated string length is returned (as a void*) so that
// the result of each iteration is actually used.
static void* 
f (void* pv)
{
    facet& fac = *static_cast< facet* > (pv);

    unsigned long n = 0;

    // Busy-wait on the start gate so that all threads begin together.
    while (pwait) ;
    
    // The counter has the same type as nloops: with an int counter a
    // loop count above INT_MAX would overflow it and never terminate.
    for (long i = 0; i < nloops; ++i) {
        char* s = fac.get ();
        n += strlen (s);
        fac.discard (s);
    }

    return (void*)n;
}

} // extern "C"

// Benchmark driver.
//
// Usage: prog [nthreads [nloops]]
//
// Prints "<nthreads>, <nloops>" (no trailing newline), starts nthreads
// threads running f () against one shared facet object, releases them
// together and waits for all of them to finish.
int
main (int argc, char** argv)
{
    switch (argc) {
    case 3:
        nloops = atol (argv [2]);
        // fall through: the thread count is given as well
    case 2:
        nthreads = atol (argv [1]);
        break;
    default:
        // no arguments (or too many): keep the built-in defaults
        break;
    }

    // pthread_t is an opaque type: do not initialize it from an integer
    // and do not test it as a boolean.
    pthread_t tid [MAX_THREADS];

    if (nthreads > MAX_THREADS)
        nthreads = MAX_THREADS;

    // NOTE(review): the missing newline looks deliberate -- presumably
    // the timings get appended to the same line by the caller; confirm.
    printf ("%ld, %ld", nthreads, nloops);

    pthread_setconcurrency (nthreads);

    facet fac ("\3\3");
    
    for (int i = 0; i < nthreads; ++i) {
        // pthread_create returns the error number, it does not set errno.
        const int rc = pthread_create (tid + i, 0, f, &fac);
        if (rc) {
            fprintf (stderr, "\npthread_create: %s\n", strerror (rc));
            exit (-1);
        }
    }

    // Give the threads time to reach the start gate, then open it.
    sleep (1);
    pwait = false;

    // Every tid [0 .. nthreads) is valid here: a failed creation exits
    // above, so the threads can be joined unconditionally.
    for (int i = 0; i < nthreads; ++i)
        pthread_join (tid [i], 0);

    return 0;
}
#include <iostream>
#include <locale>

#include <cstdio>
#include <cstdlib>
#include <cstring>

#include <pthread.h>
#include <unistd.h>

// Upper bound on the number of worker threads; main clamps nthreads to
// this value and uses it to size its array of thread ids.
#define MAX_THREADS 128

// nloops:   iterations executed by each worker thread (argv [2] in main).
// nthreads: number of worker threads to start (argv [1] in main).
static long nloops = 100000000, nthreads = 16;
// Start gate: worker threads spin while this is true; main sets it to
// false once all the threads have been created.
static bool volatile pwait = true;

////////////////////////////////////////////////////////////////////////

//
// Stripped-down model of a shared, reference-counted string body.
//
// get () returns the stored pointer; what it costs is selected at
// compile time:
//
//   default         - counter bumped up before and down after the
//                     access, value fetched through a virtual call
//   NO_LOCK         - no counter traffic at all
//   NO_VIRTUAL_CALL - value read directly instead of via do_get ()
//
struct S
{
    // s must outlive this object; only the pointer is stored.
    S (char const* s) : refs (), value (s) { }

    // Increments the counter and returns the helper's result.
    // NOTE(review): the meaning of the second (false) argument of the
    // __rw atomic helpers is not visible here -- confirm against stdcxx.
    unsigned long ref () {
        return __rw::__rw_atomic_preincrement (refs, false);
    }

    // Decrements the counter and returns the helper's result.
    unsigned long unref () {
        return __rw::__rw_atomic_predecrement (refs, false);
    }

    // Returns the stored string pointer, bracketing the access with
    // ref ()/unref () unless NO_LOCK is defined.
    char const* get () {

#if !defined (NO_LOCK)
        ref ();
#endif // NO_LOCK

#if !defined (NO_VIRTUAL_CALL)
        char const* const result = do_get ();
#else
        char const* const result = value;
#endif // NO_VIRTUAL_CALL

        // The decrement used to be placed after the return statement
        // and was therefore never executed: the counter only ever grew.
        // Fetch the value first, rebalance the counter, then return.
#if !defined (NO_LOCK)
        unref ();
#endif // NO_LOCK

        return result;
    }

    // Virtual accessor behind get (); returns the stored pointer.
    virtual char const* do_get () const;

    unsigned long refs;  // counter bumped by ref ()/unref ()
    char const* value;   // borrowed pointer to the string data
};

/* virtual */ char const* 
S::do_get () const
{
    return this->value;
}

extern "C" {

// Thread entry point.
//
// pv points to the S object shared by all threads. The thread spins
// until main clears pwait, then calls get () nloops times, consuming
// the returned string each time (its length plus the sum of its
// characters). The total is returned (as a void*) so that the work is
// actually used.
static void* 
f (void* pv)
{
    S& s = *static_cast< S* > (pv);

    unsigned long n = 0;
    char const* p = 0;

    // Busy-wait on the start gate so that all threads begin together.
    while (pwait) ;
    
    // The counter has the same type as nloops: with an int counter a
    // loop count above INT_MAX would overflow it and never terminate.
    for (long i = 0; i < nloops; ++i) {

        p = s.get ();
        n += strlen (p);

        // Touch every character of the returned string.
        for (; p [0]; ++p)
            n += p [0];
    }

    return (void*)n;
}

} // extern "C"

// Benchmark driver.
//
// Usage: prog [nthreads [nloops]]
//
// Prints "<nthreads>, <nloops>" (no trailing newline), starts nthreads
// threads running f () against one shared S object, releases them
// together and waits for all of them to finish.
int
main (int argc, char** argv)
{
    switch (argc) {
    case 3:
        nloops = atol (argv [2]);
        // fall through: the thread count is given as well
    case 2:
        nthreads = atol (argv [1]);
        break;
    default:
        // no arguments (or too many): keep the built-in defaults
        break;
    }

    // pthread_t is an opaque type: do not initialize it from an integer
    // and do not test it as a boolean.
    pthread_t tid [MAX_THREADS];

    if (nthreads > MAX_THREADS)
        nthreads = MAX_THREADS;

    // NOTE(review): the missing newline looks deliberate -- presumably
    // the timings get appended to the same line by the caller; confirm.
    printf ("%ld, %ld", nthreads, nloops);

    pthread_setconcurrency (nthreads);

    S s ("01234567890123456789");
    
    for (int i = 0; i < nthreads; ++i) {
        // pthread_create returns the error number, it does not set errno.
        const int rc = pthread_create (tid + i, 0, f, &s);
        if (rc) {
            fprintf (stderr, "\npthread_create: %s\n", strerror (rc));
            exit (-1);
        }
    }

    // Give the threads time to reach the start gate, then open it.
    sleep (1);
    pwait = false;

    // Every tid [0 .. nthreads) is valid here: a failed creation exits
    // above, so the threads can be joined unconditionally.
    for (int i = 0; i < nthreads; ++i)
        pthread_join (tid [i], 0);

    return 0;
}

Reply via email to