On 10/03/12 11:10, Martin Sebor wrote:
[...]
I was just thinking of a few simple loops along the lines of:
tl;dr: I attribute the results of the multi-threaded performance tests (12S,
Intel/AMD multicores) to heavy contention in copying the reference-counted
std::string objects.
The long version: I went back to the original test case (attached, t.cpp). As a
reminder, the program is a simplification of the library's numpunct facet
creation process, and it can be compiled with data caching either enabled or
disabled.
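Before the numbers, a minimal sketch of the mechanism I suspect (the
cow_rep/cow_string types below are made up for illustration, they are not the
stdcxx implementation): with a reference-counted, copy-on-write string, every
copy and every destruction updates a reference count shared by all users of
the string, and the profiles further down suggest that in this build the
update goes through a pthread mutex:

#include <pthread.h>
#include <cstddef>

// Hypothetical shared representation of a copy-on-write string. The point of
// the sketch: every copy (add_ref) and every destruction (release), from
// every thread, funnels through the same counter and the same lock.
struct cow_rep
{
    cow_rep (const char* s) : refs_ (1), data_ (s) {
        pthread_mutex_init (&lock_, 0);
    }

    void add_ref () {
        pthread_mutex_lock (&lock_);
        ++refs_;
        pthread_mutex_unlock (&lock_);
    }

    // returns true when the last reference is gone
    bool release () {
        pthread_mutex_lock (&lock_);
        const bool last = 0 == --refs_;
        pthread_mutex_unlock (&lock_);
        return last;
    }

    std::size_t     refs_;
    const char*     data_;
    pthread_mutex_t lock_;
};

// Hypothetical string handle: the "cheap" copy still hits cow_rep::lock_.
struct cow_string
{
    explicit cow_string (cow_rep* rep) : rep_ (rep) { }

    cow_string (const cow_string& rhs) : rep_ (rhs.rep_) {
        rep_->add_ref ();
    }

    ~cow_string () {
        rep_->release ();   // a real implementation would free the
                            // representation when this returns true
    }

private:
    cow_string& operator= (const cow_string&);   // not needed for the sketch

    cow_rep* rep_;
};

If that is roughly what our string does, then with caching every call to
grouping() returns a copy of the single cached _C_grouping object, so all the
threads keep hitting the same lock; without caching, do_grouping() builds a
brand-new string on every call, which costs a malloc/free pair but shares
nothing between the threads.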
I ran the perf tool (https://perf.wiki.kernel.org/index.php/Main_Page) on the
program and noticed marked differences in cycles and instructions: the cached
version needs roughly 7.5 times the cycles to retire less than twice the
instructions (an IPC of 0.40 vs. 1.70). The numbers of CPU-migrations and
context-switches are also close to three orders of magnitude larger for the
cached version:
Performance counter stats for './t':
121168.026301 task-clock-msecs # 3.778 CPUs
6133377 context-switches # 0.051 M/sec
35457 CPU-migrations # 0.000 M/sec
298 page-faults # 0.000 M/sec
338006437277 cycles # 2789.568 M/sec
136554855096 instructions # 0.404 IPC
31298346444 branches # 258.305 M/sec
2305761092 branch-misses # 7.367 %
1592157186 cache-references # 13.140 M/sec
1729755 cache-misses # 0.014 M/sec
32.070381363 seconds time elapsed
vs.
Performance counter stats for './t':
16194.235282 task-clock-msecs # 3.883 CPUs
7491 context-switches # 0.000 M/sec
47 CPU-migrations # 0.000 M/sec
311 page-faults # 0.000 M/sec
45210281987 cycles # 2791.752 M/sec
76784904688 instructions # 1.698 IPC
18891711522 branches # 1166.570 M/sec
3162251552 branch-misses # 16.739 %
3332839 cache-references # 0.206 M/sec
119424 cache-misses # 0.007 M/sec
4.170904279 seconds time elapsed
Then I generated more detailed reports for the two runs. The report for the
cached version is dominated by what appears to be thread synchronization
overhead (kernel spinlocks, futex calls, pthread mutex operations), whereas
the report for the non-cached version is not:
# Events: 134K cycles
#
# Overhead Command Shared Object Symbol
# ........ ....... .................. ...................................
#
50.69% t [kernel.kallsyms] [k] _raw_spin_lock
5.46% t libpthread-2.13.so [.] pthread_mutex_lock
4.16% t libpthread-2.13.so [.] __pthread_mutex_unlock
3.65% t libpthread-2.13.so [.] __lll_lock_wait
3.35% t [kernel.kallsyms] [k] copy_user_generic_string
3.09% t 40395b [.] 40395b
2.91% t [kernel.kallsyms] [k] hash_futex
2.87% t [kernel.kallsyms] [k] futex_wake
2.67% t t [.] f
2.37% t [kernel.kallsyms] [k] futex_wait_setup
1.61% t [kernel.kallsyms] [k] system_call
1.00% t [kernel.kallsyms] [k] system_call_after_swapgs
0.98% t [kernel.kallsyms] [k] __switch_to
0.96% t libpthread-2.13.so [.] __lll_unlock_wake
0.95% t [kernel.kallsyms] [k] get_futex_key
0.92% t libc-2.13.so [.] __strlen_sse42
[...]
vs.
# Events: 15K cycles
#
# Overhead Command Shared Object Symbol
# ........ ....... .................. ....................................
#
49.55% :29571 7f1ed3830a86 [.] 7f1ed3830a86
8.67% t libc-2.13.so [.] __malloc
8.03% t libc-2.13.so [.] cfree
4.88% t t [.] std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(char const*, std::allocator<char> const&)
4.67% t libc-2.13.so [.] __strlen_sse42
4.14% t libpthread-2.13.so [.] __pthread_mutex_unlock
3.62% t libpthread-2.13.so [.] pthread_mutex_lock
2.64% t t [.] f
2.40% t libc-2.13.so [.] _int_malloc
2.10% t libc-2.13.so [.] _int_free
1.53% t libpthread-2.13.so [.] __pthread_mutex_init_internal
1.33% t t [.] operator new(unsigned long)
1.08% t t [.] X::numpunct::do_grouping() const
1.04% t libc-2.13.so [.] __memcpy_sse2
0.89% t t [.] X::get_data(X::facet*)
0.51% t t [.] __rw::__rw_allocate(unsigned long, int)
0.45% t t [.] strlen@plt
0.33% t t [.] __rw::__rw_deallocate(void*, unsigned long, int)
0.32% t libpthread-2.13.so [.] pthread_mutex_destroy
0.21% t t [.] pthread_mutex_unlock@plt
0.21% t t [.] pthread_mutex_init@plt
0.18% t t [.] pthread_mutex_destroy@plt
0.17% t t [.] operator delete(void*)
0.17% t t [.] malloc@plt
0.16% t t [.] memcpy@plt
0.15% :29571 [kernel.kallsyms] [k] hpet_msi_next_event
0.12% t libc-2.13.so [.] __strrchr_sse2
0.12% t t [.] pthread_mutex_lock@plt
0.11% t t [.] free@plt
[...]
Now, if you remember from the previous measurements, the ranking of the two
versions was reversed in the single-threaded case, i.e., the single-threaded
program performed better with the cached version. Here is the report for a
single-threaded run of the cached version (one thread over 1000M loops; with
fewer loops the profile does not register anything):
# Events: 36K cycles
#
# Overhead Command Shared Object Symbol
# ........ ....... .................. ........................
#
37.68% t libpthread-2.13.so [.] __pthread_mutex_unlock
31.04% t libpthread-2.13.so [.] pthread_mutex_lock
11.87% t t [.] f
9.64% :27362 7fd94e2ebd6d [.] 7fd94e2ebd6d
3.21% t libc-2.13.so [.] __strlen_sse42
2.38% t t [.] pthread_mutex_unlock@plt
2.27% t t [.] pthread_mutex_lock@plt
1.61% t t [.] strlen@plt
[...]
The single-threaded run of the non-cached version yields a report identical to
the one for the multi-threaded run.
I take two things away from this round of measurements. First, performance
suffers on multicores under heavy contention, possibly as a direct result of
copying the reference-counted std::string objects; I will follow up with
direct measurements (a sketch of what I have in mind follows below). Second,
perhaps we should not focus on this particular usage model for the numpunct
facet, because it may not be very representative of `normal' usage.
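Something along the lines of the sketch below should isolate the copy itself
(the names and the string literals are made up, this is not meant as a final
benchmark): every thread copies the same std::string in a tight loop, and
compiling with -DPRIVATE_COPIES gives each thread its own private source
string for comparison. Running the two binaries under perf stat or time should
show whether the copy alone reproduces the contention.

#include <string>
#include <cstring>
#include <cstdlib>
#include <pthread.h>

static long nloops   = 10000000;
static long nthreads = 16;

// stands in for the facet's cached _C_grouping member
static std::string shared_src ("\3\3");

extern "C" void*
copy_loop (void*)
{
#if defined (PRIVATE_COPIES)
    // each thread copies from its own private string: no sharing at all
    const std::string src ("\3\3");
#else
    // every thread copies from the same string: with a reference-counted
    // implementation every copy/destruction updates the shared count
    const std::string& src = shared_src;
#endif

    volatile unsigned long n = 0;

    for (long i = 0; i < nloops; ++i) {
        const std::string copy (src);    // the operation under test
        n += strlen (copy.c_str ());
    }

    return (void*)n;
}

int
main (int argc, char** argv)
{
    if (argc > 1) nthreads = atol (argv [1]);
    if (argc > 2) nloops   = atol (argv [2]);

    if (nthreads > 128)
        nthreads = 128;

    pthread_t tid [128] = { 0 };

    for (long i = 0; i < nthreads; ++i)
        if (pthread_create (tid + i, 0, copy_loop, 0))
            return 1;

    for (long i = 0; i < nthreads; ++i)
        pthread_join (tid [i], 0);

    return 0;
}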
HTH
Liviu
* With caching
** Stats counters
*** Contended run
16, 10000000
Performance counter stats for './t':
121168.026301 task-clock-msecs # 3.778 CPUs
6133377 context-switches # 0.051 M/sec
35457 CPU-migrations # 0.000 M/sec
298 page-faults # 0.000 M/sec
338006437277 cycles # 2789.568 M/sec
136554855096 instructions # 0.404 IPC
31298346444 branches # 258.305 M/sec
2305761092 branch-misses # 7.367 %
1592157186 cache-references # 13.140 M/sec
1729755 cache-misses # 0.014 M/sec
32.070381363 seconds time elapsed
** Report
# Events: 134K cycles
#
# Overhead Command Shared Object Symbol
# ........ ....... .................. ...................................
#
50.69% t [kernel.kallsyms] [k] _raw_spin_lock
5.46% t libpthread-2.13.so [.] pthread_mutex_lock
4.16% t libpthread-2.13.so [.] __pthread_mutex_unlock
3.65% t libpthread-2.13.so [.] __lll_lock_wait
3.35% t [kernel.kallsyms] [k] copy_user_generic_string
3.09% t 40395b [.] 40395b
2.91% t [kernel.kallsyms] [k] hash_futex
2.87% t [kernel.kallsyms] [k] futex_wake
2.67% t t [.] f
2.37% t [kernel.kallsyms] [k] futex_wait_setup
1.61% t [kernel.kallsyms] [k] system_call
1.00% t [kernel.kallsyms] [k] system_call_after_swapgs
0.98% t [kernel.kallsyms] [k] __switch_to
0.96% t libpthread-2.13.so [.] __lll_unlock_wake
0.95% t [kernel.kallsyms] [k] get_futex_key
0.92% t libc-2.13.so [.] __strlen_sse42
[...]
*** Uncontended run
1, 100000000
# Events: 3K cycles
#
# Overhead Command Shared Object Symbol
# ........ ....... ................. .....................
#
99.35% :15870 7f082cdda456 [.] 7f082cdda456
0.46% t [kernel.kallsyms] [k] recalc_sigpending_tsk
0.19% :15870 [kernel.kallsyms] [k] hpet_msi_next_event
0.01% t [kernel.kallsyms] [k] __free_pipe_info
0.00% t [kernel.kallsyms] [k] intel_pmu_enable_all
* Without caching
** Stats counters
16, 10000000
Performance counter stats for './t':
16194.235282 task-clock-msecs # 3.883 CPUs
7491 context-switches # 0.000 M/sec
47 CPU-migrations # 0.000 M/sec
311 page-faults # 0.000 M/sec
45210281987 cycles # 2791.752 M/sec
76784904688 instructions # 1.698 IPC
18891711522 branches # 1166.570 M/sec
3162251552 branch-misses # 16.739 %
3332839 cache-references # 0.206 M/sec
119424 cache-misses # 0.007 M/sec
4.170904279 seconds time elapsed
** Report
# Events: 15K cycles
#
# Overhead Command Shared Object Symbol
# ........ ....... .................. ....................................
#
49.55% :29571 7f1ed3830a86 [.] 7f1ed3830a86
8.67% t libc-2.13.so [.] __malloc
8.03% t libc-2.13.so [.] cfree
4.88% t t [.] std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(char const*, std::allocator<char> const&)
4.67% t libc-2.13.so [.] __strlen_sse42
4.14% t libpthread-2.13.so [.] __pthread_mutex_unlock
3.62% t libpthread-2.13.so [.] pthread_mutex_lock
2.64% t t [.] f
2.40% t libc-2.13.so [.] _int_malloc
2.10% t libc-2.13.so [.] _int_free
1.53% t libpthread-2.13.so [.] __pthread_mutex_init_internal
1.33% t t [.] operator new(unsigned long)
1.08% t t [.] X::numpunct::do_grouping() const
1.04% t libc-2.13.so [.] __memcpy_sse2
0.89% t t [.] X::get_data(X::facet*)
0.51% t t [.] __rw::__rw_allocate(unsigned long, int)
0.45% t t [.] strlen@plt
0.33% t t [.] __rw::__rw_deallocate(void*, unsigned long, int)
0.32% t libpthread-2.13.so [.] pthread_mutex_destroy
0.21% t t [.] pthread_mutex_unlock@plt
0.21% t t [.] pthread_mutex_init@plt
0.18% t t [.] pthread_mutex_destroy@plt
0.17% t t [.] operator delete(void*)
0.17% t t [.] malloc@plt
0.16% t t [.] memcpy@plt
0.15% :29571 [kernel.kallsyms] [k] hpet_msi_next_event
0.12% t libc-2.13.so [.] __strrchr_sse2
0.12% t t [.] pthread_mutex_lock@plt
0.11% t t [.] free@plt
[...]
#include <iostream>
#include <locale>
#include <string>
#include <cstdio>
#include <cstdlib>
#include <cstring>

#include <pthread.h>
#include <unistd.h>

#define MAX_THREADS 128

static long nloops = 10000000, nthreads = 16;
static bool volatile pwait = true;

////////////////////////////////////////////////////////////////////////

namespace X {

// RAII guard for a pthread mutex
struct guard
{
    guard (pthread_mutex_t* ptr) : lock_ (ptr) {
        pthread_mutex_lock (lock_);
    }

    ~guard () {
        pthread_mutex_unlock (lock_);
    }

private:
    // not copyable
    guard (guard const&);
    guard& operator= (guard const&);

private:
    pthread_mutex_t* lock_;
};


// simplified stand-in for the library facet base class
struct facet
{
    facet () : _C_impsize (0), _C_impdata (0) {
        pthread_mutex_init (&_C_mutex, 0);
    }

    void* _C_data () {
        // fast path once the data has been lazily initialized
        return _C_impsize ? _C_impdata : _C_get_data ();
    }

    void* _C_get_data ();

    size_t          _C_impsize;
    void*           _C_impdata;
    pthread_mutex_t _C_mutex;
};


void* facet::_C_get_data ()
{
    if (_C_impsize)
        return _C_impdata;

    // double-checked locking: re-test under the facet mutex
    guard g (&_C_mutex);

    if (_C_impsize)
        return _C_impdata;

#if !defined (NO_STDCXX_LOCALES)
    _C_impdata = (void*)"\3\3";
#endif   // NO_STDCXX_LOCALES

    _C_impsize = (size_t)(-1);

    return _C_impdata;
}


void* get_data (facet*);


// simplified stand-in for the library numpunct facet
struct numpunct : facet
{
    numpunct () : _C_flags () { }

    virtual std::string do_grouping () const;

    std::string grouping () const {
        numpunct* const __self = const_cast< numpunct* > (this);

#if !defined (NO_CACHE)
        // cache the result of the first call and return a copy of the
        // cached string on every subsequent call
        if (!(_C_flags & 1)) {
            __self->_C_flags |= 1;
            __self->_C_grouping = do_grouping ();
        }
        return _C_grouping;
#else
        // no caching: construct a fresh string on every call
        return do_grouping ();
#endif   // NO_CACHE
    }

private:
    int         _C_flags;
    std::string _C_grouping;
};


std::string numpunct::do_grouping () const
{
    return (const char*)get_data (const_cast<numpunct*>(this));
}


void* get_data (facet* pfacet)
{
    void* pdata = pfacet->_C_data ();
    if (pdata)
        return pdata;

    // __rw_setlocale loc (...);   <- other mutex

    if (pfacet->_C_data ())
        return get_data (pfacet);

    pfacet->_C_impdata = const_cast<char*>("\4\4");
    pfacet->_C_impsize = (size_t)(-1);

    return 0;
}

}   // namespace X

////////////////////////////////////////////////////////////////////////

typedef X::numpunct Numpunct;

extern "C" {

// thread function: call grouping() nloops times on the shared facet
static void*
f (void* pv)
{
    Numpunct const& fac = *reinterpret_cast< Numpunct* > (pv);

    volatile unsigned long n = 0;

    // spin until main() releases all the threads at once
    while (pwait) ;

    for (long i = 0; i < nloops; ++i) {
        const std::string grouping = fac.grouping ();
        n += strlen (grouping.c_str ());
    }

    return (void*)n;
}

}   // extern "C"


int
main (int argc, char** argv)
{
    switch (argc) {
    case 3:
        nloops = atol (argv [2]);
        // fall through
    case 2:
        nthreads = atol (argv [1]);
        break;
    }

    pthread_t tid [MAX_THREADS] = { 0 };

    if (nthreads > MAX_THREADS)
        nthreads = MAX_THREADS;

    printf ("%ld, %ld\n", nthreads, nloops);

    pthread_setconcurrency (nthreads);

    Numpunct fac;

    // prime the facet (and, with caching, its _C_grouping member) before
    // the worker threads start
    fac.grouping ();

    for (long i = 0; i < nthreads; ++i) {
        if (pthread_create (tid + i, 0, f, const_cast<Numpunct*> (&fac)))
            exit (-1);
    }

    // give the threads a chance to reach the spin loop, then release them
    usleep (50);
    pwait = false;

    for (long i = 0; i < nthreads; ++i) {
        if (tid [i])
            pthread_join (tid [i], 0);
    }

    return 0;
}