// On Sun, Feb 17, 2019 at 12:34 AM Chris M. Thomasson <cris...@charter.net> wrote:
// > Fwiw, here is a simple benchmark for an older read/write algorithm I invented:
// >
// >     https://pastebin.com/raw/xCBHY9qd
// >
// > It should compile fine on MSVC 2017 and GCC with -std=c++17. There is a macro
// > in the code called CT_TEST_FAST_MUTEX. If it is defined then the code will
// > test my algorithm. If not defined then it will test against
// > std::shared_mutex. I am wondering if anybody can compile and run the code?
// > Test it out on as many operating systems as you can. Can you please show the
// > output?
// >
// > Thanks everybody. :^)
// >
// > Here is the code:

/* Simple, crude read/write mutex test
   by: Chris M. Thomasson
   __________________________________________*/


#include <thread>
#include <atomic>
#include <shared_mutex>
#include <condition_variable>
#include <iostream>
#include <functional>
#include <cassert>
#include <cstdlib>
#include <ctime>
#include <chrono>   // steady_clock for wall-clock timing (see main)
#include <climits>  // LONG_MAX -- was missing in the original post; RWMUTEX_COUNT_MAX needs it


#define THREADS 16UL
#define ITERS 10000000UL
#define COUNT (THREADS * ITERS)


// undefine to test std::shared_mutex
#define CT_TEST_FAST_MUTEX 1


// Bare-bones counting semaphore built from a mutex + condvar.
// (std::counting_semaphore is C++20; this code targets C++17.)
struct ct_slow_semaphore
{
    unsigned long m_state;           // current permit count
    std::mutex m_mutex;              // guards m_state
    std::condition_variable m_cond;  // signaled when permits are released

    ct_slow_semaphore(unsigned long state) : m_state(state) {}

    // Release one permit; wakes at most one waiter.
    void inc()
    {
        {
            std::unique_lock<std::mutex> lock(m_mutex);
            ++m_state;
        } // drop the lock before notifying so the waiter doesn't wake into a held mutex

        m_cond.notify_one();
    }

    // Release `addend` permits at once; wakes every waiter.
    void add(unsigned long addend)
    {
        {
            std::unique_lock<std::mutex> lock(m_mutex);
            m_state += addend;
        }

        m_cond.notify_all();
    }

    // Acquire one permit, blocking while none are available.
    void dec()
    {
        std::unique_lock<std::mutex> lock(m_mutex);
        while (m_state == 0) m_cond.wait(lock);
        --m_state;
    }
};


// bin-sema: auto-reset event — wait() consumes the pending signal.
struct ct_auto_reset_event
{
    bool m_state;                    // true while a signal is pending
    std::mutex m_mutex;
    std::condition_variable m_cond;

    ct_auto_reset_event() : m_state(false) {}

    // Latch a signal and wake at most one waiter.
    void signal()
    {
        std::unique_lock<std::mutex> lock(m_mutex);
        m_state = true;
        m_cond.notify_one();
    }

    // Block until signaled, then consume the signal.
    void wait()
    {
        std::unique_lock<std::mutex> lock(m_mutex);
        while (m_state == false) m_cond.wait(lock);
        m_state = false; // auto-reset
    }
};


// just a layer over an auto-reset event
// m_state: 0 = unlocked, 1 = locked with no waiters, 2 = locked with possible waiters.
struct ct_fast_mutex
{
    std::atomic<unsigned int> m_state;
    ct_auto_reset_event m_waitset;

    ct_fast_mutex() : m_state(0) {}

    void lock()
    {
        // Fast path: uncontended 0 -> 1 transition costs one atomic exchange.
        if (m_state.exchange(1, std::memory_order_acquire))
        {
            // Contended: advertise a (possible) waiter by setting state 2, then block.
            while (m_state.exchange(2, std::memory_order_acquire))
            {
                m_waitset.wait();
            }
        }
    }

    void unlock()
    {
        // Only pay for a signal when the state said a waiter may exist (2).
        if (m_state.exchange(0, std::memory_order_release) == 2)
        {
            m_waitset.signal();
        }
    }
};



// Chris M. Thomassons Experimental Read/Write Mutex
// Yeah, it is pretty damn fat wrt the state, however
// it has some interesting properties...
// The state can be compressed a bit...
// btw, it has no loops...
// Take a look at the lock_shared and unlock_shared functions

#define RWMUTEX_COUNT_MAX LONG_MAX

struct ct_rwmutex
{
    // shared state
    std::atomic<long> m_wrstate;  // NOTE(review): initialized to 1 but never read/written afterwards in this snippet
    std::atomic<long> m_count;    // RWMUTEX_COUNT_MAX minus active readers; driven negative while a writer holds the lock
    std::atomic<long> m_rdwake;   // count of readers a waiting writer must still drain

    ct_slow_semaphore m_rdwset;   // readers block here while a writer is in
    ct_slow_semaphore m_wrwset;   // the writer blocks here waiting for readers to leave
    ct_fast_mutex m_wrlock;       // serializes writers against each other


    ct_rwmutex() :
        m_wrstate(1),
        m_count(RWMUTEX_COUNT_MAX),
        m_rdwake(0),
        m_rdwset(0),
        m_wrwset(0) {
    }


    // READ, pretty slim...
    void lock_shared()
    {
        // Take a reader slot; a non-positive prior value means a writer holds
        // (or is acquiring) the lock, so block on the reader waitset.
        if (m_count.fetch_add(-1, std::memory_order_acquire) < 1)
        {
            m_rdwset.dec();
        }
    }

    void unlock_shared()
    {
        // Return the reader slot; a negative prior value means a writer is
        // waiting for readers to drain.
        if (m_count.fetch_add(1, std::memory_order_release) < 0)
        {
            // If we were the last reader the writer was waiting on, release it.
            if (m_rdwake.fetch_add(-1, std::memory_order_acq_rel) == 1)
            {
                m_wrwset.inc();
            }
        }
    }


    // WRITE, more hefty
    void lock()
    {
        // One writer at a time.
        m_wrlock.lock();

        // Claim the whole count; the prior value reveals how many readers are active.
        long count = m_count.fetch_add(-RWMUTEX_COUNT_MAX,
            std::memory_order_acquire);

        if (count < RWMUTEX_COUNT_MAX)
        {
            // RWMUTEX_COUNT_MAX - count == number of active readers at claim time.
            long rdwake = m_rdwake.fetch_add(RWMUTEX_COUNT_MAX - count,
                std::memory_order_acquire);

            // Non-zero => readers are still inside; wait for the last one to signal.
            if (rdwake + RWMUTEX_COUNT_MAX - count)
            {
                m_wrwset.dec();
            }
        }
    }

    // write unlock
    void unlock()
    {
        // Give the count back; a negative prior value tells how many readers
        // queued up while we held the lock.
        long count = m_count.fetch_add(RWMUTEX_COUNT_MAX,
            std::memory_order_release);

        if (count < 0)
        {
            // Release every reader that blocked during the write.
            m_rdwset.add(-count);
        }

        m_wrlock.unlock();
    }
};


// Shared benchmark state: a counter protected by the mutex under test.
struct ct_shared
{
    std::atomic<unsigned long> m_state;

#if defined (CT_TEST_FAST_MUTEX)
    ct_rwmutex m_std_rwmutex;
#else
    std::shared_mutex m_std_rwmutex;
#endif

    ct_shared() : m_state(0) {}
};


// Worker: alternates one exclusive increment and one shared read per iteration.
// Periodic yields perturb scheduling to induce contention.
void ct_thread(ct_shared& shared, std::size_t index)
{
    for (unsigned int i = 0; i < ITERS; ++i)
    {

        shared.m_std_rwmutex.lock();
        if (i % 256 == 0) std::this_thread::yield();
        shared.m_state += 1;
        shared.m_std_rwmutex.unlock();


        shared.m_std_rwmutex.lock_shared();
        if (i % 512 == 0) std::this_thread::yield();
        //shared.m_state += 1;
        shared.m_std_rwmutex.unlock_shared();

    }
}


int main()
{
    ct_shared shared;

    std::cout << "Testing: ";

#if defined (CT_TEST_FAST_MUTEX)
    std::cout << "Chris M. Thomasson's Experimental Read/Write Mutex\n\n";
#else
    std::cout << "std::shared_mutex\n\n";
#endif

    {
        std::thread threads[THREADS];

        // Wall-clock timing. The original used std::clock(), which on POSIX
        // reports CPU time summed across ALL threads — wildly inflating the
        // reported msec for a multithreaded benchmark.
        auto start = std::chrono::steady_clock::now();

        for (std::size_t i = 0; i < THREADS; ++i)
        {
            threads[i] = std::thread(ct_thread, std::ref(shared), i);
        }

        for (std::size_t i = 0; i < THREADS; ++i)
        {
            threads[i].join();
        }

        auto diff = std::chrono::steady_clock::now() - start;

        unsigned long msec = (unsigned long)
            std::chrono::duration_cast<std::chrono::milliseconds>(diff).count();

        std::cout << "msec = " << msec << "\n";
    }

    std::cout << "shared.m_state = " << shared.m_state << "\n";
    std::cout << "\n\nFin!\n";

    // Every thread performed ITERS exclusive increments.
    assert(shared.m_state == COUNT);

    return 0;
}
Hey Chris! Here are my results on i7-8650U (4 HT cores) on Debian 4.19.16-1 gcc 7.3.0 std::shared_mutex msec = 56118 msec = 49149 msec = 69416 msec = 68737 msec = 59174 msec = 65915 ct_rwmutex msec = 62772 msec = 71139 msec = 68110 msec = 66038 msec = 60436 msec = 74238 I can also test on a 2 CPU system with 88 hardware threads in total, but the bench hardcodes 16 threads (there is std::thread::hardware_concurrency() for querying this at runtime). I've found that it's critical to model a realistic/target ratio between reads/writes and the amount of local work and work inside of the critical section in such benchmarks. Otherwise in 100% synthetic hammering usually the worst mutex shows the best results. Synthetic benchmarks produce extreme contention, so a mutex that blocks threads as much as possible and allows as few threads as possible to work, shows the best result because contention is minimized. The best would be to run all threads sequentially one-by-one. But that's counterproductive for real life because blocked threads don't do useful work too. Also this benchmark has a fixed amount of work per thread, so I suspect it may be subject to load imbalance when an unlucky outlier delays the total result. A better option would be to run all threads for X seconds and then set a global var for all of them to stop and then join all of them and measure the number of iterations. Also for me it lacked #include <climits>. -- --- You received this message because you are subscribed to the Google Groups "Scalable Synchronization Algorithms" group. To unsubscribe from this group and stop receiving emails from it, send an email to lock-free+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/lock-free/CAEeQi3toqsG_1UeXxV691FNiMSoBWEmi9gXnE5Jo_Fq6wUpsxA%40mail.gmail.com. For more options, visit https://groups.google.com/d/optout.