// On Sun, Feb 17, 2019 at 12:34 AM Chris M. Thomasson <cris...@charter.net> wrote:
// > Fwiw, here is a simple benchmark for an older read/write algorithm I invented:
// >
// >     https://pastebin.com/raw/xCBHY9qd
// >
// > It should compile fine on MSVC 2017 and GCC with -std=c++17. There is a macro
// > in the code called CT_TEST_FAST_MUTEX. If it is defined then the code will
// > test my algorithm. If not defined then it will test against
// > std::shared_mutex. I am wondering if anybody can compile and run the code?
// > Test it out on as many operating systems as you can. Can you please show the
// > output?
// >
// > Thanks everybody. :^)
// >
// > Here is the code:

/* Simple, crude read/write mutex test
   by: Chris M. Thomasson
   __________________________________________*/


#include <thread>
#include <atomic>
#include <shared_mutex>
#include <condition_variable>
#include <iostream>
#include <functional>
#include <cassert>
#include <cstdlib>
#include <ctime>
#include <chrono>   // steady_clock for wall-clock timing (see main)
#include <climits>  // LONG_MAX -- was missing in the original post; RWMUTEX_COUNT_MAX needs it


#define THREADS 16UL
#define ITERS 10000000UL
#define COUNT (THREADS * ITERS)


// undefine to test std::shared_mutex
#define CT_TEST_FAST_MUTEX 1


// Bare-bones counting semaphore built from a mutex + condvar.
// (std::counting_semaphore is C++20; this code targets C++17.)
struct ct_slow_semaphore
{
    unsigned long m_state;           // current permit count
    std::mutex m_mutex;              // guards m_state
    std::condition_variable m_cond;  // signaled when permits are released

    ct_slow_semaphore(unsigned long state) : m_state(state) {}

    // Release one permit; wakes at most one waiter.
    void inc()
    {
        {
            std::unique_lock<std::mutex> lock(m_mutex);
            ++m_state;
        } // drop the lock before notifying so the waiter doesn't wake into a held mutex

        m_cond.notify_one();
    }

    // Release `addend` permits at once; wakes every waiter.
    void add(unsigned long addend)
    {
        {
            std::unique_lock<std::mutex> lock(m_mutex);
            m_state += addend;
        }

        m_cond.notify_all();
    }

    // Acquire one permit, blocking while none are available.
    void dec()
    {
        std::unique_lock<std::mutex> lock(m_mutex);
        while (m_state == 0) m_cond.wait(lock);
        --m_state;
    }
};


// bin-sema: auto-reset event — wait() consumes the pending signal.
struct ct_auto_reset_event
{
    bool m_state;                    // true while a signal is pending
    std::mutex m_mutex;
    std::condition_variable m_cond;

    ct_auto_reset_event() : m_state(false) {}

    // Latch a signal and wake at most one waiter.
    void signal()
    {
        std::unique_lock<std::mutex> lock(m_mutex);
        m_state = true;
        m_cond.notify_one();
    }

    // Block until signaled, then consume the signal.
    void wait()
    {
        std::unique_lock<std::mutex> lock(m_mutex);
        while (m_state == false) m_cond.wait(lock);
        m_state = false; // auto-reset
    }
};


// just a layer over an auto-reset event
// m_state: 0 = unlocked, 1 = locked with no waiters, 2 = locked with possible waiters.
struct ct_fast_mutex
{
    std::atomic<unsigned int> m_state;
    ct_auto_reset_event m_waitset;

    ct_fast_mutex() : m_state(0) {}

    void lock()
    {
        // Fast path: uncontended 0 -> 1 transition costs one atomic exchange.
        if (m_state.exchange(1, std::memory_order_acquire))
        {
            // Contended: advertise a (possible) waiter by setting state 2, then block.
            while (m_state.exchange(2, std::memory_order_acquire))
            {
                m_waitset.wait();
            }
        }
    }

    void unlock()
    {
        // Only pay for a signal when the state said a waiter may exist (2).
        if (m_state.exchange(0, std::memory_order_release) == 2)
        {
            m_waitset.signal();
        }
    }
};



// Chris M. Thomassons Experimental Read/Write Mutex
// Yeah, it is pretty damn fat wrt the state, however
// it has some interesting properties...
// The state can be compressed a bit...
// btw, it has no loops...
// Take a look at the lock_shared and unlock_shared functions

#define RWMUTEX_COUNT_MAX LONG_MAX

struct ct_rwmutex
{
    // shared state
    std::atomic<long> m_wrstate;  // NOTE(review): initialized to 1 but never read/written afterwards in this snippet
    std::atomic<long> m_count;    // RWMUTEX_COUNT_MAX minus active readers; driven negative while a writer holds the lock
    std::atomic<long> m_rdwake;   // count of readers a waiting writer must still drain

    ct_slow_semaphore m_rdwset;   // readers block here while a writer is in
    ct_slow_semaphore m_wrwset;   // the writer blocks here waiting for readers to leave
    ct_fast_mutex m_wrlock;       // serializes writers against each other


    ct_rwmutex() :
        m_wrstate(1),
        m_count(RWMUTEX_COUNT_MAX),
        m_rdwake(0),
        m_rdwset(0),
        m_wrwset(0) {
    }


    // READ, pretty slim...
    void lock_shared()
    {
        // Take a reader slot; a non-positive prior value means a writer holds
        // (or is acquiring) the lock, so block on the reader waitset.
        if (m_count.fetch_add(-1, std::memory_order_acquire) < 1)
        {
            m_rdwset.dec();
        }
    }

    void unlock_shared()
    {
        // Return the reader slot; a negative prior value means a writer is
        // waiting for readers to drain.
        if (m_count.fetch_add(1, std::memory_order_release) < 0)
        {
            // If we were the last reader the writer was waiting on, release it.
            if (m_rdwake.fetch_add(-1, std::memory_order_acq_rel) == 1)
            {
                m_wrwset.inc();
            }
        }
    }


    // WRITE, more hefty
    void lock()
    {
        // One writer at a time.
        m_wrlock.lock();

        // Claim the whole count; the prior value reveals how many readers are active.
        long count = m_count.fetch_add(-RWMUTEX_COUNT_MAX,
            std::memory_order_acquire);

        if (count < RWMUTEX_COUNT_MAX)
        {
            // RWMUTEX_COUNT_MAX - count == number of active readers at claim time.
            long rdwake = m_rdwake.fetch_add(RWMUTEX_COUNT_MAX - count,
                std::memory_order_acquire);

            // Non-zero => readers are still inside; wait for the last one to signal.
            if (rdwake + RWMUTEX_COUNT_MAX - count)
            {
                m_wrwset.dec();
            }
        }
    }

    // write unlock
    void unlock()
    {
        // Give the count back; a negative prior value tells how many readers
        // queued up while we held the lock.
        long count = m_count.fetch_add(RWMUTEX_COUNT_MAX,
            std::memory_order_release);

        if (count < 0)
        {
            // Release every reader that blocked during the write.
            m_rdwset.add(-count);
        }

        m_wrlock.unlock();
    }
};


// Shared benchmark state: a counter protected by the mutex under test.
struct ct_shared
{
    std::atomic<unsigned long> m_state;

#if defined (CT_TEST_FAST_MUTEX)
    ct_rwmutex m_std_rwmutex;
#else
    std::shared_mutex m_std_rwmutex;
#endif

    ct_shared() : m_state(0) {}
};


// Worker: alternates one exclusive increment and one shared read per iteration.
// Periodic yields perturb scheduling to induce contention.
void ct_thread(ct_shared& shared, std::size_t index)
{
    for (unsigned int i = 0; i < ITERS; ++i)
    {

        shared.m_std_rwmutex.lock();
        if (i % 256 == 0) std::this_thread::yield();
        shared.m_state += 1;
        shared.m_std_rwmutex.unlock();


        shared.m_std_rwmutex.lock_shared();
        if (i % 512 == 0) std::this_thread::yield();
        //shared.m_state += 1;
        shared.m_std_rwmutex.unlock_shared();

    }
}


int main()
{
    ct_shared shared;

    std::cout << "Testing: ";

#if defined (CT_TEST_FAST_MUTEX)
    std::cout << "Chris M. Thomasson's Experimental Read/Write Mutex\n\n";
#else
    std::cout << "std::shared_mutex\n\n";
#endif

    {
        std::thread threads[THREADS];

        // Wall-clock timing. The original used std::clock(), which on POSIX
        // reports CPU time summed across ALL threads — wildly inflating the
        // reported msec for a multithreaded benchmark.
        auto start = std::chrono::steady_clock::now();

        for (std::size_t i = 0; i < THREADS; ++i)
        {
            threads[i] = std::thread(ct_thread, std::ref(shared), i);
        }

        for (std::size_t i = 0; i < THREADS; ++i)
        {
            threads[i].join();
        }

        auto diff = std::chrono::steady_clock::now() - start;

        unsigned long msec = (unsigned long)
            std::chrono::duration_cast<std::chrono::milliseconds>(diff).count();

        std::cout << "msec = " << msec << "\n";
    }

    std::cout << "shared.m_state = " << shared.m_state << "\n";
    std::cout << "\n\nFin!\n";

    // Every thread performed ITERS exclusive increments.
    assert(shared.m_state == COUNT);

    return 0;
}
Hey Chris! Here are my results on i7-8650U (4 HT cores) on Debian 4.19.16-1 gcc 7.3.0 std::shared_mutex msec = 56118 msec = 49149 msec = 69416 msec = 68737 msec = 59174 msec = 65915 ct_rwmutex msec = 62772 msec = 71139 msec = 68110 msec = 66038 msec = 60436 msec = 74238 I can also test on a 2 CPU system with 88 hardware threads in total, but the bench hardcodes 16 threads (there is std::thread::hardware_concurrency() for querying this at runtime). I've found that it's critical to model a realistic/target ratio between reads/writes and the amount of local work and work inside of the critical section in such benchmarks. Otherwise in 100% synthetic hammering usually the worst mutex shows the best results. Synthetic benchmarks produce extreme contention, so a mutex that blocks threads as much as possible and allows as few threads as possible to work, shows the best result because contention is minimized. The best would be to run all threads sequentially one-by-one. But that's counterproductive for real life because blocked threads don't do useful work too. Also this benchmark has a fixed amount of work per thread, so I suspect it may be subject to load imbalance when an unlucky outlier delays the total result. A better option would be to run all threads for X seconds and then set a global var for all of them to stop and then join all of them and measure the number of iterations. Also for me it lacked #include <climits>. -- --- You received this message because you are subscribed to the Google Groups "Scalable Synchronization Algorithms" group. To unsubscribe from this group and stop receiving emails from it, send an email to lock-free+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/lock-free/CAEeQi3toqsG_1UeXxV691FNiMSoBWEmi9gXnE5Jo_Fq6wUpsxA%40mail.gmail.com. For more options, visit https://groups.google.com/d/optout.