On Wed, Feb 20, 2019 at 7:51 AM Chris M. Thomasson <cris...@charter.net> wrote: > > Fwiw, I wrote a crude new benchmark that measures how many reads and writes > can be performed in a given amount of time. My algorithm vs > std::shared_mutex. So, we are _primarily_ looking for how many reads can be > performed in this test at 60 seconds. The number of threads is variable and > is determined by std::hardware_concurrency * THREADS, set at 8 in the test. > So on my system the setup is: > ___________________________________ > cpu_threads_n = 4 > threads_n = 32 > writers = 16 > readers = 16 > test duration = 60 seconds > ___________________________________ > > This is going to be different for each system. Readers iterate a shared > linked list with 1000000 nodes in this test. Writers pop and push items, and > never use new or delete. Well, so far, the timings I am getting on my end > using MSVC 2017 is: > > > Testing: Chris M. Thomasson's Experimental Read/Write Mutex > > ___________________________________ > cpu_threads_n = 4 > threads_n = 32 > writers = 16 > readers = 16 > test duration = 60 seconds > ___________________________________ > > > ___________________________________ > Raw Reads: 18046464 > Raw Writes: 347498 > reads_per_tick = 300693 > writes_per_tick = 5790 > Ticks = 60.0161 > ___________________________________ > > > > Testing: std::shared_mutex > > ___________________________________ > cpu_threads_n = 4 > threads_n = 32 > writers = 16 > readers = 16 > test duration = 60 seconds > ___________________________________ > > > ___________________________________ > Raw Reads: 650006 > Raw Writes: 168741 > reads_per_tick = 10829 > writes_per_tick = 2811 > Ticks = 60.0193 > ___________________________________ > > > > As you can see, my algorithm is performing many more reads than > std::shared_mutex. Anyway, here is my code: > > When you get some time, can you please give it a go? I want to see timings on > other systems, the good, bad and ugly. ;^) Thanks. > > https://pastebin.com/raw/1QtPCGhV > ___________________________________ > /* Crude Read/Write Mutex Benchmark > > This tests my algorithm ct_rwmutex vs. std::shared_mutex. > > It shows how many reads and writes can be performed within > a fixed amount of time. > > by: Chris M. Thomasson > __________________________________________*/ > > > > #include <thread> > #include <atomic> > #include <shared_mutex> > #include <condition_variable> > #include <iostream> > #include <functional> > #include <chrono> > #include <cassert> > #include <cstdlib> > #include <ctime> > #include <climits> > #include <cstdint> > #include <vector> > > > // undefine to test std::shared_mutex > #define CT_TEST_FAST_MUTEX 1 > > > #define THREADS 8 // multiplied by std::hardware_concurrency > #define NODE_PRIME 1000000 // number of nodes > #define CRUDE_CACHE_PAD 256 // crude cache pad > #define TEST_DURATION_SEC 60 // number of seconds for the test > > > // bare bones mutex/condvar based semaphore > struct ct_slow_semaphore > { > unsigned long m_state; > std::mutex m_mutex; > std::condition_variable m_cond; > > ct_slow_semaphore(unsigned long state) : m_state(state) {} > > void inc() > { > m_mutex.lock(); > ++m_state; > m_mutex.unlock(); > m_cond.notify_one(); > } > > void add(unsigned long addend) > { > { > std::unique_lock<std::mutex> lock(m_mutex); > m_state += addend; > } > > m_cond.notify_all(); > } > > void dec() > { > std::unique_lock<std::mutex> lock(m_mutex); > while (m_state == 0) m_cond.wait(lock); > --m_state; > } > }; > > > > > // bin-sema > struct ct_auto_reset_event > { > bool m_state; > std::mutex m_mutex; > std::condition_variable m_cond; > > ct_auto_reset_event() : m_state(false) {} > > void signal() > { > std::unique_lock<std::mutex> lock(m_mutex); > m_state = true; > m_cond.notify_one(); > } > > void wait() > { > std::unique_lock<std::mutex> lock(m_mutex); > while (m_state == false) m_cond.wait(lock); > m_state = false; // auto-reset > } > }; > > > > > // just a layer over an auto-reset event > struct ct_fast_mutex > { > std::atomic<unsigned int> m_state; > ct_auto_reset_event m_waitset; > > ct_fast_mutex() : m_state(0) {} > > void lock() > { > if (m_state.exchange(1, std::memory_order_acquire)) > { > while (m_state.exchange(2, std::memory_order_acquire)) > { > m_waitset.wait(); > } > } > } > > void unlock() > { > if (m_state.exchange(0, std::memory_order_release) == 2) > { > m_waitset.signal(); > } > } > }; > > > > // Chris M. Thomassons Experimental Read/Write Mutex > // Yeah, it is pretty damn fat wrt the state, however > // it has some interesting properties... > // The state can be compressed a bit... > // btw, it has no loops... > // Take a look at the lock_shared and unlock_shared functions > > // The number of readers _must_ never exceed LONG_MAX! > > #define RWMUTEX_COUNT_MAX LONG_MAX > > struct ct_rwmutex > { > unsigned char m_crude_cache_pad_0[CRUDE_CACHE_PAD]; > > // shared state > std::atomic<long> m_count; > unsigned char m_crude_cache_pad_1[CRUDE_CACHE_PAD]; > > std::atomic<long> m_rdwake; > unsigned char m_crude_cache_pad_2[CRUDE_CACHE_PAD]; > > ct_slow_semaphore m_rdwset; > unsigned char m_crude_cache_pad_3[CRUDE_CACHE_PAD]; > > ct_slow_semaphore m_wrwset; > unsigned char m_crude_cache_pad_4[CRUDE_CACHE_PAD]; > > ct_fast_mutex m_wrlock; > unsigned char m_crude_cache_pad_5[CRUDE_CACHE_PAD]; > > > ct_rwmutex() : > m_count(RWMUTEX_COUNT_MAX), > m_rdwake(0), > m_rdwset(0), > m_wrwset(0) { > } > > > // READ, pretty slim... > void lock_shared() > { > if (m_count.fetch_add(-1, std::memory_order_acquire) < 1) > { > // We need to wait for a writer. > m_rdwset.dec(); > } > } > > void unlock_shared() > { > if (m_count.fetch_add(1, std::memory_order_release) < 0) > { > // There is a writer > if (m_rdwake.fetch_add(-1, std::memory_order_acq_rel) == 1) > { > // We need to wake the writer up > m_wrwset.inc(); > } > } > } > > > // WRITE, more hefty > void lock() > { > // Acquire exclusive access > m_wrlock.lock(); > > // we are the only thread in here now. > > // Gain write access wrt m_count > long count = m_count.fetch_add(-RWMUTEX_COUNT_MAX, > std::memory_order_acquire); > > // count can never be negative. > if (count < RWMUTEX_COUNT_MAX) > { > // We detected readers. > long rdwake = m_rdwake.fetch_add(RWMUTEX_COUNT_MAX - count, > std::memory_order_acquire); > > if (rdwake + RWMUTEX_COUNT_MAX - count) > { > // Okay, we need to wait for all of the readers > // The number of readers is actually > // RWMUTEX_COUNT_MAX - count > m_wrwset.dec(); > } > } > } > > // write unlock > void unlock() > { > // Release write access wrt m_count > long count = m_count.fetch_add(RWMUTEX_COUNT_MAX, > std::memory_order_release); > > if (count < 0) > { > // We need to wake -count readers. > m_rdwset.add(-count); > } > > // Release exclusive access > m_wrlock.unlock(); > } > }; > > > struct ct_simple_stack > { > struct node > { > node* m_next; > unsigned int m_tid; > > node(unsigned int tid) : m_tid(tid) {} > }; > > node* m_head; > > ct_simple_stack() : m_head(nullptr) {} > > ~ct_simple_stack() > { > ct_simple_stack::node* n = flush(); > > while (n) > { > ct_simple_stack::node* next = n->m_next; > delete n; > n = next; > } > } > > void push(node* n) > { > n->m_next = m_head; > m_head = n; > } > > node* pop() > { > node* n = m_head; > > if (n) > { > m_head = n->m_next; > } > > return n; > } > > > node* flush() > { > node* n = m_head; > m_head = nullptr; > return n; > } > > }; > > > struct ct_shared > { > std::atomic<bool> m_run; > > // protected by m_std_rwmutex > std::uint64_t m_reads; > std::uint64_t m_writes; > > ct_simple_stack m_stack; > ct_simple_stack m_stack_dtor; > > #if defined (CT_TEST_FAST_MUTEX) > ct_rwmutex m_std_rwmutex; > #else > std::shared_mutex m_std_rwmutex; > #endif > > ct_shared() : m_run(true), m_reads(0), m_writes(0) > { > // prime m_stack > for (unsigned int i = 0; i < NODE_PRIME; ++i) > { > m_stack.push(new ct_simple_stack::node(i)); > } > } > }; > > > void ct_thread_reader(ct_shared& shared, std::size_t tidx) > { > std::uint64_t reads = 0; > > while (shared.m_run.load(std::memory_order_relaxed)) > { > shared.m_std_rwmutex.lock_shared(); > > ct_simple_stack::node* n = shared.m_stack.m_head; > > while (n) > { > ct_simple_stack::node* next = n->m_next; > > n = next; > } > > std::this_thread::yield(); > > shared.m_std_rwmutex.unlock_shared(); > > ++reads; > } > > shared.m_std_rwmutex.lock(); > shared.m_reads += reads; > shared.m_std_rwmutex.unlock(); > } > > > void ct_thread_writer(ct_shared& shared, std::size_t tidx) > { > std::uint64_t writes = 0; > > while (shared.m_run.load(std::memory_order_relaxed)) > { > shared.m_std_rwmutex.lock(); > ct_simple_stack::node* n = shared.m_stack.pop(); > shared.m_std_rwmutex.unlock(); > > std::this_thread::yield(); > > shared.m_std_rwmutex.lock(); > > if (n) > { > shared.m_stack.push(n); > } > > shared.m_std_rwmutex.unlock(); > > std::this_thread::yield(); > > ++writes; > } > > shared.m_std_rwmutex.lock(); > shared.m_writes += writes; > shared.m_std_rwmutex.unlock(); > } > > > int main() > { > ct_shared shared; > > std::cout << "Testing: "; > > #if defined (CT_TEST_FAST_MUTEX) > std::cout << "Chris M. Thomasson's Experimental Read/Write Mutex\n\n"; > #else > std::cout << "std::shared_mutex\n\n"; > #endif > > std::cout.flush(); > > { > // Setup our threads > std::size_t cpu_threads_n = std::thread::hardware_concurrency(); > std::size_t threads_n = cpu_threads_n * THREADS; > std::vector<std::thread> threads(threads_n); > > std::size_t writers = threads_n / 2; > std::size_t readers = threads_n - writers; > > std::cout << "___________________________________\n"; > std::cout << "cpu_threads_n = " << cpu_threads_n << "\n"; > std::cout << "threads_n = " << threads_n << "\n"; > std::cout << "writers = " << writers << "\n"; > std::cout << "readers = " << readers << "\n"; > std::cout << "test duration = " << TEST_DURATION_SEC << " seconds\n"; > std::cout << "___________________________________\n\n\n"; > > auto time_start = std::chrono::high_resolution_clock::now(); > > // Create threads > for (std::size_t i = 0; i < threads_n; ++i) > { > if (i < writers) > { > threads[i] = std::thread(ct_thread_writer, std::ref(shared), > i); > } > > else > { > threads[i] = std::thread(ct_thread_reader, std::ref(shared), > i); > } > } > > // Give the test some time to run... > std::chrono::seconds time_duration(TEST_DURATION_SEC); > std::this_thread::sleep_for(time_duration); > shared.m_run.store(false, std::memory_order_relaxed); > > // Join threads > for (std::size_t i = 0; i < threads_n; ++i) > { > threads[i].join(); > } > > // Grab our timing. > auto time_stop = std::chrono::high_resolution_clock::now(); > > std::chrono::duration<double> time_diff = time_stop - time_start; > > std::uint64_t raw_reads = shared.m_reads; > std::uint64_t raw_writes = shared.m_writes; > > std::cout << "___________________________________\n"; > std::cout << "Raw Reads: " << raw_reads << "\n"; > std::cout << "Raw Writes: " << raw_writes << "\n"; > > std::uint64_t reads_per_tick = raw_reads / time_diff.count(); > std::uint64_t writes_per_tick = raw_writes / time_diff.count(); > > std::cout << "reads_per_tick = " << reads_per_tick << "\n"; > std::cout << "writes_per_tick = " << writes_per_tick << "\n"; > > std::cout << "Ticks = " << time_diff.count() << "\n"; > std::cout << "___________________________________\n\n"; > } > > std::cout << "\n\nFin!\n"; > > return 0; > } > ___________________________________
Benchmarked this on 2 systems (3 alternating runs for each mutex): 1. i7-8650U Testing Version 0.1: Chris M. Thomasson's Experimental Read/Write Mutex ___________________________________ cpu_threads_n = 8 threads_n = 64 writers = 32 readers = 32 test duration = 60 seconds ___________________________________ ___________________________________ Raw Reads: 100855 Raw Writes: 2158 reads_per_tick = 1680 writes_per_tick = 35 Ticks = 60.019 ___________________________________ ___________________________________ Raw Reads: 108197 Raw Writes: 2126 reads_per_tick = 1802 writes_per_tick = 35 Ticks = 60.0109 ___________________________________ ___________________________________ Raw Reads: 118111 Raw Writes: 2397 reads_per_tick = 1967 writes_per_tick = 39 Ticks = 60.0193 ___________________________________ Testing Version 0.1: std::shared_mutex ___________________________________ cpu_threads_n = 8 threads_n = 64 writers = 32 readers = 32 test duration = 60 seconds ___________________________________ ___________________________________ Raw Reads: 97667 Raw Writes: 766 reads_per_tick = 1627 writes_per_tick = 12 Ticks = 60.0066 ___________________________________ ___________________________________ Raw Reads: 96409 Raw Writes: 1162 reads_per_tick = 1606 writes_per_tick = 19 Ticks = 60.0094 ___________________________________ ___________________________________ Raw Reads: 95314 Raw Writes: 792 reads_per_tick = 1588 writes_per_tick = 13 Ticks = 60.0055 ___________________________________ 2. 2 x Xeon 6154 Testing Version 0.1: Chris M. Thomasson's Experimental Read/Write Mutex ___________________________________ cpu_threads_n = 72 threads_n = 576 writers = 288 readers = 288 test duration = 60 seconds ___________________________________ ___________________________________ Raw Reads: 181439 Raw Writes: 3116 reads_per_tick = 3014 writes_per_tick = 51 Ticks = 60.1803 ___________________________________ ___________________________________ Raw Reads: 179390 Raw Writes: 3343 reads_per_tick = 2980 writes_per_tick = 55 Ticks = 60.1921 ___________________________________ ___________________________________ Raw Reads: 179058 Raw Writes: 3262 reads_per_tick = 2975 writes_per_tick = 54 Ticks = 60.1714 ___________________________________ Testing Version 0.1: std::shared_mutex ___________________________________ cpu_threads_n = 72 threads_n = 576 writers = 288 readers = 288 test duration = 60 seconds ___________________________________ ___________________________________ Raw Reads: 1061634 Raw Writes: 1385 reads_per_tick = 17657 writes_per_tick = 23 Ticks = 60.1246 ___________________________________ ___________________________________ Raw Reads: 1056793 Raw Writes: 1368 reads_per_tick = 17601 writes_per_tick = 22 Ticks = 60.0405 ___________________________________ ___________________________________ Raw Reads: 1068157 Raw Writes: 1312 reads_per_tick = 17770 writes_per_tick = 21 Ticks = 60.1086 ___________________________________ Now it is your problem to interpret this :) Another interesting data point is time output. On the large system for your mutex: 848.76user 10.83system 1:00.26elapsed 1426%CPU for std mutex: 4238.27user 26.79system 1:00.18elapsed 7086%CPU So whatever your mutex did, it used 5 times less CPU time for that. -- --- You received this message because you are subscribed to the Google Groups "Scalable Synchronization Algorithms" group. To unsubscribe from this group and stop receiving emails from it, send an email to lock-free+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/lock-free/CAEeQi3u32VzC2n5haoDhNrWjQ17EWtYHtyPPR5Tow74xTnSpcw%40mail.gmail.com. For more options, visit https://groups.google.com/d/optout.