[arrow] branch master updated: ARROW-4234: [C++] Improve memory bandwidth test

apitrou Wed, 23 Jan 2019 07:31:13 -0800

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git



The following commit(s) were added to refs/heads/master by this push:
     new d5fe8e5  ARROW-4234: [C++] Improve memory bandwidth test
d5fe8e5 is described below

commit d5fe8e5c6789e1eac484d6a6d4d8c487ec89e126
Author: François Saint-Jacques <[email protected]>
AuthorDate: Wed Jan 23 16:30:29 2019 +0100

    ARROW-4234: [C++] Improve memory bandwidth test
    
    - Kept the existing memcopy benchmark, but made the number of threads a benchmark variable.
    - Added 3 explicit pure bandwidth tests: Read, Write, ReadWrite.
    
    Author: François Saint-Jacques <[email protected]>
    
    Closes #3378 from fsaintjacques/ARROW-4234-memory-bandwidth and squashes the following commits:
    
    576e1161 <François Saint-Jacques> Cast
    16807177 <François Saint-Jacques> Make clang happy, take 2
    7495f2af <François Saint-Jacques> Fix for windows and CLANG warning
    58064ecb <François Saint-Jacques> Fix lint issues.
    9a15cc55 <François Saint-Jacques> Add Read/Write/ReadWrite test
    d8c0b491 <François Saint-Jacques> ARROW-4234:  Improve memory bandwidth test
---
 cpp/src/arrow/io/memory-benchmark.cc | 99 +++++++++++++++++++++++++++---------
 1 file changed, 75 insertions(+), 24 deletions(-)

diff --git a/cpp/src/arrow/io/memory-benchmark.cc b/cpp/src/arrow/io/memory-benchmark.cc
index 72a5dc8..b36be4d 100644
--- a/cpp/src/arrow/io/memory-benchmark.cc
+++ b/cpp/src/arrow/io/memory-benchmark.cc
@@ -15,50 +15,101 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#ifdef _MSC_VER
+#include <intrin.h>
+#else
+#include <immintrin.h>
+#endif
+
+#include <iostream>
+
 #include "arrow/api.h"
 #include "arrow/io/memory.h"
 #include "arrow/test-util.h"
+#include "arrow/util/cpu-info.h"
 
 #include "benchmark/benchmark.h"
 
-#include <iostream>
-
 namespace arrow {
 
-static void BM_SerialMemcopy(benchmark::State& state) {  // NOLINT non-const reference
-  constexpr int64_t kTotalSize = 100 * 1024 * 1024;      // 100MB
+static const int kNumCores = internal::CpuInfo::GetInstance()->num_cores();
+constexpr size_t kMemoryPerCore = 32 * 1024 * 1024;
+using BufferPtr = std::shared_ptr<Buffer>;
+
+using VectorType = __m128i;
+
+// See http://codearcana.com/posts/2013/05/18/achieving-maximum-memory-bandwidth.html
+// for the usage of stream loads/writes. Or section 6.1, page 47 of
+// https://akkadia.org/drepper/cpumemory.pdf .
+
+static void Read(void* src, void* dst, size_t size) {
+  auto simd = static_cast<VectorType*>(src);
+  (void)dst;
+
+  for (size_t i = 0; i < size / sizeof(VectorType); i++)
+    benchmark::DoNotOptimize(_mm_stream_load_si128(&simd[i]));
+}
+
+static void Write(void* src, void* dst, size_t size) {
+  auto simd = static_cast<VectorType*>(dst);
+  const VectorType ones = _mm_set1_epi32(1);
+  (void)src;
 
-  std::shared_ptr<Buffer> buffer1, buffer2;
-  ABORT_NOT_OK(AllocateBuffer(kTotalSize, &buffer1));
-  ABORT_NOT_OK(AllocateBuffer(kTotalSize, &buffer2));
-  random_bytes(kTotalSize, 0, buffer2->mutable_data());
+  for (size_t i = 0; i < size / sizeof(VectorType); i++) _mm_stream_si128(&simd[i], ones);
+}
+
+static void ReadWrite(void* src, void* dst, size_t size) {
+  auto src_simd = static_cast<VectorType*>(src);
+  auto dst_simd = static_cast<VectorType*>(dst);
+
+  for (size_t i = 0; i < size / sizeof(VectorType); i++)
+    _mm_stream_si128(&dst_simd[i], _mm_stream_load_si128(&src_simd[i]));
+}
+
+using ApplyFn = decltype(Read);
+
+template <ApplyFn Apply>
+static void MemoryBandwidth(benchmark::State& state) {  // NOLINT non-const reference
+  const size_t buffer_size = kMemoryPerCore;
+  BufferPtr src, dst;
+
+  ABORT_NOT_OK(AllocateBuffer(buffer_size, &src));
+  ABORT_NOT_OK(AllocateBuffer(buffer_size, &dst));
+  random_bytes(buffer_size, 0, src->mutable_data());
 
   while (state.KeepRunning()) {
-    io::FixedSizeBufferWriter writer(buffer1);
-    ABORT_NOT_OK(writer.Write(buffer2->data(), buffer2->size()));
+    Apply(src->mutable_data(), dst->mutable_data(), buffer_size);
   }
-  state.SetBytesProcessed(int64_t(state.iterations()) * kTotalSize);
+
+  state.SetBytesProcessed(state.iterations() * buffer_size);
 }
 
-static void BM_ParallelMemcopy(benchmark::State& state) {  // NOLINT non-const reference
-  constexpr int64_t kTotalSize = 100 * 1024 * 1024;        // 100MB
+// `UseRealTime` is required due to threads, otherwise the cumulative CPU time
+// is used which will skew the results by the number of threads.
+BENCHMARK_TEMPLATE(MemoryBandwidth, Read)->ThreadRange(1, kNumCores)->UseRealTime();
+BENCHMARK_TEMPLATE(MemoryBandwidth, Write)->ThreadRange(1, kNumCores)->UseRealTime();
+BENCHMARK_TEMPLATE(MemoryBandwidth, ReadWrite)->ThreadRange(1, kNumCores)->UseRealTime();
+
+static void ParallelMemoryCopy(benchmark::State& state) {  // NOLINT non-const reference
+  const int64_t n_threads = state.range(0);
+  const int64_t buffer_size = kMemoryPerCore;
 
-  std::shared_ptr<Buffer> buffer1, buffer2;
-  ABORT_NOT_OK(AllocateBuffer(kTotalSize, &buffer1));
-  ABORT_NOT_OK(AllocateBuffer(kTotalSize, &buffer2));
+  std::shared_ptr<Buffer> src, dst;
+  ABORT_NOT_OK(AllocateBuffer(buffer_size, &src));
+  ABORT_NOT_OK(AllocateBuffer(buffer_size, &dst));
 
-  random_bytes(kTotalSize, 0, buffer2->mutable_data());
+  random_bytes(buffer_size, 0, src->mutable_data());
 
   while (state.KeepRunning()) {
-    io::FixedSizeBufferWriter writer(buffer1);
-    writer.set_memcopy_threads(4);
-    ABORT_NOT_OK(writer.Write(buffer2->data(), buffer2->size()));
+    io::FixedSizeBufferWriter writer(dst);
+    writer.set_memcopy_threads(static_cast<int>(n_threads));
+    ABORT_NOT_OK(writer.Write(src->data(), src->size()));
   }
-  state.SetBytesProcessed(int64_t(state.iterations()) * kTotalSize);
-}
 
-BENCHMARK(BM_SerialMemcopy)->MinTime(1.0)->Repetitions(2)->UseRealTime();
+  state.SetBytesProcessed(int64_t(state.iterations()) * buffer_size);
+  state.counters["threads"] = static_cast<double>(n_threads);
+}
 
-BENCHMARK(BM_ParallelMemcopy)->MinTime(1.0)->Repetitions(2)->UseRealTime();
+BENCHMARK(ParallelMemoryCopy)->RangeMultiplier(2)->Range(1, kNumCores)->UseRealTime();
 
 }  // namespace arrow

[arrow] branch master updated: ARROW-4234: [C++] Improve memory bandwidth test

Reply via email to