[GitHub] asitstands commented on a change in pull request #10048: [MXNET-68] Random shuffle implementation

GitBox Mon, 12 Mar 2018 09:21:38 -0700

asitstands commented on a change in pull request #10048: [MXNET-68] Random 
shuffle implementation
URL: https://github.com/apache/incubator-mxnet/pull/10048#discussion_r173857649


 ##########
 File path: src/operator/random/shuffle_op.cc
 ##########
 @@ -0,0 +1,134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file shuffle_op.cc
+ * \brief Operator to shuffle elements of an NDArray
+ */
+#if (__GNUC__ > 4 && !defined(__clang__major__)) || (__clang_major__ > 4 && 
__linux__)
+  #define USE_GNU_PARALLEL_SHUFFLE
+#endif
+
+#include <mxnet/operator_util.h>
+#include <algorithm>
+#include <random>
+#include <vector>
+#ifdef USE_GNU_PARALLEL_SHUFFLE
+  #include <parallel/algorithm>
+#endif
+#include "../elemwise_op_common.h"
+
+namespace mxnet {
+namespace op {
+
+namespace {
+
+template<typename DType, typename Rand>
+void Shuffle1D(DType* const out, const index_t size, Rand* const prnd) {
+  #ifdef USE_GNU_PARALLEL_SHUFFLE
+    auto rand_n = [prnd](index_t n) {
+      std::uniform_int_distribution<index_t> dist(0, n - 1);
+      return dist(*prnd);
+    };
+    __gnu_parallel::random_shuffle(out, out + size, rand_n);
+  #else
+    std::shuffle(out, out + size, *prnd);
+  #endif
+}
+
+template<typename DType, typename Rand>
+void ShuffleND(DType* const out, const index_t size, const index_t 
first_axis_len,
+                Rand* const prnd) {
+  // Fisher-Yates shuffling
+  const index_t stride = size / first_axis_len;
+  auto rand_n = [prnd](index_t n) {
+    std::uniform_int_distribution<index_t> dist(0, n - 1);
+    return dist(*prnd);
+  };
+  CHECK_GT(first_axis_len, 0U);
+  for (index_t i = first_axis_len - 1; i > 0; --i) {
+    const index_t j = rand_n(i + 1);
+    if (i != j) {
+      std::swap_ranges(out + stride * i, out + stride * (i + 1), out + stride 
* j);
 
 Review comment:
   I guess that the optimization may be not trivial. Anyway here are some tests 
with a very naive parallelization with openmp. It simply splits the ranges to 
swap into multiple ranges and gives each piece to an openmp thread. Multiple 
threads benefits arrays with large number of elements per row when they run on 
two Xeon E5-2680 CPUs, but there is no gain when run on single i7-7700. For 
small arrays, multiple threads very poorly performs in either CPUs. There could 
be more sophisticated optimizations for this kind of memory IO, but I have no 
idea.
   
   Test with Xeon E5-2680 two CPUs.
   
   ```
   # ./a.out num_rows num_cols num_repeats num_threads
   # measures the running time of the two implementations.
   
   > ./a.out 100 10000000 10 4
   multi  : 3861601 ms
   single : 9080845 ms
   
   > ./a.out 100 1000000 10 4
   multi  : 338396 ms
   single : 861971 ms
   
   > ./a.out 100 100000 10 4
   multi  : 21387 ms
   single : 57533 ms
   
   > ./a.out 100 10000 10 4
   multi  : 6956 ms
   single : 4073 ms
   
   > ./a.out 100 1000 10 4
   multi  : 5886 ms
   single : 597 ms
   
   > ./a.out 100 100 10 4
   multi  : 4606 ms
   single : 139 ms
   ```
   
   Test with i7-7700.
   
   ```
   
   > ./a.out 100 10000000 10 4
   multi  : 10015002 ms
   single : 9327057 ms
   
   > ./a.out 100 1000000 10 4
   multi  : 969582 ms
   single : 918764 ms
   
   > ./a.out 100 100000 10 4
   multi  : 77717 ms
   single : 75001 ms
   
   > ./a.out 100 10000 10 4
   multi  : 1850 ms
   single : 2016 ms
   
   > ./a.out 100 1000 10 4
   multi  : 1911 ms
   single : 209 ms
   
   > ./a.out 100 10000000 10 2
   multi  : 9478994 ms
   single : 9451969 ms
   
   > ./a.out 100 1000000 10 2
   multi  : 936728 ms
   single : 918129 ms
   
   > ./a.out 100 100000 10 2
   multi  : 75222 ms
   single : 75331 ms
   
   > ./a.out 100 10000 10 2
   multi  : 1885 ms
   single : 1953 ms
   
   > ./a.out 100 1000 10 2
   multi  : 1425 ms
   single : 204 ms
   ```
   Here is the code.
   
   ```c++
   #include <iostream>
   #include <algorithm>
   #include <random>
   #include <chrono>
   
   using index_t = unsigned int;
   
   // The current implementation
   template<typename DType, typename Rand>
   void ShuffleND(DType* const out, const index_t size,
                  const index_t first_axis_len, Rand* const prnd) {
     const index_t stride = size / first_axis_len;
     auto rand_n = [prnd](index_t n) {
       std::uniform_int_distribution<index_t> dist(0, n - 1);
       return dist(*prnd);
     };
     for (index_t i = first_axis_len - 1; i > 0; --i) {
       const index_t j = rand_n(i + 1);
       if (i != j) {
         std::swap_ranges(out + stride * i, out + stride * (i + 1), out + 
stride * j);
       }
     }
   }
   
   // Naive parallelization with openmp
   template<typename DType, typename Rand>
   void ShuffleND_M(const unsigned int n_threads, DType* const out, const 
index_t size,
                    const index_t first_axis_len, Rand* const prnd) {
     const index_t stride = size / first_axis_len;
     auto rand_n = [prnd](index_t n) {
       std::uniform_int_distribution<index_t> dist(0, n - 1);
       return dist(*prnd);
     };
     for (index_t i = first_axis_len - 1; i > 0; --i) {
       const index_t j = rand_n(i + 1);
       if (i != j) {
         // This loop is different from the current implementation.
         #pragma omp parallel for num_threads(n_threads)
         for(unsigned int k = 0; k < n_threads; ++k) {
           std::swap_ranges(out + stride * i + k * stride / n_threads,
                            out + stride * i + (k + 1) * stride / n_threads,
                            out + stride * j + k * stride / n_threads);
         }
       }
     }
   }
   
   int main(int argc, char* argv[]) {
     using namespace std;
     using namespace std::chrono;
   
     const size_t n_rows = stol(argv[1]);
     const size_t n_cols = stol(argv[2]);
     const size_t n_repeats = stol(argv[3]);
     const unsigned int n_threads = stol(argv[4]);
   
     vector<float> vec(n_rows * n_cols);
     iota(vec.begin(), vec.end(), 0);
     mt19937 rnd((random_device())());
   
     high_resolution_clock::time_point t1;
     high_resolution_clock::time_point t2;
   
     t1 = high_resolution_clock::now();
     for(unsigned int i = 0; i < n_repeats; ++i) {
       ShuffleND_M(n_threads, vec.data(), vec.size(), n_rows, &rnd);
     }
     t2 = high_resolution_clock::now();
     cout << "multi  : " << duration_cast<microseconds>(t2 - t1).count() << " 
ms" << endl;
   
     t1 = high_resolution_clock::now();
     for(unsigned int i = 0; i < n_repeats; ++i) {
       ShuffleND(vec.data(), vec.size(), n_rows, &rnd);
     }
     t2 = high_resolution_clock::now();
     cout << "single : " << duration_cast<microseconds>(t2 - t1).count() << " 
ms" << endl;
   
     return 0;
   }
   ```

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

[GitHub] asitstands commented on a change in pull request #10048: [MXNET-68] Random shuffle implementation

Reply via email to