ptrendx commented on a change in pull request #16542: Faster GPU NMS operator
URL: https://github.com/apache/incubator-mxnet/pull/16542#discussion_r338838384
 
 

 ##########
 File path: src/operator/contrib/bounding_box.cu
 ##########
 @@ -24,14 +24,701 @@
   * \author Joshua Zhang
   */
 
+#include <cub/cub.cuh>
+
 #include "./bounding_box-inl.cuh"
 #include "./bounding_box-inl.h"
 #include "../elemwise_op_common.h"
 
 namespace mxnet {
 namespace op {
+
+namespace {
+
+using mshadow::Tensor;
+using mshadow::Stream;
+
+template <typename DType>
+struct TempWorkspace {
+  index_t scores_temp_space;
+  DType* scores;
+  index_t scratch_space;
+  uint8_t* scratch;
+  index_t buffer_space;
+  DType* buffer;
+  index_t nms_scratch_space;
+  uint32_t* nms_scratch;
+  index_t indices_temp_spaces;
+  index_t* indices;
+};
+
+inline index_t ceil_div(index_t x, index_t y) {
+  return (x + y - 1) / y;
+}
+
+inline index_t align(index_t x, index_t alignment) {
+  return ceil_div(x, alignment)  * alignment;
+}
+
+template <typename DType>
+__global__ void FilterAndPrepareAuxData_kernel(const DType* data, DType* out, 
DType* scores,
+                                               index_t num_elements_per_batch,
+                                               const index_t element_width,
+                                               const index_t N,
+                                               const float threshold,
+                                               const int id_index, const int 
score_index,
+                                               const int background_id) {
+  index_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+  bool first_in_element = (tid % element_width == 0);
+  index_t start_of_my_element = tid - (tid % element_width);
+
+  if (tid < N) {
+    DType my_score = data[start_of_my_element + score_index];
+    bool filtered_out = my_score <= threshold;
+    if (id_index != -1 && background_id != -1) {
+      DType my_id = data[start_of_my_element + id_index];
+      filtered_out = filtered_out || (my_id == background_id);
+    }
+    if (!filtered_out) {
+      out[tid] = data[tid];
+    } else {
+      out[tid] = -1;
+      my_score = -1;
+    }
+
+    if (first_in_element) {
+      index_t offset = tid / element_width;
+      scores[offset] = my_score;
+    }
+  }
+}
+
+template <typename DType>
+void FilterAndPrepareAuxData(const Tensor<gpu, 3, DType>& data,
+                             Tensor<gpu, 3, DType>* out,
+                             const TempWorkspace<DType>& workspace,
+                             const BoxNMSParam& param,
+                             Stream<gpu>* s) {
+  const int n_threads = 512;
+  index_t N = data.shape_.Size();
+  const auto blocks = ceil_div(N, n_threads);
+  FilterAndPrepareAuxData_kernel<<<blocks,
+                                   n_threads,
+                                   0,
+                                   Stream<gpu>::GetStream(s)>>>(
+    data.dptr_, out->dptr_, workspace.scores,
+    data.shape_[1], data.shape_[2], N,
+    param.valid_thresh, param.id_index,
+    param.score_index, param.background_id);
+}
+
+template <bool check_topk, bool check_score, typename DType>
+__global__ void CompactData_kernel(const index_t* indices, const DType* source,
 
 Review comment:
   Will change.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

Reply via email to