This is an automated email from the ASF dual-hosted git repository.

paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/sedona-db.git


The following commit(s) were added to refs/heads/main by this push:
     new 346ab2ca feat(c/sedona-libgpuspatial): Fix synchronization bugs and 
improve logging (#721)
346ab2ca is described below

commit 346ab2cacd85f5d007fba68d77af2faaa2728c76
Author: Liang Geng <[email protected]>
AuthorDate: Wed Mar 18 12:15:13 2026 +0800

    feat(c/sedona-libgpuspatial): Fix synchronization bugs and improve logging 
(#721)
---
 .../libgpuspatial/CMakeLists.txt                   |   5 +
 .../include/gpuspatial/geom/polygon.hpp            |   1 +
 .../include/gpuspatial/index/rt_spatial_index.cuh  |  10 +-
 .../gpuspatial/refine/rt_spatial_refiner.cuh       |   9 +-
 .../include/gpuspatial/utils/gpu_timer.hpp         |  57 ---------
 .../include/gpuspatial/utils/stopwatch.hpp         |   5 +-
 .../libgpuspatial/src/memory_manager.cc            |  17 ++-
 .../libgpuspatial/src/relate_engine.cu             |  19 +--
 .../libgpuspatial/src/rt_spatial_index.cu          | 129 ++++++++++++---------
 .../libgpuspatial/src/rt_spatial_refiner.cu        |  77 +++++++++---
 10 files changed, 176 insertions(+), 153 deletions(-)

diff --git a/c/sedona-libgpuspatial/libgpuspatial/CMakeLists.txt 
b/c/sedona-libgpuspatial/libgpuspatial/CMakeLists.txt
index eab27248..65e554c3 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/CMakeLists.txt
+++ b/c/sedona-libgpuspatial/libgpuspatial/CMakeLists.txt
@@ -17,6 +17,7 @@
 cmake_minimum_required(VERSION 3.30.4)
 
 option(GPUSPATIAL_BUILD_TESTS "Build tests" OFF)
+option(GPUSPATIAL_PROFILING "Enable profiling" OFF)
 
 # This must be set before project() to be picked up by vcpkg
 if(GPUSPATIAL_BUILD_TESTS)
@@ -163,6 +164,10 @@ target_compile_options(gpuspatial
                                
-DGPUSPATIAL_LOG_ACTIVE_LEVEL=RAPIDS_LOGGER_LOG_LEVEL_${LIBGPUSPATIAL_LOGGING_LEVEL}
 )
 
+if(GPUSPATIAL_PROFILING)
+  target_compile_definitions(gpuspatial PUBLIC GPUSPATIAL_PROFILING)
+endif()
+
 add_library(gpuspatial_c src/gpuspatial_c.cc)
 target_link_libraries(gpuspatial_c PUBLIC gpuspatial)
 
diff --git 
a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/polygon.hpp 
b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/polygon.hpp
index e457a8fb..076ced2c 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/polygon.hpp
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/polygon.hpp
@@ -26,6 +26,7 @@
 #include <cub/warp/warp_reduce.cuh>
 
 #include <thrust/binary_search.h>
+#include <thrust/distance.h>
 
 namespace gpuspatial {
 
diff --git 
a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/rt_spatial_index.cuh
 
b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/rt_spatial_index.cuh
index baaeb77f..aa993c86 100644
--- 
a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/rt_spatial_index.cuh
+++ 
b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/rt_spatial_index.cuh
@@ -19,13 +19,11 @@
 #include "gpuspatial/index/rt_spatial_index.hpp"
 #include "gpuspatial/index/spatial_index.hpp"
 #include "gpuspatial/rt/rt_engine.hpp"
-#include "gpuspatial/utils/gpu_timer.hpp"
 #include "gpuspatial/utils/queue.hpp"
 
 #include "rmm/cuda_stream_pool.hpp"
 #include "rmm/cuda_stream_view.hpp"
 #include "rmm/device_uvector.hpp"
-#define GPUSPATIAL_PROFILING
 namespace gpuspatial {
 
 /** * @brief A spatial index implementation using NVIDIA OptiX ray tracing 
engine.
@@ -57,10 +55,9 @@ class RTSpatialIndex : public SpatialIndex<SCALAR_T, N_DIM> {
     Queue<index_t> build_indices;
     rmm::device_uvector<index_t> probe_indices{0, rmm::cuda_stream_default};
 #ifdef GPUSPATIAL_PROFILING
-    GPUTimer timer;
     // counters
     double alloc_ms = 0.0;
-    double bvh_build_ms = 0.0;
+    double prepare_ms = 0.0;
     double rt_ms = 0.0;
     double copy_res_ms = 0.0;
 #endif
@@ -92,7 +89,10 @@ class RTSpatialIndex : public SpatialIndex<SCALAR_T, N_DIM> {
   rmm::device_uvector<point_t> points_{0, rmm::cuda_stream_default};
   rmm::device_buffer bvh_buffer_{0, rmm::cuda_stream_default};
   OptixTraversableHandle handle_;
-
+#ifdef GPUSPATIAL_PROFILING
+  double push_build_ms_ = 0.0f;
+  double finish_building_ms_ = 0.0f;
+#endif
   void allocateResultBuffer(SpatialIndexContext& ctx, uint32_t capacity) const;
 
   void handleBuildPoint(SpatialIndexContext& ctx, ArrayView<point_t> points,
diff --git 
a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/refine/rt_spatial_refiner.cuh
 
b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/refine/rt_spatial_refiner.cuh
index 9d6d9d37..c89a442f 100644
--- 
a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/refine/rt_spatial_refiner.cuh
+++ 
b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/refine/rt_spatial_refiner.cuh
@@ -23,7 +23,6 @@
 #include "gpuspatial/refine/spatial_refiner.hpp"
 #include "gpuspatial/relate/relate_engine.cuh"
 #include "gpuspatial/rt/rt_engine.hpp"
-#include "gpuspatial/utils/gpu_timer.hpp"
 #include "gpuspatial/utils/thread_pool.hpp"
 
 #include "geoarrow/geoarrow_type.h"
@@ -72,10 +71,7 @@ class RTSpatialRefiner : public SpatialRefiner {
   struct SpatialRefinerContext {
     rmm::cuda_stream_view cuda_stream;
 #ifdef GPUSPATIAL_PROFILING
-    GPUTimer timer;
-    // counters
     double parse_ms = 0.0;
-    double alloc_ms = 0.0;
     double refine_ms = 0.0;
     double copy_res_ms = 0.0;
 #endif
@@ -107,7 +103,10 @@ class RTSpatialRefiner : public SpatialRefiner {
   std::shared_ptr<ThreadPool> thread_pool_;
   std::unique_ptr<ParallelWkbLoader<point_t, index_t>> wkb_loader_;
   dev_geometries_t build_geometries_;
-
+#ifdef GPUSPATIAL_PROFILING
+  double push_build_ms_ = 0.0f;
+  double finish_building_ms_ = 0.0f;
+#endif
   template <typename INDEX_IT>
   void buildIndicesMap(rmm::cuda_stream_view stream, INDEX_IT index_begin,
                        INDEX_IT index_end, IndicesMap& indices_map) const;
diff --git 
a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/gpu_timer.hpp 
b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/gpu_timer.hpp
deleted file mode 100644
index 1cec9359..00000000
--- 
a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/gpu_timer.hpp
+++ /dev/null
@@ -1,57 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-#pragma once
-#include "gpuspatial/utils/exception.hpp"
-
-#include <cuda_runtime.h>
-namespace gpuspatial {
-// A simple utility class for timing CUDA kernels.
-class GPUTimer {
- public:
-  // Constructor creates the start and stop events.
-  GPUTimer() {
-    CUDA_CHECK(cudaEventCreate(&start_event));
-    CUDA_CHECK(cudaEventCreate(&stop_event));
-  }
-
-  // Destructor destroys the events.
-  ~GPUTimer() {
-    CUDA_CHECK(cudaEventDestroy(start_event));
-    CUDA_CHECK(cudaEventDestroy(stop_event));
-  }
-
-  // Records the start event in the specified stream.
-  void start(cudaStream_t stream = 0) {
-    CUDA_CHECK(cudaEventRecord(start_event, stream));
-  }
-
-  // Records the stop event and returns the elapsed time in milliseconds.
-  float stop(cudaStream_t stream = 0) {
-    CUDA_CHECK(cudaEventRecord(stop_event, stream));
-    float elapsed_time_ms = 0.0f;
-    // The following call will block the CPU thread until the stop event has 
been
-    // recorded.
-    CUDA_CHECK(cudaEventSynchronize(stop_event));
-    CUDA_CHECK(cudaEventElapsedTime(&elapsed_time_ms, start_event, 
stop_event));
-    return elapsed_time_ms;
-  }
-
- private:
-  cudaEvent_t start_event;
-  cudaEvent_t stop_event;
-};
-}  // namespace gpuspatial
diff --git 
a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/stopwatch.hpp 
b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/stopwatch.hpp
index 822fa92d..d4bd88d9 100644
--- 
a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/stopwatch.hpp
+++ 
b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/stopwatch.hpp
@@ -29,7 +29,10 @@ class Stopwatch {
   }
 
   void start() { t2 = t1 = std::chrono::high_resolution_clock::now(); }
-  void stop() { t2 = std::chrono::high_resolution_clock::now(); }
+  double stop() {
+    t2 = std::chrono::high_resolution_clock::now();
+    return ms();
+  }
 
   double ms() const {
     return std::chrono::duration_cast<std::chrono::microseconds>(t2 - 
t1).count() /
diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/memory_manager.cc 
b/c/sedona-libgpuspatial/libgpuspatial/src/memory_manager.cc
index fdf66e70..c528d0c3 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/src/memory_manager.cc
+++ b/c/sedona-libgpuspatial/libgpuspatial/src/memory_manager.cc
@@ -17,7 +17,6 @@
 
 #include "gpuspatial/mem/memory_manager.hpp"
 #include "gpuspatial/utils/logger.hpp"
-
 #if defined(_WIN32)
 #include <windows.h>
 #elif defined(__linux__)
@@ -25,6 +24,9 @@
 #else  // POSIX (BSD, Solaris, etc.)
 #include <unistd.h>
 #endif
+
+#include <mutex>
+
 namespace gpuspatial {
 namespace detail {
 inline long long get_free_physical_memory() {
@@ -60,13 +62,17 @@ inline long long get_free_physical_memory() {
 }  // namespace detail
 
 MemoryManager& MemoryManager::instance() {
-  static MemoryManager instance;
-  return instance;
+  // Use a heap allocation to bypass automatic static destruction.
+  // This prevents the destructor from running after RMM has already been torn 
down.
+  // This is an intentional memory leak.
+  static MemoryManager* instance = new MemoryManager();
+  return *instance;
 }
 
 MemoryManager::~MemoryManager() { Shutdown(); }
 
 void MemoryManager::Shutdown() {
+  GPUSPATIAL_LOG_INFO("Shutdown MemoryManager and releasing all resources.");
   if (is_initialized_) {
     rmm::mr::set_current_device_resource(nullptr);
     active_resource_.reset();
@@ -78,12 +84,15 @@ void MemoryManager::Shutdown() {
 }
 
 void MemoryManager::Init(bool use_pool, int init_pool_precent) {
+  static std::mutex init_mtx;
+  std::lock_guard<std::mutex> lock(init_mtx);
+
   if (is_initialized_) {
     GPUSPATIAL_LOG_WARN(
         "MemoryManager is already initialized. Skipping re-initialization.");
     return;
   }
-
+  GPUSPATIAL_LOG_INFO("Init Memory Manager");
   cuda_mr_ = std::make_unique<CudaMR>();
   use_pool_ = use_pool;
 
diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/relate_engine.cu 
b/c/sedona-libgpuspatial/libgpuspatial/src/relate_engine.cu
index db081da2..cc20d51a 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/src/relate_engine.cu
+++ b/c/sedona-libgpuspatial/libgpuspatial/src/relate_engine.cu
@@ -25,8 +25,8 @@
 #include "gpuspatial/utils/logger.hpp"
 #include "rt/shaders/shader_id.hpp"
 
-#include <rmm/mr/device/pool_memory_resource.hpp>
-#include <rmm/mr/device/tracking_resource_adaptor.hpp>
+#include "rmm/mr/pool_memory_resource.hpp"
+#include "rmm/mr/tracking_resource_adaptor.hpp"
 #include "rmm/cuda_stream_view.hpp"
 #include "rmm/device_scalar.hpp"
 #include "rmm/exec_policy.hpp"
@@ -34,6 +34,7 @@
 #include <thrust/remove.h>
 #include <thrust/sort.h>
 #include <thrust/unique.h>
+#include <cuda/std/iterator>
 
 namespace gpuspatial {
 namespace detail {
@@ -328,7 +329,7 @@ void RelateEngine<POINT_T, INDEX_T>::Evaluate(const 
rmm::cuda_stream_view& strea
                           auto IM = relate(geom1, geom2);
                           return !detail::EvaluatePredicate(predicate, IM);
                         });
-  size_t new_size = thrust::distance(zip_begin, end);
+  size_t new_size = cuda::std::distance(zip_begin, end);
   ids1.resize(new_size, stream);
   ids2.resize(new_size, stream);
 }
@@ -459,7 +460,7 @@ void RelateEngine<POINT_T, INDEX_T>::EvaluateImpl(
   // Collect uniq polygon ids to estimate total BVH memory usage
   auto uniq_poly_ids_end = thrust::unique(rmm::exec_policy_nosync(stream),
                                           uniq_poly_ids.begin(), 
uniq_poly_ids.end());
-  uniq_poly_ids.resize(thrust::distance(uniq_poly_ids.begin(), 
uniq_poly_ids_end),
+  uniq_poly_ids.resize(cuda::std::distance(uniq_poly_ids.begin(), 
uniq_poly_ids_end),
                        stream);
   uniq_poly_ids.shrink_to_fit(stream);
 
@@ -488,7 +489,7 @@ void RelateEngine<POINT_T, INDEX_T>::EvaluateImpl(
     uniq_poly_ids_end = thrust::unique(rmm::exec_policy_nosync(stream),
                                        uniq_poly_ids.begin(), 
uniq_poly_ids.end());
 
-    uniq_poly_ids.resize(thrust::distance(uniq_poly_ids.begin(), 
uniq_poly_ids_end),
+    uniq_poly_ids.resize(cuda::std::distance(uniq_poly_ids.begin(), 
uniq_poly_ids_end),
                          stream);
     uniq_poly_ids.shrink_to_fit(stream);
 
@@ -552,7 +553,7 @@ void RelateEngine<POINT_T, INDEX_T>::EvaluateImpl(
                                [=] __device__(const thrust::tuple<INDEX_T, 
INDEX_T>& tu) {
                                  return tu == invalid_tuple;
                                });
-  size_t new_size = thrust::distance(zip_begin, end);
+  size_t new_size = cuda::std::distance(zip_begin, end);
   point_ids.resize(new_size, stream);
   poly_ids.resize(new_size, stream);
 }
@@ -601,7 +602,7 @@ void RelateEngine<POINT_T, INDEX_T>::EvaluateImpl(
       thrust::unique(rmm::exec_policy_nosync(stream), 
uniq_multi_poly_ids.begin(),
                      uniq_multi_poly_ids.end());
   uniq_multi_poly_ids.resize(
-      thrust::distance(uniq_multi_poly_ids.begin(), uniq_multi_poly_ids_end), 
stream);
+      cuda::std::distance(uniq_multi_poly_ids.begin(), 
uniq_multi_poly_ids_end), stream);
   uniq_multi_poly_ids.shrink_to_fit(stream);
 
   auto bvh_bytes =
@@ -632,7 +633,7 @@ void RelateEngine<POINT_T, INDEX_T>::EvaluateImpl(
         thrust::unique(rmm::exec_policy_nosync(stream), 
uniq_multi_poly_ids.begin(),
                        uniq_multi_poly_ids.end());
     uniq_multi_poly_ids.resize(
-        thrust::distance(uniq_multi_poly_ids.begin(), 
uniq_multi_poly_ids_end), stream);
+        cuda::std::distance(uniq_multi_poly_ids.begin(), 
uniq_multi_poly_ids_end), stream);
     uniq_multi_poly_ids.shrink_to_fit(stream);
 
     rmm::device_uvector<int> IMs(ids_size_batch, stream);
@@ -699,7 +700,7 @@ void RelateEngine<POINT_T, INDEX_T>::EvaluateImpl(
                                [=] __device__(const thrust::tuple<INDEX_T, 
INDEX_T>& tu) {
                                  return tu == invalid_tuple;
                                });
-  size_t new_size = thrust::distance(zip_begin, end);
+  size_t new_size = cuda::std::distance(zip_begin, end);
   point_ids.resize(new_size, stream);
   multi_poly_ids.resize(new_size, stream);
 }
diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/rt_spatial_index.cu 
b/c/sedona-libgpuspatial/libgpuspatial/src/rt_spatial_index.cu
index 9f76af49..30d3f5e9 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/src/rt_spatial_index.cu
+++ b/c/sedona-libgpuspatial/libgpuspatial/src/rt_spatial_index.cu
@@ -30,6 +30,7 @@
 #include <thrust/sequence.h>
 #include <thrust/sort.h>
 #include <thrust/unique.h>
+#include <cuda/std/iterator>
 
 #define OPTIX_MAX_RAYS (1lu << 30)
 
@@ -42,7 +43,7 @@ static rmm::device_uvector<OptixAabb> 
ComputeAABBs(rmm::cuda_stream_view stream,
   rmm::device_uvector<OptixAabb> aabbs(mbrs.size(), stream);
 
   thrust::transform(rmm::exec_policy_nosync(stream), mbrs.begin(), mbrs.end(),
-                    aabbs.begin(), [] __device__(const Box<POINT_T>& mbr) {
+                    aabbs.begin(), [] __device__(const Box<POINT_T>& mbr) -> 
OptixAabb {
                       // handle empty boxes
                       if (mbr.get_min().empty() || mbr.get_max().empty()) {
                         // empty box
@@ -88,7 +89,7 @@ rmm::device_uvector<OptixAabb> ComputeAABBs(
   rmm::device_uvector<uint32_t> morton_codes(np, stream);
   // compute morton codes and reorder indices
   thrust::transform(rmm::exec_policy_nosync(stream), points.begin(), 
points.end(),
-                    morton_codes.begin(), [=] __device__(const POINT_T& p) {
+                    morton_codes.begin(), [=] __device__(const POINT_T& p) -> 
uint32_t {
                       POINT_T norm_p;
 
                       for (int dim = 0; dim < n_dim; dim++) {
@@ -271,6 +272,10 @@ void RTSpatialIndex<SCALAR_T, N_DIM>::Clear() {
   GPUSPATIAL_LOG_INFO("RTSpatialIndex %p (Free %zu MB), Clear", this,
                       rmm::available_device_memory().first / 1024 / 1024);
   auto stream = rmm::cuda_stream_default;
+#ifdef GPUSPATIAL_PROFILING
+  push_build_ms_ = 0.0;
+  finish_building_ms_ = 0.0;
+#endif
   bvh_buffer_.resize(0, stream);
   bvh_buffer_.shrink_to_fit(stream);
   rects_.resize(0, stream);
@@ -282,36 +287,46 @@ void RTSpatialIndex<SCALAR_T, N_DIM>::Clear() {
 
 template <typename SCALAR_T, int N_DIM>
 void RTSpatialIndex<SCALAR_T, N_DIM>::PushBuild(const box_t* rects, uint32_t 
n_rects) {
-  GPUSPATIAL_LOG_INFO("RTSpatialIndex %p (Free %zu MB), PushBuild, rectangles 
%zu", this,
+  GPUSPATIAL_LOG_INFO("RTSpatialIndex %p (Free %zu MB), PushBuild, Rectangles 
%zu", this,
                       rmm::available_device_memory().first / 1024 / 1024, 
n_rects);
   if (n_rects == 0) return;
   auto stream = rmm::cuda_stream_default;
   auto prev_size = rects_.size();
 
+#ifdef GPUSPATIAL_PROFILING
+  Stopwatch sw(true);
+#endif
   rects_.resize(rects_.size() + n_rects, stream);
   CUDA_CHECK(cudaMemcpyAsync(rects_.data() + prev_size, rects, sizeof(box_t) * 
n_rects,
                              cudaMemcpyHostToDevice, stream));
+#ifdef GPUSPATIAL_PROFILING
+  stream.synchronize();
+  push_build_ms_ += sw.stop();
+#endif
 }
 
 template <typename SCALAR_T, int N_DIM>
 void RTSpatialIndex<SCALAR_T, N_DIM>::FinishBuilding() {
   auto stream = rmm::cuda_stream_default;
-
-  indexing_points_ = thrust::all_of(rmm::exec_policy_nosync(stream), 
rects_.begin(),
-                                    rects_.end(), [] __device__(const box_t& 
box) {
-                                      bool is_point = true;
-                                      for (int dim = 0; dim < n_dim; dim++) {
-                                        is_point &= box.get_min(dim) == 
box.get_max(dim);
-                                      }
-                                      return is_point;
-                                    });
+#ifdef GPUSPATIAL_PROFILING
+  Stopwatch sw(true);
+#endif
+  indexing_points_ =
+      thrust::all_of(rmm::exec_policy_nosync(stream), rects_.begin(), 
rects_.end(),
+                     [] __device__(const box_t& box) -> bool {
+                       bool is_point = true;
+                       for (int dim = 0; dim < n_dim; dim++) {
+                         is_point &= box.get_min(dim) == box.get_max(dim);
+                       }
+                       return is_point;
+                     });
 
   rmm::device_uvector<OptixAabb> aabbs{0, stream};
   if (indexing_points_) {
     points_.resize(rects_.size(), stream);
-    thrust::transform(rmm::exec_policy_nosync(stream), rects_.begin(), 
rects_.end(),
-                      points_.begin(),
-                      [] __device__(const box_t& box) { return box.get_min(); 
});
+    thrust::transform(
+        rmm::exec_policy_nosync(stream), rects_.begin(), rects_.end(), 
points_.begin(),
+        [] __device__(const box_t& box) -> point_t { return box.get_min(); });
     aabbs = std::move(detail::ComputeAABBs(stream, points_, point_ranges_,
                                            reordered_point_indices_,
                                            config_.n_points_per_aabb, rects_));
@@ -322,11 +337,18 @@ void RTSpatialIndex<SCALAR_T, N_DIM>::FinishBuilding() {
   handle_ = config_.rt_engine->BuildAccelCustom(stream, 
ArrayView<OptixAabb>(aabbs),
                                                 bvh_buffer_, 
config_.prefer_fast_build,
                                                 config_.compact);
-
+  stream.synchronize();
   GPUSPATIAL_LOG_INFO(
       "RTSpatialIndex %p (Free %zu MB), FinishBuilding Index on %s, Total 
geoms: %zu",
       this, rmm::available_device_memory().first / 1024 / 1024,
       indexing_points_ ? "Points" : "Rectangles", numGeometries());
+#ifdef GPUSPATIAL_PROFILING
+  finish_building_ms_ = sw.stop();
+  GPUSPATIAL_LOG_INFO(
+      "RTSpatialIndex %p (Free %zu MB), Profiling Results. PushBuild: %.2lf 
ms, FinishBuilding: %.2f ms",
+      this, rmm::available_device_memory().first / 1024 / 1024, push_build_ms_,
+      finish_building_ms_);
+#endif
 }
 
 template <typename SCALAR_T, int N_DIM>
@@ -338,6 +360,9 @@ void RTSpatialIndex<SCALAR_T, N_DIM>::Probe(const box_t* 
rects, uint32_t n_rects
   // Tracing. InProceedings of the 30th ACM SIGPLAN Annual Symposium on 
Principles and
   // Practice of Parallel Programming 2025"
   if (n_rects == 0) return;
+#ifdef GPUSPATIAL_PROFILING
+  Stopwatch sw(true);
+#endif
   SpatialIndexContext ctx;
   auto stream = stream_pool_->get_stream();
   rmm::device_uvector<box_t> d_rects(n_rects, stream);
@@ -346,48 +371,42 @@ void RTSpatialIndex<SCALAR_T, N_DIM>::Probe(const box_t* 
rects, uint32_t n_rects
   CUDA_CHECK(cudaMemcpyAsync(d_rects.data(), rects, sizeof(box_t) * n_rects,
                              cudaMemcpyHostToDevice, stream));
 
-  bool probe_points = thrust::all_of(rmm::exec_policy_nosync(stream), 
d_rects.begin(),
-                                     d_rects.end(), [] __device__(const box_t& 
box) {
-                                       bool is_point = true;
-                                       for (int dim = 0; dim < n_dim; dim++) {
-                                         is_point &= box.get_min(dim) == 
box.get_max(dim);
-                                       }
-                                       return is_point;
-                                     });
+  bool probe_points =
+      thrust::all_of(rmm::exec_policy_nosync(stream), d_rects.begin(), 
d_rects.end(),
+                     [] __device__(const box_t& box) -> bool {
+                       bool is_point = true;
+                       for (int dim = 0; dim < n_dim; dim++) {
+                         is_point &= box.get_min(dim) == box.get_max(dim);
+                       }
+                       return is_point;
+                     });
 
   if (probe_points) {
     d_points.resize(d_rects.size(), stream);
-    thrust::transform(rmm::exec_policy_nosync(stream), d_rects.begin(), 
d_rects.end(),
-                      d_points.begin(),
-                      [] __device__(const box_t& box) { return box.get_min(); 
});
+    thrust::transform(
+        rmm::exec_policy_nosync(stream), d_rects.begin(), d_rects.end(), 
d_points.begin(),
+        [] __device__(const box_t& box) -> point_t { return box.get_min(); });
     d_rects.resize(0, stream);
     d_rects.shrink_to_fit(stream);
-
   } else {
     // Build a BVH over the MBRs of the stream geometries
-#ifdef GPUSPATIAL_PROFILING
-    ctx.timer.start(stream);
-#endif
     rmm::device_uvector<OptixAabb> aabbs(n_rects, stream);
-    thrust::transform(rmm::exec_policy_nosync(stream), d_rects.begin(), 
d_rects.end(),
-                      aabbs.begin(),
-                      [] __device__(const box_t& mbr) { return 
mbr.ToOptixAabb(); });
+    thrust::transform(
+        rmm::exec_policy_nosync(stream), d_rects.begin(), d_rects.end(), 
aabbs.begin(),
+        [] __device__(const box_t& mbr) -> OptixAabb { return 
mbr.ToOptixAabb(); });
     ctx.handle = config_.rt_engine->BuildAccelCustom(
         stream, ArrayView<OptixAabb>(aabbs), ctx.bvh_buffer, 
config_.prefer_fast_build,
         config_.compact);
+  }
 #ifdef GPUSPATIAL_PROFILING
-    ctx.bvh_build_ms = ctx.timer.stop(stream);
+  ctx.stream.synchronize();
+  ctx.prepare_ms = sw.stop();
 #endif
-  }
-
   ctx.counter = std::make_unique<rmm::device_scalar<uint32_t>>(0, stream);
 
   bool swap_ids = false;
 
   auto query = [&](bool counting) {
-#ifdef GPUSPATIAL_PROFILING
-    ctx.timer.start(stream);
-#endif
     if (indexing_points_) {
       if (probe_points) {
         handleBuildPoint(ctx, ArrayView<point_t>(d_points), counting);
@@ -402,9 +421,6 @@ void RTSpatialIndex<SCALAR_T, N_DIM>::Probe(const box_t* 
rects, uint32_t n_rects
         handleBuildBox(ctx, ArrayView<box_t>(d_rects), counting);
       }
     }
-#ifdef GPUSPATIAL_PROFILING
-    ctx.rt_ms += ctx.timer.stop(stream);
-#endif
   };
 
   // first pass: counting
@@ -430,7 +446,6 @@ void RTSpatialIndex<SCALAR_T, N_DIM>::Probe(const box_t* 
rects, uint32_t n_rects
   }
 
 #ifdef GPUSPATIAL_PROFILING
-  Stopwatch sw;
   sw.start();
 #endif
   build_indices->resize(result_size);
@@ -443,15 +458,20 @@ void RTSpatialIndex<SCALAR_T, N_DIM>::Probe(const box_t* 
rects, uint32_t n_rects
                              sizeof(index_t) * result_size, 
cudaMemcpyDeviceToHost,
                              stream));
   stream.synchronize();
+
+  GPUSPATIAL_LOG_INFO(
+      "RTSpatialIndex %p (Free %zu MB), Probe %s, Size: %zu, Results: %zu", 
this,
+      rmm::available_device_memory().first / 1024 / 1024,
+      probe_points ? "Points" : "Rectangles",
+      probe_points ? d_points.size() : d_rects.size(), build_indices->size());
+
 #ifdef GPUSPATIAL_PROFILING
   sw.stop();
   ctx.copy_res_ms = sw.ms();
   GPUSPATIAL_LOG_INFO(
-      "RTSpatialIndex %p (Free %zu MB), Probe %s, Size: %zu, Results: %zu, 
Alloc: %.2f ms, BVH Build: %.2f ms, RT: %.2f ms, Copy res: %.2f ms",
-      this, rmm::available_device_memory().first / 1024 / 1024,
-      probe_points ? "Points" : "Rectangles",
-      probe_points ? d_points.size() : d_rects.size(), build_indices->size(),
-      ctx.alloc_ms, ctx.bvh_build_ms, ctx.rt_ms, ctx.copy_res_ms);
+      "RTSpatialIndex %p (Free %zu MB), Profiling Results. Alloc: %.2lf ms, 
Prepare: %.2lf ms, RT: %.2f ms, Copy res: %.2lf ms",
+      this, rmm::available_device_memory().first / 1024 / 1024, ctx.alloc_ms,
+      ctx.prepare_ms, ctx.rt_ms, ctx.copy_res_ms);
 #endif
 }
 
@@ -596,9 +616,8 @@ template <typename SCALAR_T, int N_DIM>
 void RTSpatialIndex<SCALAR_T, 
N_DIM>::allocateResultBuffer(SpatialIndexContext& ctx,
                                                            uint32_t capacity) 
const {
 #ifdef GPUSPATIAL_PROFILING
-  ctx.timer.start(ctx.stream);
+  Stopwatch sw(true);
 #endif
-
   GPUSPATIAL_LOG_INFO(
       "RTSpatialIndex %p (Free %zu MB), Allocate result buffer, memory 
consumption %zu MB, capacity %u",
       this, rmm::available_device_memory().first / 1024 / 1024,
@@ -607,7 +626,8 @@ void RTSpatialIndex<SCALAR_T, 
N_DIM>::allocateResultBuffer(SpatialIndexContext&
   ctx.build_indices.Init(ctx.stream, capacity);
   ctx.probe_indices.resize(capacity, ctx.stream);
 #ifdef GPUSPATIAL_PROFILING
-  ctx.alloc_ms += ctx.timer.stop(ctx.stream);
+  ctx.stream.synchronize();
+  ctx.alloc_ms += sw.stop();
 #endif
 }
 
@@ -649,7 +669,7 @@ template <typename SCALAR_T, int N_DIM>
 void RTSpatialIndex<SCALAR_T, N_DIM>::filter(SpatialIndexContext& ctx,
                                              uint32_t dim_x) const {
 #ifdef GPUSPATIAL_PROFILING
-  ctx.timer.start(ctx.stream);
+  Stopwatch sw(true);
 #endif
   if (dim_x > 0) {
     config_.rt_engine->Render(ctx.stream, ctx.shader_id, dim3{dim_x, 1, 1},
@@ -657,7 +677,8 @@ void RTSpatialIndex<SCALAR_T, 
N_DIM>::filter(SpatialIndexContext& ctx,
                                               
ctx.launch_params_buffer.size()));
   }
 #ifdef GPUSPATIAL_PROFILING
-  ctx.rt_ms += ctx.timer.stop(ctx.stream);
+  ctx.stream.synchronize();
+  ctx.rt_ms += sw.stop();
 #endif
 }
 
diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/rt_spatial_refiner.cu 
b/c/sedona-libgpuspatial/libgpuspatial/src/rt_spatial_refiner.cu
index 22e6748e..47324304 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/src/rt_spatial_refiner.cu
+++ b/c/sedona-libgpuspatial/libgpuspatial/src/rt_spatial_refiner.cu
@@ -47,10 +47,10 @@ void ReorderIndices(rmm::cuda_stream_view stream, INDEX_IT 
index_begin,
   auto sorted_begin = sorted_uniq_indices.begin();
   auto sorted_end = sorted_uniq_indices.end();
   thrust::transform(rmm::exec_policy_nosync(stream), index_begin, index_end,
-                    reordered_indices.begin(), [=] __device__(uint32_t val) {
+                    reordered_indices.begin(), [=] __device__(uint32_t val) -> 
uint32_t {
                       auto it =
                           thrust::lower_bound(thrust::seq, sorted_begin, 
sorted_end, val);
-                      return thrust::distance(sorted_begin, it);
+                      return cuda::std::distance(sorted_begin, it);
                     });
 }
 
@@ -91,19 +91,44 @@ RTSpatialRefiner::RTSpatialRefiner(const 
RTSpatialRefinerConfig& config)
 
 void RTSpatialRefiner::Clear() {
   auto stream = rmm::cuda_stream_default;
+#ifdef GPUSPATIAL_PROFILING
+  push_build_ms_ = 0.0;
+  finish_building_ms_ = 0.0;
+#endif
   wkb_loader_->Clear(stream);
   build_geometries_.Clear(stream);
 }
 
 void RTSpatialRefiner::PushBuild(const ArrowArrayView* build_array) {
   auto stream = rmm::cuda_stream_default;
-
+#ifdef GPUSPATIAL_PROFILING
+  Stopwatch sw(true);
+#endif
   wkb_loader_->Parse(stream, build_array, 0, build_array->length);
+#ifdef GPUSPATIAL_PROFILING
+  stream.synchronize();
+  push_build_ms_ += sw.stop();
+#endif
 }
 
 void RTSpatialRefiner::FinishBuilding() {
   auto stream = rmm::cuda_stream_default;
+#ifdef GPUSPATIAL_PROFILING
+  Stopwatch sw(true);
+#endif
   build_geometries_ = std::move(wkb_loader_->Finish(stream));
+  stream.synchronize();
+#ifdef GPUSPATIAL_PROFILING
+  finish_building_ms_ = sw.stop();
+  GPUSPATIAL_LOG_INFO(
+      "RTSpatialRefiner %p (Free %zu MB), Profiling Results. PushBuild: %.2lf 
ms, FinishBuilding: %.2lf ms",
+      this, rmm::available_device_memory().first / 1024 / 1024, push_build_ms_,
+      finish_building_ms_);
+#endif
+  GPUSPATIAL_LOG_INFO(
+      "RTSpatialRefiner %p (Free %zu MB), FinishBuilding Refiner, Total geoms: 
%zu", this,
+      rmm::available_device_memory().first / 1024 / 1024,
+      build_geometries_.num_features());
 }
 
 uint32_t RTSpatialRefiner::Refine(const ArrowArrayView* probe_array, Predicate 
predicate,
@@ -117,6 +142,9 @@ uint32_t RTSpatialRefiner::Refine(const ArrowArrayView* 
probe_array, Predicate p
     return RefinePipelined(probe_array, predicate, build_indices, 
probe_indices, len);
   }
 
+#ifdef GPUSPATIAL_PROFILING
+  Stopwatch sw(true);
+#endif
   SpatialRefinerContext ctx;
   ctx.cuda_stream = stream_pool_->get_stream();
 
@@ -129,6 +157,7 @@ uint32_t RTSpatialRefiner::Refine(const ArrowArrayView* 
probe_array, Predicate p
 
   buildIndicesMap(ctx.cuda_stream, d_probe_indices.begin(), 
d_probe_indices.end(),
                   probe_indices_map);
+  ctx.cuda_stream.synchronize();  // Ensure h_uniq_indices is ready before 
parsing
 
   loader_t loader(thread_pool_);
   loader_t::Config loader_config;
@@ -139,12 +168,20 @@ uint32_t RTSpatialRefiner::Refine(const ArrowArrayView* 
probe_array, Predicate p
                probe_indices_map.h_uniq_indices.end());
   auto probe_geoms = std::move(loader.Finish(ctx.cuda_stream));
 
+#ifdef GPUSPATIAL_PROFILING
+  ctx.cuda_stream.synchronize();
+  ctx.parse_ms = sw.stop();
+#endif
+
   GPUSPATIAL_LOG_INFO(
       "RTSpatialRefiner %p (Free %zu MB), Loaded Geometries, ProbeArray %ld, 
Loaded %u, Type %s",
       this, rmm::available_device_memory().first / 1024 / 1024, 
probe_array->length,
       probe_geoms.num_features(),
       GeometryTypeToString(probe_geoms.get_geometry_type()).c_str());
 
+#ifdef GPUSPATIAL_PROFILING
+  sw.start();
+#endif
   RelateEngine<point_t, index_t> relate_engine(&build_geometries_,
                                                config_.rt_engine.get());
   RelateEngine<point_t, index_t>::Config re_config;
@@ -165,20 +202,14 @@ uint32_t RTSpatialRefiner::Refine(const ArrowArrayView* 
probe_array, Predicate p
       this, rmm::available_device_memory().first / 1024 / 1024, len,
       PredicateToString(predicate));
 
-#ifdef GPUSPATIAL_PROFILING
-  ctx.timer.start(ctx.cuda_stream);
-#endif
-
   relate_engine.Evaluate(ctx.cuda_stream, probe_geoms, predicate, 
d_build_indices,
                          probe_indices_map.d_reordered_indices);
   auto new_size = d_build_indices.size();
 #ifdef GPUSPATIAL_PROFILING
-  float refine_ms = ctx.timer.stop(ctx.cuda_stream);
-  GPUSPATIAL_LOG_INFO("RTSpatialRefiner %p (Free %zu MB), Refine time %f, new 
size %zu",
-                      this, rmm::available_device_memory().first / 1024 / 
1024, refine_ms,
-                      new_size);
+  ctx.cuda_stream.synchronize();
+  ctx.refine_ms = sw.stop();
+  sw.start();
 #endif
-
   d_probe_indices.resize(new_size, ctx.cuda_stream);
 
   thrust::gather(rmm::exec_policy_nosync(ctx.cuda_stream),
@@ -199,6 +230,14 @@ uint32_t RTSpatialRefiner::Refine(const ArrowArrayView* 
probe_array, Predicate p
                              sizeof(uint32_t) * new_size, 
cudaMemcpyDeviceToHost,
                              ctx.cuda_stream));
   ctx.cuda_stream.synchronize();
+#ifdef GPUSPATIAL_PROFILING
+  ctx.copy_res_ms = sw.stop();
+
+  GPUSPATIAL_LOG_INFO(
+      "RTSpatialRefiner %p (Free %zu MB), Len %u, Profiling Results. Parse: %.2lf ms, Refine: %.2lf ms, Copy Results: %.2lf ms",
+      this, rmm::available_device_memory().first / 1024 / 1024, len, ctx.parse_ms,
+      ctx.refine_ms, ctx.copy_res_ms);
+#endif
   return new_size;
 }
 
@@ -236,7 +275,7 @@ uint32_t RTSpatialRefiner::RefinePipelined(const 
ArrowArrayView* probe_array,
   size_t batch_size = (len + n_batches - 1) / n_batches;
 
   GPUSPATIAL_LOG_INFO(
-      "RTSpatialRefiner %p, pipeline refinement, total len %u, batches %d, 
batch size %zu",
+      "RTSpatialRefiner %p, Pipeline Refinement. Total Len %u, Batches %d, 
Batch Size %zu",
       this, len, n_batches, batch_size);
 
   // Resource allocation for slots
@@ -280,6 +319,8 @@ uint32_t RTSpatialRefiner::RefinePipelined(const 
ArrowArrayView* probe_array,
 
     // 4. Parse WKB (CPU Heavy)
     slot->loader->Clear(slot->stream);
+    slot->stream.synchronize();  // Ensure h_uniq_indices is ready!
+
     slot->loader->Parse(slot->stream, probe_array,
                         slot->indices_map.h_uniq_indices.begin(),
                         slot->indices_map.h_uniq_indices.end());
@@ -288,14 +329,14 @@ uint32_t RTSpatialRefiner::RefinePipelined(const 
ArrowArrayView* probe_array,
     return slot->loader->Finish(slot->stream);
   };
 
+  main_stream.synchronize();  // Ensure allocation is done before main loop
+
   // --- PIPELINE PRIMING ---
   // Start processing Batch 0 immediately in background
   size_t first_batch_len = std::min(batch_size, (size_t)len);
   slots[0]->prep_future = std::async(std::launch::async, prepare_batch_task,
                                      slots[0].get(), 0, first_batch_len);
 
-  main_stream.synchronize();  // Ensure allocation is done before main loop
-
   // --- MAIN PIPELINE LOOP ---
   for (size_t offset = 0; offset < len; offset += batch_size) {
     int curr_idx = (offset / batch_size) % NUM_SLOTS;
@@ -327,7 +368,7 @@ uint32_t RTSpatialRefiner::RefinePipelined(const 
ArrowArrayView* probe_array,
     curr_slot->d_batch_build_indices.resize(current_batch_len, 
curr_slot->stream);
     CUDA_CHECK(cudaMemcpyAsync(curr_slot->d_batch_build_indices.data(), 
batch_build_ptr,
                                sizeof(uint32_t) * current_batch_len,
-                               cudaMemcpyHostToDevice, curr_slot->stream));
+                               cudaMemcpyDeviceToDevice, curr_slot->stream));
 
     // Relate/Refine
     // Note: Evaluate filters d_batch_build_indices in-place
@@ -398,7 +439,7 @@ template <typename INDEX_IT>
 void RTSpatialRefiner::buildIndicesMap(rmm::cuda_stream_view stream, INDEX_IT 
index_begin,
                                        INDEX_IT index_end,
                                        IndicesMap& indices_map) const {
-  auto len = thrust::distance(index_begin, index_end);
+  auto len = cuda::std::distance(index_begin, index_end);
   auto& d_uniq_indices = indices_map.d_uniq_indices;
   auto& h_uniq_indices = indices_map.h_uniq_indices;
 
@@ -410,7 +451,7 @@ void 
RTSpatialRefiner::buildIndicesMap(rmm::cuda_stream_view stream, INDEX_IT in
                d_uniq_indices.end());
   auto uniq_end = thrust::unique(rmm::exec_policy_nosync(stream), 
d_uniq_indices.begin(),
                                  d_uniq_indices.end());
-  auto uniq_size = thrust::distance(d_uniq_indices.begin(), uniq_end);
+  auto uniq_size = cuda::std::distance(d_uniq_indices.begin(), uniq_end);
 
   d_uniq_indices.resize(uniq_size, stream);
   h_uniq_indices.resize(uniq_size);


Reply via email to