This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/sedona-db.git
The following commit(s) were added to refs/heads/main by this push:
new 346ab2ca feat(c/sedona-libgpuspatial): Fix synchronization bugs and
improve logging (#721)
346ab2ca is described below
commit 346ab2cacd85f5d007fba68d77af2faaa2728c76
Author: Liang Geng <[email protected]>
AuthorDate: Wed Mar 18 12:15:13 2026 +0800
feat(c/sedona-libgpuspatial): Fix synchronization bugs and improve logging
(#721)
---
.../libgpuspatial/CMakeLists.txt | 5 +
.../include/gpuspatial/geom/polygon.hpp | 1 +
.../include/gpuspatial/index/rt_spatial_index.cuh | 10 +-
.../gpuspatial/refine/rt_spatial_refiner.cuh | 9 +-
.../include/gpuspatial/utils/gpu_timer.hpp | 57 ---------
.../include/gpuspatial/utils/stopwatch.hpp | 5 +-
.../libgpuspatial/src/memory_manager.cc | 17 ++-
.../libgpuspatial/src/relate_engine.cu | 19 +--
.../libgpuspatial/src/rt_spatial_index.cu | 129 ++++++++++++---------
.../libgpuspatial/src/rt_spatial_refiner.cu | 77 +++++++++---
10 files changed, 176 insertions(+), 153 deletions(-)
diff --git a/c/sedona-libgpuspatial/libgpuspatial/CMakeLists.txt
b/c/sedona-libgpuspatial/libgpuspatial/CMakeLists.txt
index eab27248..65e554c3 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/CMakeLists.txt
+++ b/c/sedona-libgpuspatial/libgpuspatial/CMakeLists.txt
@@ -17,6 +17,7 @@
cmake_minimum_required(VERSION 3.30.4)
option(GPUSPATIAL_BUILD_TESTS "Build tests" OFF)
+option(GPUSPATIAL_PROFILING "Enable profiling" OFF)
# This must be set before project() to be picked up by vcpkg
if(GPUSPATIAL_BUILD_TESTS)
@@ -163,6 +164,10 @@ target_compile_options(gpuspatial
-DGPUSPATIAL_LOG_ACTIVE_LEVEL=RAPIDS_LOGGER_LOG_LEVEL_${LIBGPUSPATIAL_LOGGING_LEVEL}
)
+if(GPUSPATIAL_PROFILING)
+ target_compile_definitions(gpuspatial PUBLIC GPUSPATIAL_PROFILING)
+endif()
+
add_library(gpuspatial_c src/gpuspatial_c.cc)
target_link_libraries(gpuspatial_c PUBLIC gpuspatial)
diff --git
a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/polygon.hpp
b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/polygon.hpp
index e457a8fb..076ced2c 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/polygon.hpp
+++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/polygon.hpp
@@ -26,6 +26,7 @@
#include <cub/warp/warp_reduce.cuh>
#include <thrust/binary_search.h>
+#include <thrust/distance.h>
namespace gpuspatial {
diff --git
a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/rt_spatial_index.cuh
b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/rt_spatial_index.cuh
index baaeb77f..aa993c86 100644
---
a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/rt_spatial_index.cuh
+++
b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/rt_spatial_index.cuh
@@ -19,13 +19,11 @@
#include "gpuspatial/index/rt_spatial_index.hpp"
#include "gpuspatial/index/spatial_index.hpp"
#include "gpuspatial/rt/rt_engine.hpp"
-#include "gpuspatial/utils/gpu_timer.hpp"
#include "gpuspatial/utils/queue.hpp"
#include "rmm/cuda_stream_pool.hpp"
#include "rmm/cuda_stream_view.hpp"
#include "rmm/device_uvector.hpp"
-#define GPUSPATIAL_PROFILING
namespace gpuspatial {
/** * @brief A spatial index implementation using NVIDIA OptiX ray tracing
engine.
@@ -57,10 +55,9 @@ class RTSpatialIndex : public SpatialIndex<SCALAR_T, N_DIM> {
Queue<index_t> build_indices;
rmm::device_uvector<index_t> probe_indices{0, rmm::cuda_stream_default};
#ifdef GPUSPATIAL_PROFILING
- GPUTimer timer;
// counters
double alloc_ms = 0.0;
- double bvh_build_ms = 0.0;
+ double prepare_ms = 0.0;
double rt_ms = 0.0;
double copy_res_ms = 0.0;
#endif
@@ -92,7 +89,10 @@ class RTSpatialIndex : public SpatialIndex<SCALAR_T, N_DIM> {
rmm::device_uvector<point_t> points_{0, rmm::cuda_stream_default};
rmm::device_buffer bvh_buffer_{0, rmm::cuda_stream_default};
OptixTraversableHandle handle_;
-
+#ifdef GPUSPATIAL_PROFILING
+ double push_build_ms_ = 0.0f;
+ double finish_building_ms_ = 0.0f;
+#endif
void allocateResultBuffer(SpatialIndexContext& ctx, uint32_t capacity) const;
void handleBuildPoint(SpatialIndexContext& ctx, ArrayView<point_t> points,
diff --git
a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/refine/rt_spatial_refiner.cuh
b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/refine/rt_spatial_refiner.cuh
index 9d6d9d37..c89a442f 100644
---
a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/refine/rt_spatial_refiner.cuh
+++
b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/refine/rt_spatial_refiner.cuh
@@ -23,7 +23,6 @@
#include "gpuspatial/refine/spatial_refiner.hpp"
#include "gpuspatial/relate/relate_engine.cuh"
#include "gpuspatial/rt/rt_engine.hpp"
-#include "gpuspatial/utils/gpu_timer.hpp"
#include "gpuspatial/utils/thread_pool.hpp"
#include "geoarrow/geoarrow_type.h"
@@ -72,10 +71,7 @@ class RTSpatialRefiner : public SpatialRefiner {
struct SpatialRefinerContext {
rmm::cuda_stream_view cuda_stream;
#ifdef GPUSPATIAL_PROFILING
- GPUTimer timer;
- // counters
double parse_ms = 0.0;
- double alloc_ms = 0.0;
double refine_ms = 0.0;
double copy_res_ms = 0.0;
#endif
@@ -107,7 +103,10 @@ class RTSpatialRefiner : public SpatialRefiner {
std::shared_ptr<ThreadPool> thread_pool_;
std::unique_ptr<ParallelWkbLoader<point_t, index_t>> wkb_loader_;
dev_geometries_t build_geometries_;
-
+#ifdef GPUSPATIAL_PROFILING
+ double push_build_ms_ = 0.0f;
+ double finish_building_ms_ = 0.0f;
+#endif
template <typename INDEX_IT>
void buildIndicesMap(rmm::cuda_stream_view stream, INDEX_IT index_begin,
INDEX_IT index_end, IndicesMap& indices_map) const;
diff --git
a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/gpu_timer.hpp
b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/gpu_timer.hpp
deleted file mode 100644
index 1cec9359..00000000
---
a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/gpu_timer.hpp
+++ /dev/null
@@ -1,57 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-#pragma once
-#include "gpuspatial/utils/exception.hpp"
-
-#include <cuda_runtime.h>
-namespace gpuspatial {
-// A simple utility class for timing CUDA kernels.
-class GPUTimer {
- public:
- // Constructor creates the start and stop events.
- GPUTimer() {
- CUDA_CHECK(cudaEventCreate(&start_event));
- CUDA_CHECK(cudaEventCreate(&stop_event));
- }
-
- // Destructor destroys the events.
- ~GPUTimer() {
- CUDA_CHECK(cudaEventDestroy(start_event));
- CUDA_CHECK(cudaEventDestroy(stop_event));
- }
-
- // Records the start event in the specified stream.
- void start(cudaStream_t stream = 0) {
- CUDA_CHECK(cudaEventRecord(start_event, stream));
- }
-
- // Records the stop event and returns the elapsed time in milliseconds.
- float stop(cudaStream_t stream = 0) {
- CUDA_CHECK(cudaEventRecord(stop_event, stream));
- float elapsed_time_ms = 0.0f;
- // The following call will block the CPU thread until the stop event has
been
- // recorded.
- CUDA_CHECK(cudaEventSynchronize(stop_event));
- CUDA_CHECK(cudaEventElapsedTime(&elapsed_time_ms, start_event,
stop_event));
- return elapsed_time_ms;
- }
-
- private:
- cudaEvent_t start_event;
- cudaEvent_t stop_event;
-};
-} // namespace gpuspatial
diff --git
a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/stopwatch.hpp
b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/stopwatch.hpp
index 822fa92d..d4bd88d9 100644
---
a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/stopwatch.hpp
+++
b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/stopwatch.hpp
@@ -29,7 +29,10 @@ class Stopwatch {
}
void start() { t2 = t1 = std::chrono::high_resolution_clock::now(); }
- void stop() { t2 = std::chrono::high_resolution_clock::now(); }
+ double stop() {
+ t2 = std::chrono::high_resolution_clock::now();
+ return ms();
+ }
double ms() const {
return std::chrono::duration_cast<std::chrono::microseconds>(t2 -
t1).count() /
diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/memory_manager.cc
b/c/sedona-libgpuspatial/libgpuspatial/src/memory_manager.cc
index fdf66e70..c528d0c3 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/src/memory_manager.cc
+++ b/c/sedona-libgpuspatial/libgpuspatial/src/memory_manager.cc
@@ -17,7 +17,6 @@
#include "gpuspatial/mem/memory_manager.hpp"
#include "gpuspatial/utils/logger.hpp"
-
#if defined(_WIN32)
#include <windows.h>
#elif defined(__linux__)
@@ -25,6 +24,9 @@
#else // POSIX (BSD, Solaris, etc.)
#include <unistd.h>
#endif
+
+#include <mutex>
+
namespace gpuspatial {
namespace detail {
inline long long get_free_physical_memory() {
@@ -60,13 +62,17 @@ inline long long get_free_physical_memory() {
} // namespace detail
MemoryManager& MemoryManager::instance() {
- static MemoryManager instance;
- return instance;
+ // Use a heap allocation to bypass automatic static destruction.
+ // This prevents the destructor from running after RMM has already been torn
down.
+ // This is an intentional memory leak.
+ static MemoryManager* instance = new MemoryManager();
+ return *instance;
}
MemoryManager::~MemoryManager() { Shutdown(); }
void MemoryManager::Shutdown() {
+ GPUSPATIAL_LOG_INFO("Shutdown MemoryManager and releasing all resources.");
if (is_initialized_) {
rmm::mr::set_current_device_resource(nullptr);
active_resource_.reset();
@@ -78,12 +84,15 @@ void MemoryManager::Shutdown() {
}
void MemoryManager::Init(bool use_pool, int init_pool_precent) {
+ static std::mutex init_mtx;
+ std::lock_guard<std::mutex> lock(init_mtx);
+
if (is_initialized_) {
GPUSPATIAL_LOG_WARN(
"MemoryManager is already initialized. Skipping re-initialization.");
return;
}
-
+ GPUSPATIAL_LOG_INFO("Init Memory Manager");
cuda_mr_ = std::make_unique<CudaMR>();
use_pool_ = use_pool;
diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/relate_engine.cu
b/c/sedona-libgpuspatial/libgpuspatial/src/relate_engine.cu
index db081da2..cc20d51a 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/src/relate_engine.cu
+++ b/c/sedona-libgpuspatial/libgpuspatial/src/relate_engine.cu
@@ -25,8 +25,8 @@
#include "gpuspatial/utils/logger.hpp"
#include "rt/shaders/shader_id.hpp"
-#include <rmm/mr/device/pool_memory_resource.hpp>
-#include <rmm/mr/device/tracking_resource_adaptor.hpp>
+#include "rmm/mr/pool_memory_resource.hpp"
+#include "rmm/mr/tracking_resource_adaptor.hpp"
#include "rmm/cuda_stream_view.hpp"
#include "rmm/device_scalar.hpp"
#include "rmm/exec_policy.hpp"
@@ -34,6 +34,7 @@
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
+#include <cuda/std/iterator>
namespace gpuspatial {
namespace detail {
@@ -328,7 +329,7 @@ void RelateEngine<POINT_T, INDEX_T>::Evaluate(const
rmm::cuda_stream_view& strea
auto IM = relate(geom1, geom2);
return !detail::EvaluatePredicate(predicate, IM);
});
- size_t new_size = thrust::distance(zip_begin, end);
+ size_t new_size = cuda::std::distance(zip_begin, end);
ids1.resize(new_size, stream);
ids2.resize(new_size, stream);
}
@@ -459,7 +460,7 @@ void RelateEngine<POINT_T, INDEX_T>::EvaluateImpl(
// Collect uniq polygon ids to estimate total BVH memory usage
auto uniq_poly_ids_end = thrust::unique(rmm::exec_policy_nosync(stream),
uniq_poly_ids.begin(),
uniq_poly_ids.end());
- uniq_poly_ids.resize(thrust::distance(uniq_poly_ids.begin(),
uniq_poly_ids_end),
+ uniq_poly_ids.resize(cuda::std::distance(uniq_poly_ids.begin(),
uniq_poly_ids_end),
stream);
uniq_poly_ids.shrink_to_fit(stream);
@@ -488,7 +489,7 @@ void RelateEngine<POINT_T, INDEX_T>::EvaluateImpl(
uniq_poly_ids_end = thrust::unique(rmm::exec_policy_nosync(stream),
uniq_poly_ids.begin(),
uniq_poly_ids.end());
- uniq_poly_ids.resize(thrust::distance(uniq_poly_ids.begin(),
uniq_poly_ids_end),
+ uniq_poly_ids.resize(cuda::std::distance(uniq_poly_ids.begin(),
uniq_poly_ids_end),
stream);
uniq_poly_ids.shrink_to_fit(stream);
@@ -552,7 +553,7 @@ void RelateEngine<POINT_T, INDEX_T>::EvaluateImpl(
[=] __device__(const thrust::tuple<INDEX_T,
INDEX_T>& tu) {
return tu == invalid_tuple;
});
- size_t new_size = thrust::distance(zip_begin, end);
+ size_t new_size = cuda::std::distance(zip_begin, end);
point_ids.resize(new_size, stream);
poly_ids.resize(new_size, stream);
}
@@ -601,7 +602,7 @@ void RelateEngine<POINT_T, INDEX_T>::EvaluateImpl(
thrust::unique(rmm::exec_policy_nosync(stream),
uniq_multi_poly_ids.begin(),
uniq_multi_poly_ids.end());
uniq_multi_poly_ids.resize(
- thrust::distance(uniq_multi_poly_ids.begin(), uniq_multi_poly_ids_end),
stream);
+ cuda::std::distance(uniq_multi_poly_ids.begin(),
uniq_multi_poly_ids_end), stream);
uniq_multi_poly_ids.shrink_to_fit(stream);
auto bvh_bytes =
@@ -632,7 +633,7 @@ void RelateEngine<POINT_T, INDEX_T>::EvaluateImpl(
thrust::unique(rmm::exec_policy_nosync(stream),
uniq_multi_poly_ids.begin(),
uniq_multi_poly_ids.end());
uniq_multi_poly_ids.resize(
- thrust::distance(uniq_multi_poly_ids.begin(),
uniq_multi_poly_ids_end), stream);
+ cuda::std::distance(uniq_multi_poly_ids.begin(),
uniq_multi_poly_ids_end), stream);
uniq_multi_poly_ids.shrink_to_fit(stream);
rmm::device_uvector<int> IMs(ids_size_batch, stream);
@@ -699,7 +700,7 @@ void RelateEngine<POINT_T, INDEX_T>::EvaluateImpl(
[=] __device__(const thrust::tuple<INDEX_T,
INDEX_T>& tu) {
return tu == invalid_tuple;
});
- size_t new_size = thrust::distance(zip_begin, end);
+ size_t new_size = cuda::std::distance(zip_begin, end);
point_ids.resize(new_size, stream);
multi_poly_ids.resize(new_size, stream);
}
diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/rt_spatial_index.cu
b/c/sedona-libgpuspatial/libgpuspatial/src/rt_spatial_index.cu
index 9f76af49..30d3f5e9 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/src/rt_spatial_index.cu
+++ b/c/sedona-libgpuspatial/libgpuspatial/src/rt_spatial_index.cu
@@ -30,6 +30,7 @@
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
+#include <cuda/std/iterator>
#define OPTIX_MAX_RAYS (1lu << 30)
@@ -42,7 +43,7 @@ static rmm::device_uvector<OptixAabb>
ComputeAABBs(rmm::cuda_stream_view stream,
rmm::device_uvector<OptixAabb> aabbs(mbrs.size(), stream);
thrust::transform(rmm::exec_policy_nosync(stream), mbrs.begin(), mbrs.end(),
- aabbs.begin(), [] __device__(const Box<POINT_T>& mbr) {
+ aabbs.begin(), [] __device__(const Box<POINT_T>& mbr) ->
OptixAabb {
// handle empty boxes
if (mbr.get_min().empty() || mbr.get_max().empty()) {
// empty box
@@ -88,7 +89,7 @@ rmm::device_uvector<OptixAabb> ComputeAABBs(
rmm::device_uvector<uint32_t> morton_codes(np, stream);
// compute morton codes and reorder indices
thrust::transform(rmm::exec_policy_nosync(stream), points.begin(),
points.end(),
- morton_codes.begin(), [=] __device__(const POINT_T& p) {
+ morton_codes.begin(), [=] __device__(const POINT_T& p) ->
uint32_t {
POINT_T norm_p;
for (int dim = 0; dim < n_dim; dim++) {
@@ -271,6 +272,10 @@ void RTSpatialIndex<SCALAR_T, N_DIM>::Clear() {
GPUSPATIAL_LOG_INFO("RTSpatialIndex %p (Free %zu MB), Clear", this,
rmm::available_device_memory().first / 1024 / 1024);
auto stream = rmm::cuda_stream_default;
+#ifdef GPUSPATIAL_PROFILING
+ push_build_ms_ = 0.0;
+ finish_building_ms_ = 0.0;
+#endif
bvh_buffer_.resize(0, stream);
bvh_buffer_.shrink_to_fit(stream);
rects_.resize(0, stream);
@@ -282,36 +287,46 @@ void RTSpatialIndex<SCALAR_T, N_DIM>::Clear() {
template <typename SCALAR_T, int N_DIM>
void RTSpatialIndex<SCALAR_T, N_DIM>::PushBuild(const box_t* rects, uint32_t
n_rects) {
- GPUSPATIAL_LOG_INFO("RTSpatialIndex %p (Free %zu MB), PushBuild, rectangles
%zu", this,
+ GPUSPATIAL_LOG_INFO("RTSpatialIndex %p (Free %zu MB), PushBuild, Rectangles
%zu", this,
rmm::available_device_memory().first / 1024 / 1024,
n_rects);
if (n_rects == 0) return;
auto stream = rmm::cuda_stream_default;
auto prev_size = rects_.size();
+#ifdef GPUSPATIAL_PROFILING
+ Stopwatch sw(true);
+#endif
rects_.resize(rects_.size() + n_rects, stream);
CUDA_CHECK(cudaMemcpyAsync(rects_.data() + prev_size, rects, sizeof(box_t) *
n_rects,
cudaMemcpyHostToDevice, stream));
+#ifdef GPUSPATIAL_PROFILING
+ stream.synchronize();
+ push_build_ms_ += sw.stop();
+#endif
}
template <typename SCALAR_T, int N_DIM>
void RTSpatialIndex<SCALAR_T, N_DIM>::FinishBuilding() {
auto stream = rmm::cuda_stream_default;
-
- indexing_points_ = thrust::all_of(rmm::exec_policy_nosync(stream),
rects_.begin(),
- rects_.end(), [] __device__(const box_t&
box) {
- bool is_point = true;
- for (int dim = 0; dim < n_dim; dim++) {
- is_point &= box.get_min(dim) ==
box.get_max(dim);
- }
- return is_point;
- });
+#ifdef GPUSPATIAL_PROFILING
+ Stopwatch sw(true);
+#endif
+ indexing_points_ =
+ thrust::all_of(rmm::exec_policy_nosync(stream), rects_.begin(),
rects_.end(),
+ [] __device__(const box_t& box) -> bool {
+ bool is_point = true;
+ for (int dim = 0; dim < n_dim; dim++) {
+ is_point &= box.get_min(dim) == box.get_max(dim);
+ }
+ return is_point;
+ });
rmm::device_uvector<OptixAabb> aabbs{0, stream};
if (indexing_points_) {
points_.resize(rects_.size(), stream);
- thrust::transform(rmm::exec_policy_nosync(stream), rects_.begin(),
rects_.end(),
- points_.begin(),
- [] __device__(const box_t& box) { return box.get_min();
});
+ thrust::transform(
+ rmm::exec_policy_nosync(stream), rects_.begin(), rects_.end(),
points_.begin(),
+ [] __device__(const box_t& box) -> point_t { return box.get_min(); });
aabbs = std::move(detail::ComputeAABBs(stream, points_, point_ranges_,
reordered_point_indices_,
config_.n_points_per_aabb, rects_));
@@ -322,11 +337,18 @@ void RTSpatialIndex<SCALAR_T, N_DIM>::FinishBuilding() {
handle_ = config_.rt_engine->BuildAccelCustom(stream,
ArrayView<OptixAabb>(aabbs),
bvh_buffer_,
config_.prefer_fast_build,
config_.compact);
-
+ stream.synchronize();
GPUSPATIAL_LOG_INFO(
"RTSpatialIndex %p (Free %zu MB), FinishBuilding Index on %s, Total
geoms: %zu",
this, rmm::available_device_memory().first / 1024 / 1024,
indexing_points_ ? "Points" : "Rectangles", numGeometries());
+#ifdef GPUSPATIAL_PROFILING
+ finish_building_ms_ = sw.stop();
+ GPUSPATIAL_LOG_INFO(
+ "RTSpatialIndex %p (Free %zu MB), Profiling Results. PushBuild: %.2lf
ms, FinishBuilding: %.2f ms",
+ this, rmm::available_device_memory().first / 1024 / 1024, push_build_ms_,
+ finish_building_ms_);
+#endif
}
template <typename SCALAR_T, int N_DIM>
@@ -338,6 +360,9 @@ void RTSpatialIndex<SCALAR_T, N_DIM>::Probe(const box_t*
rects, uint32_t n_rects
// Tracing. InProceedings of the 30th ACM SIGPLAN Annual Symposium on
Principles and
// Practice of Parallel Programming 2025"
if (n_rects == 0) return;
+#ifdef GPUSPATIAL_PROFILING
+ Stopwatch sw(true);
+#endif
SpatialIndexContext ctx;
auto stream = stream_pool_->get_stream();
rmm::device_uvector<box_t> d_rects(n_rects, stream);
@@ -346,48 +371,42 @@ void RTSpatialIndex<SCALAR_T, N_DIM>::Probe(const box_t*
rects, uint32_t n_rects
CUDA_CHECK(cudaMemcpyAsync(d_rects.data(), rects, sizeof(box_t) * n_rects,
cudaMemcpyHostToDevice, stream));
- bool probe_points = thrust::all_of(rmm::exec_policy_nosync(stream),
d_rects.begin(),
- d_rects.end(), [] __device__(const box_t&
box) {
- bool is_point = true;
- for (int dim = 0; dim < n_dim; dim++) {
- is_point &= box.get_min(dim) ==
box.get_max(dim);
- }
- return is_point;
- });
+ bool probe_points =
+ thrust::all_of(rmm::exec_policy_nosync(stream), d_rects.begin(),
d_rects.end(),
+ [] __device__(const box_t& box) -> bool {
+ bool is_point = true;
+ for (int dim = 0; dim < n_dim; dim++) {
+ is_point &= box.get_min(dim) == box.get_max(dim);
+ }
+ return is_point;
+ });
if (probe_points) {
d_points.resize(d_rects.size(), stream);
- thrust::transform(rmm::exec_policy_nosync(stream), d_rects.begin(),
d_rects.end(),
- d_points.begin(),
- [] __device__(const box_t& box) { return box.get_min();
});
+ thrust::transform(
+ rmm::exec_policy_nosync(stream), d_rects.begin(), d_rects.end(),
d_points.begin(),
+ [] __device__(const box_t& box) -> point_t { return box.get_min(); });
d_rects.resize(0, stream);
d_rects.shrink_to_fit(stream);
-
} else {
// Build a BVH over the MBRs of the stream geometries
-#ifdef GPUSPATIAL_PROFILING
- ctx.timer.start(stream);
-#endif
rmm::device_uvector<OptixAabb> aabbs(n_rects, stream);
- thrust::transform(rmm::exec_policy_nosync(stream), d_rects.begin(),
d_rects.end(),
- aabbs.begin(),
- [] __device__(const box_t& mbr) { return
mbr.ToOptixAabb(); });
+ thrust::transform(
+ rmm::exec_policy_nosync(stream), d_rects.begin(), d_rects.end(),
aabbs.begin(),
+ [] __device__(const box_t& mbr) -> OptixAabb { return
mbr.ToOptixAabb(); });
ctx.handle = config_.rt_engine->BuildAccelCustom(
stream, ArrayView<OptixAabb>(aabbs), ctx.bvh_buffer,
config_.prefer_fast_build,
config_.compact);
+ }
#ifdef GPUSPATIAL_PROFILING
- ctx.bvh_build_ms = ctx.timer.stop(stream);
+ ctx.stream.synchronize();
+ ctx.prepare_ms = sw.stop();
#endif
- }
-
ctx.counter = std::make_unique<rmm::device_scalar<uint32_t>>(0, stream);
bool swap_ids = false;
auto query = [&](bool counting) {
-#ifdef GPUSPATIAL_PROFILING
- ctx.timer.start(stream);
-#endif
if (indexing_points_) {
if (probe_points) {
handleBuildPoint(ctx, ArrayView<point_t>(d_points), counting);
@@ -402,9 +421,6 @@ void RTSpatialIndex<SCALAR_T, N_DIM>::Probe(const box_t*
rects, uint32_t n_rects
handleBuildBox(ctx, ArrayView<box_t>(d_rects), counting);
}
}
-#ifdef GPUSPATIAL_PROFILING
- ctx.rt_ms += ctx.timer.stop(stream);
-#endif
};
// first pass: counting
@@ -430,7 +446,6 @@ void RTSpatialIndex<SCALAR_T, N_DIM>::Probe(const box_t*
rects, uint32_t n_rects
}
#ifdef GPUSPATIAL_PROFILING
- Stopwatch sw;
sw.start();
#endif
build_indices->resize(result_size);
@@ -443,15 +458,20 @@ void RTSpatialIndex<SCALAR_T, N_DIM>::Probe(const box_t*
rects, uint32_t n_rects
sizeof(index_t) * result_size,
cudaMemcpyDeviceToHost,
stream));
stream.synchronize();
+
+ GPUSPATIAL_LOG_INFO(
+ "RTSpatialIndex %p (Free %zu MB), Probe %s, Size: %zu, Results: %zu",
this,
+ rmm::available_device_memory().first / 1024 / 1024,
+ probe_points ? "Points" : "Rectangles",
+ probe_points ? d_points.size() : d_rects.size(), build_indices->size());
+
#ifdef GPUSPATIAL_PROFILING
sw.stop();
ctx.copy_res_ms = sw.ms();
GPUSPATIAL_LOG_INFO(
- "RTSpatialIndex %p (Free %zu MB), Probe %s, Size: %zu, Results: %zu,
Alloc: %.2f ms, BVH Build: %.2f ms, RT: %.2f ms, Copy res: %.2f ms",
- this, rmm::available_device_memory().first / 1024 / 1024,
- probe_points ? "Points" : "Rectangles",
- probe_points ? d_points.size() : d_rects.size(), build_indices->size(),
- ctx.alloc_ms, ctx.bvh_build_ms, ctx.rt_ms, ctx.copy_res_ms);
+ "RTSpatialIndex %p (Free %zu MB), Profiling Results. Alloc: %.2lf ms,
Prepare: %.2lf ms, RT: %.2f ms, Copy res: %.2lf ms",
+ this, rmm::available_device_memory().first / 1024 / 1024, ctx.alloc_ms,
+ ctx.prepare_ms, ctx.rt_ms, ctx.copy_res_ms);
#endif
}
@@ -596,9 +616,8 @@ template <typename SCALAR_T, int N_DIM>
void RTSpatialIndex<SCALAR_T,
N_DIM>::allocateResultBuffer(SpatialIndexContext& ctx,
uint32_t capacity)
const {
#ifdef GPUSPATIAL_PROFILING
- ctx.timer.start(ctx.stream);
+ Stopwatch sw(true);
#endif
-
GPUSPATIAL_LOG_INFO(
"RTSpatialIndex %p (Free %zu MB), Allocate result buffer, memory
consumption %zu MB, capacity %u",
this, rmm::available_device_memory().first / 1024 / 1024,
@@ -607,7 +626,8 @@ void RTSpatialIndex<SCALAR_T,
N_DIM>::allocateResultBuffer(SpatialIndexContext&
ctx.build_indices.Init(ctx.stream, capacity);
ctx.probe_indices.resize(capacity, ctx.stream);
#ifdef GPUSPATIAL_PROFILING
- ctx.alloc_ms += ctx.timer.stop(ctx.stream);
+ ctx.stream.synchronize();
+ ctx.alloc_ms += sw.stop();
#endif
}
@@ -649,7 +669,7 @@ template <typename SCALAR_T, int N_DIM>
void RTSpatialIndex<SCALAR_T, N_DIM>::filter(SpatialIndexContext& ctx,
uint32_t dim_x) const {
#ifdef GPUSPATIAL_PROFILING
- ctx.timer.start(ctx.stream);
+ Stopwatch sw(true);
#endif
if (dim_x > 0) {
config_.rt_engine->Render(ctx.stream, ctx.shader_id, dim3{dim_x, 1, 1},
@@ -657,7 +677,8 @@ void RTSpatialIndex<SCALAR_T,
N_DIM>::filter(SpatialIndexContext& ctx,
ctx.launch_params_buffer.size()));
}
#ifdef GPUSPATIAL_PROFILING
- ctx.rt_ms += ctx.timer.stop(ctx.stream);
+ ctx.stream.synchronize();
+ ctx.rt_ms += sw.stop();
#endif
}
diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/rt_spatial_refiner.cu
b/c/sedona-libgpuspatial/libgpuspatial/src/rt_spatial_refiner.cu
index 22e6748e..47324304 100644
--- a/c/sedona-libgpuspatial/libgpuspatial/src/rt_spatial_refiner.cu
+++ b/c/sedona-libgpuspatial/libgpuspatial/src/rt_spatial_refiner.cu
@@ -47,10 +47,10 @@ void ReorderIndices(rmm::cuda_stream_view stream, INDEX_IT
index_begin,
auto sorted_begin = sorted_uniq_indices.begin();
auto sorted_end = sorted_uniq_indices.end();
thrust::transform(rmm::exec_policy_nosync(stream), index_begin, index_end,
- reordered_indices.begin(), [=] __device__(uint32_t val) {
+ reordered_indices.begin(), [=] __device__(uint32_t val) ->
uint32_t {
auto it =
thrust::lower_bound(thrust::seq, sorted_begin,
sorted_end, val);
- return thrust::distance(sorted_begin, it);
+ return cuda::std::distance(sorted_begin, it);
});
}
@@ -91,19 +91,44 @@ RTSpatialRefiner::RTSpatialRefiner(const
RTSpatialRefinerConfig& config)
void RTSpatialRefiner::Clear() {
auto stream = rmm::cuda_stream_default;
+#ifdef GPUSPATIAL_PROFILING
+ push_build_ms_ = 0.0;
+ finish_building_ms_ = 0.0;
+#endif
wkb_loader_->Clear(stream);
build_geometries_.Clear(stream);
}
void RTSpatialRefiner::PushBuild(const ArrowArrayView* build_array) {
auto stream = rmm::cuda_stream_default;
-
+#ifdef GPUSPATIAL_PROFILING
+ Stopwatch sw(true);
+#endif
wkb_loader_->Parse(stream, build_array, 0, build_array->length);
+#ifdef GPUSPATIAL_PROFILING
+ stream.synchronize();
+ push_build_ms_ += sw.stop();
+#endif
}
void RTSpatialRefiner::FinishBuilding() {
auto stream = rmm::cuda_stream_default;
+#ifdef GPUSPATIAL_PROFILING
+ Stopwatch sw(true);
+#endif
build_geometries_ = std::move(wkb_loader_->Finish(stream));
+ stream.synchronize();
+#ifdef GPUSPATIAL_PROFILING
+ finish_building_ms_ = sw.stop();
+ GPUSPATIAL_LOG_INFO(
+ "RTSpatialRefiner %p (Free %zu MB), Profiling Results. PushBuild: %.2lf
ms, FinishBuilding: %.2lf ms",
+ this, rmm::available_device_memory().first / 1024 / 1024, push_build_ms_,
+ finish_building_ms_);
+#endif
+ GPUSPATIAL_LOG_INFO(
+ "RTSpatialRefiner %p (Free %zu MB), FinishBuilding Refiner, Total geoms:
%zu", this,
+ rmm::available_device_memory().first / 1024 / 1024,
+ build_geometries_.num_features());
}
uint32_t RTSpatialRefiner::Refine(const ArrowArrayView* probe_array, Predicate
predicate,
@@ -117,6 +142,9 @@ uint32_t RTSpatialRefiner::Refine(const ArrowArrayView*
probe_array, Predicate p
return RefinePipelined(probe_array, predicate, build_indices,
probe_indices, len);
}
+#ifdef GPUSPATIAL_PROFILING
+ Stopwatch sw(true);
+#endif
SpatialRefinerContext ctx;
ctx.cuda_stream = stream_pool_->get_stream();
@@ -129,6 +157,7 @@ uint32_t RTSpatialRefiner::Refine(const ArrowArrayView*
probe_array, Predicate p
buildIndicesMap(ctx.cuda_stream, d_probe_indices.begin(),
d_probe_indices.end(),
probe_indices_map);
+ ctx.cuda_stream.synchronize(); // Ensure h_uniq_indices is ready before
parsing
loader_t loader(thread_pool_);
loader_t::Config loader_config;
@@ -139,12 +168,20 @@ uint32_t RTSpatialRefiner::Refine(const ArrowArrayView*
probe_array, Predicate p
probe_indices_map.h_uniq_indices.end());
auto probe_geoms = std::move(loader.Finish(ctx.cuda_stream));
+#ifdef GPUSPATIAL_PROFILING
+ ctx.cuda_stream.synchronize();
+ ctx.parse_ms = sw.stop();
+#endif
+
GPUSPATIAL_LOG_INFO(
"RTSpatialRefiner %p (Free %zu MB), Loaded Geometries, ProbeArray %ld,
Loaded %u, Type %s",
this, rmm::available_device_memory().first / 1024 / 1024,
probe_array->length,
probe_geoms.num_features(),
GeometryTypeToString(probe_geoms.get_geometry_type()).c_str());
+#ifdef GPUSPATIAL_PROFILING
+ sw.start();
+#endif
RelateEngine<point_t, index_t> relate_engine(&build_geometries_,
config_.rt_engine.get());
RelateEngine<point_t, index_t>::Config re_config;
@@ -165,20 +202,14 @@ uint32_t RTSpatialRefiner::Refine(const ArrowArrayView*
probe_array, Predicate p
this, rmm::available_device_memory().first / 1024 / 1024, len,
PredicateToString(predicate));
-#ifdef GPUSPATIAL_PROFILING
- ctx.timer.start(ctx.cuda_stream);
-#endif
-
relate_engine.Evaluate(ctx.cuda_stream, probe_geoms, predicate,
d_build_indices,
probe_indices_map.d_reordered_indices);
auto new_size = d_build_indices.size();
#ifdef GPUSPATIAL_PROFILING
- float refine_ms = ctx.timer.stop(ctx.cuda_stream);
- GPUSPATIAL_LOG_INFO("RTSpatialRefiner %p (Free %zu MB), Refine time %f, new
size %zu",
- this, rmm::available_device_memory().first / 1024 /
1024, refine_ms,
- new_size);
+ ctx.cuda_stream.synchronize();
+ ctx.refine_ms = sw.stop();
+ sw.start();
#endif
-
d_probe_indices.resize(new_size, ctx.cuda_stream);
thrust::gather(rmm::exec_policy_nosync(ctx.cuda_stream),
@@ -199,6 +230,14 @@ uint32_t RTSpatialRefiner::Refine(const ArrowArrayView*
probe_array, Predicate p
sizeof(uint32_t) * new_size,
cudaMemcpyDeviceToHost,
ctx.cuda_stream));
ctx.cuda_stream.synchronize();
+#ifdef GPUSPATIAL_PROFILING
+ ctx.copy_res_ms = sw.stop();
+
+ GPUSPATIAL_LOG_INFO(
+ "RTSpatialRefiner %p (Free %zu MB), Profiling Results. Parse: %.2lf ms,
Refine: %.2lf ms, Copy Results: %.2lf ms",
+ this, rmm::available_device_memory().first / 1024 / 1024, len,
ctx.parse_ms,
+ ctx.refine_ms, ctx.copy_res_ms);
+#endif
return new_size;
}
@@ -236,7 +275,7 @@ uint32_t RTSpatialRefiner::RefinePipelined(const
ArrowArrayView* probe_array,
size_t batch_size = (len + n_batches - 1) / n_batches;
GPUSPATIAL_LOG_INFO(
- "RTSpatialRefiner %p, pipeline refinement, total len %u, batches %d,
batch size %zu",
+ "RTSpatialRefiner %p, Pipeline Refinement. Total Len %u, Batches %d,
Batch Size %zu",
this, len, n_batches, batch_size);
// Resource allocation for slots
@@ -280,6 +319,8 @@ uint32_t RTSpatialRefiner::RefinePipelined(const
ArrowArrayView* probe_array,
// 4. Parse WKB (CPU Heavy)
slot->loader->Clear(slot->stream);
+ slot->stream.synchronize(); // Ensure h_uniq_indices is ready!
+
slot->loader->Parse(slot->stream, probe_array,
slot->indices_map.h_uniq_indices.begin(),
slot->indices_map.h_uniq_indices.end());
@@ -288,14 +329,14 @@ uint32_t RTSpatialRefiner::RefinePipelined(const
ArrowArrayView* probe_array,
return slot->loader->Finish(slot->stream);
};
+ main_stream.synchronize(); // Ensure allocation is done before main loop
+
// --- PIPELINE PRIMING ---
// Start processing Batch 0 immediately in background
size_t first_batch_len = std::min(batch_size, (size_t)len);
slots[0]->prep_future = std::async(std::launch::async, prepare_batch_task,
slots[0].get(), 0, first_batch_len);
- main_stream.synchronize(); // Ensure allocation is done before main loop
-
// --- MAIN PIPELINE LOOP ---
for (size_t offset = 0; offset < len; offset += batch_size) {
int curr_idx = (offset / batch_size) % NUM_SLOTS;
@@ -327,7 +368,7 @@ uint32_t RTSpatialRefiner::RefinePipelined(const
ArrowArrayView* probe_array,
curr_slot->d_batch_build_indices.resize(current_batch_len,
curr_slot->stream);
CUDA_CHECK(cudaMemcpyAsync(curr_slot->d_batch_build_indices.data(),
batch_build_ptr,
sizeof(uint32_t) * current_batch_len,
- cudaMemcpyHostToDevice, curr_slot->stream));
+ cudaMemcpyDeviceToDevice, curr_slot->stream));
// Relate/Refine
// Note: Evaluate filters d_batch_build_indices in-place
@@ -398,7 +439,7 @@ template <typename INDEX_IT>
void RTSpatialRefiner::buildIndicesMap(rmm::cuda_stream_view stream, INDEX_IT
index_begin,
INDEX_IT index_end,
IndicesMap& indices_map) const {
- auto len = thrust::distance(index_begin, index_end);
+ auto len = cuda::std::distance(index_begin, index_end);
auto& d_uniq_indices = indices_map.d_uniq_indices;
auto& h_uniq_indices = indices_map.h_uniq_indices;
@@ -410,7 +451,7 @@ void
RTSpatialRefiner::buildIndicesMap(rmm::cuda_stream_view stream, INDEX_IT in
d_uniq_indices.end());
auto uniq_end = thrust::unique(rmm::exec_policy_nosync(stream),
d_uniq_indices.begin(),
d_uniq_indices.end());
- auto uniq_size = thrust::distance(d_uniq_indices.begin(), uniq_end);
+ auto uniq_size = cuda::std::distance(d_uniq_indices.begin(), uniq_end);
d_uniq_indices.resize(uniq_size, stream);
h_uniq_indices.resize(uniq_size);