Alex-PLACET commented on code in PR #49679:
URL: https://github.com/apache/arrow/pull/49679#discussion_r3498899248


##########
cpp/src/arrow/compute/kernels/vector_search_sorted.cc:
##########
@@ -0,0 +1,1183 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/api_vector.h"
+
+#include <algorithm>
+#include <memory>
+#include <numeric>
+#include <optional>
+#include <ranges>
+#include <type_traits>
+#include <utility>
+
+#include "arrow/array/array_primitive.h"
+#include "arrow/array/array_run_end.h"
+#include "arrow/array/concatenate.h"
+#include "arrow/array/util.h"
+#include "arrow/buffer_builder.h"
+#include "arrow/chunk_resolver.h"
+#include "arrow/compute/function.h"
+#include "arrow/compute/kernels/codegen_internal.h"
+#include "arrow/compute/kernels/vector_sort_internal.h"
+#include "arrow/compute/registry.h"
+#include "arrow/compute/registry_internal.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging_internal.h"
+#include "arrow/util/ree_util.h"
+#include "arrow/util/unreachable.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+
+namespace compute::internal {
+namespace {
+
+const SearchSortedOptions* GetDefaultSearchSortedOptions() {
+  static const auto kDefaultSearchSortedOptions = 
SearchSortedOptions::Defaults();
+  return &kDefaultSearchSortedOptions;
+}
+
+const FunctionDoc search_sorted_doc(
+    "Find insertion indices for sorted input",
+    ("Return the index where each needle should be inserted in a sorted input 
array\n"
+     "to maintain ascending order.\n"
+     "\n"
+     "With side='left', returns the first suitable index (lower bound).\n"
+     "With side='right', returns the last suitable index (upper bound).\n"
+     "\n"
+     "The searched values may be provided as an array or chunked array and 
must\n"
+     "already be sorted in ascending order. Null values in the searched array 
are\n"
+     "supported when clustered entirely at the start or\n"
+     "entirely at the end. Non-null needles are matched only against the 
non-null\n"
+     "portion of the searched array. Needles may be a scalar, array, or 
chunked\n"
+     "array. Null needles emit nulls in the output."),
+    {"values", "needles"}, "SearchSortedOptions");
+
+// This file implements search_sorted as a small pipeline that first normalizes
+// Arrow input shapes and then runs one typed binary-search core on logical
+// values.
+//
+// Plain arrays, run-end encoded arrays, chunked arrays, and scalar needles are
+// all adapted into a common accessor and run-visitor model so the search logic
+// does not care about physical layout.
+//
+// After validation, the kernel isolates the contiguous non-null window of the
+// searched values, because nulls are only supported when clustered at one end.
+// That window uses logical null counting for run-end encoded inputs, whose
+// nulls live in the values child rather than in a top-level validity bitmap.
+//
+// Needles then follow one of two paths. Scalars and plain arrays go through a
+// shared logical-run visitor: scalars become a single run, plain arrays become
+// one-element runs, and chunked inputs recurse chunk by chunk. Run-end encoded
+// needles take a simpler physical-run path: search each physical needle once,
+// rebuild a temporary run-end encoded uint64 result with the same run ends,
+// and run-end decode it back to the dense output shape.
+//
+// Output materialization is unified behind a typed-buffer builder with an
+// optional validity bitmap. Non-null-only needles only build the uint64 values
+// buffer, while nullable needles also emit a null bitmap.
+//
+// High-level flow:
+//
+//   values datum
+//       |
+//       +--> ValidateSortedValuesInput
+//       |
+//       +--> LogicalType / FindNonNullValuesRange
+//       |
+//       +--> VisitValuesAccessor
+//             |
+//             +--> PlainArrayAccessor
+//             |
+//             +--> RunEndEncodedValuesAccessor
+//             |
+//             +--> ChunkedArrayAccessor
+//             |
+//             `--> ChunkedRunEndEncodedValuesAccessor
+//
+//   needles datum
+//       |
+//       +--> ValidateNeedleInput
+//       |
+//       +--> DatumHasNulls
+//       |
+//       +--> REE needles
+//       |     +--> search physical runs once
+//       |     +--> rebuild temporary REE uint64 result
+//       |     `--> RunEndDecode back to dense output
+//       |
+//       `--> VisitNeedleRuns
+//             |
+//             +--> scalar needle  -> one logical run
+//             |
+//             +--> plain array    -> one-element runs
+//             |
+//             `--> chunked input  -> recurse chunk by chunk
+//
+//   normalized values accessor + normalized needle runs
+//       |
+//       `--> FindInsertionPoint<T>
+//             |
+//             +--> side = left  -> lower_bound semantics
+//             |
+//             `--> side = right -> upper_bound semantics
+//
+//   result materialization
+//       |
+//       +--> no needle nulls
+//       |     `--> InsertionIndexBuilder<false>
+//       |           `--> fill uint64 buffer directly
+//       |
+//       `--> nullable needles
+//             `--> InsertionIndexBuilder<true>
+//                   +--> AppendNulls for null runs
+//                   `--> bulk fill repeated indices and validity bits
+//
+// A rough map of the file:
+//
+//   [validation + type helpers]
+//           |
+//   [value accessors]
+//           |
+//   [needle visitors]
+//           |
+//   [typed search + output helpers]
+//           |
+//   [meta-function dispatch]
+//
+
+#define VISIT_SEARCH_SORTED_PHYSICAL_TYPES(VISIT) \
+  VISIT(BooleanType)                              \
+  VISIT(Int8Type)                                 \
+  VISIT(Int16Type)                                \
+  VISIT(Int32Type)                                \
+  VISIT(Int64Type)                                \
+  VISIT(UInt8Type)                                \
+  VISIT(UInt16Type)                               \
+  VISIT(UInt32Type)                               \
+  VISIT(UInt64Type)                               \
+  VISIT(FloatType)                                \
+  VISIT(DoubleType)                               \
+  VISIT(BinaryType)                               \
+  VISIT(LargeBinaryType)                          \
+  VISIT(BinaryViewType)
+
+template <typename ArrowType>
+using SearchValue = typename GetViewType<ArrowType>::T;
+
+struct NonNullValuesRange {
+  int64_t offset = 0;
+  int64_t length = 0;
+
+  /// Return whether the range spans the full searched values input.
+  bool is_identity(int64_t full_length) const {
+    return (offset == 0) && (length == full_length);
+  }
+};
+
+// Convert ArrayData to its physical representation so that typed accessors
+// can be constructed with a physical ArrowType (e.g. Date32 → Int32).
+// For REE arrays, only the values child type is converted; the REE wrapper
+// type stays unchanged.
+inline std::shared_ptr<ArrayData> ToPhysicalData(
+    const std::shared_ptr<ArrayData>& data,
+    const std::shared_ptr<DataType>& physical_type) {
+  if (data->type->id() == Type::RUN_END_ENCODED) {
+    auto result = data->Copy();
+    auto values_copy = result->child_data[1]->Copy();
+    values_copy->type = physical_type;
+    result->child_data[1] = std::move(values_copy);
+    return result;
+  }
+  auto result = data->Copy();
+  result->type = physical_type;
+  return result;
+}
+
+inline int64_t GetRunEndValue(const ArraySpan& run_ends, int64_t 
physical_index) {
+  switch (run_ends.type->id()) {
+    case Type::INT16:
+      return run_ends.GetValues<int16_t>(1)[physical_index];
+    case Type::INT32:
+      return run_ends.GetValues<int32_t>(1)[physical_index];
+    case Type::INT64:
+      return run_ends.GetValues<int64_t>(1)[physical_index];
+    default:
+      DCHECK(false) << "Unexpected run-end type for search_sorted values: "
+                    << run_ends.type->ToString();
+      return 0;
+  }
+}
+
+/// Comparator implementing Arrow's ascending-order semantics for supported 
types.
+template <typename ArrowType>
+struct SearchSortedCompare {
+  using ValueType = SearchValue<ArrowType>;
+
+  int operator()(const ValueType& left, const ValueType& right) const {
+    return CompareTypeValues<ArrowType>(left, right, SortOrder::Ascending,
+                                        NullPlacement::AtEnd);
+  }
+};
+
+/// Access logical values from a plain Arrow array.
+template <typename ArrowType>
+class PlainArrayAccessor {
+ public:
+  using ArrayType = typename TypeTraits<ArrowType>::ArrayType;
+  using ValueType = SearchValue<ArrowType>;
+
+  /// Build a typed accessor over a plain array payload.
+  explicit PlainArrayAccessor(const std::shared_ptr<ArrayData>& array_data)
+      : array_(array_data) {}
+
+  /// Return the logical length of the searched values.
+  int64_t length() const { return array_.length(); }
+
+  /// Return the logical value at the given logical position.
+  ValueType Value(int64_t index) const {
+    return GetViewType<ArrowType>::LogicalValue(array_.GetView(index));
+  }
+
+  uint64_t LogicalInsertionIndex(int64_t index) const {
+    return static_cast<uint64_t>(index);
+  }
+
+ private:
+  ArrayType array_;
+};
+
+/// Access logical values from a run-end encoded Arrow array.
+template <typename ArrowType>
+class RunEndEncodedValuesAccessor {
+ public:
+  using ArrayType = typename TypeTraits<ArrowType>::ArrayType;
+  using ValueType = SearchValue<ArrowType>;
+
+  /// Build a typed accessor over a run-end encoded payload.
+  explicit RunEndEncodedValuesAccessor(const RunEndEncodedArray& array)
+      : array_(array),
+        values_(array.values()->data()),
+        array_span_(*array.data()),
+        physical_range_(::arrow::ree_util::FindPhysicalRange(array_span_, 
array.offset(),
+                                                             array.length())) 
{}
+
+  /// Return the number of physical runs used as the search domain.
+  int64_t length() const { return physical_range_.second; }
+
+  /// Return the logical value at the given physical run position.
+  ValueType Value(int64_t index) const {
+    const auto physical_index = physical_range_.first + index;
+    return 
GetViewType<ArrowType>::LogicalValue(values_.GetView(physical_index));
+  }
+
+  int64_t LeadingNullRunCount() const {
+    int64_t null_run_count = 0;
+    for (int64_t index = 0; index < physical_range_.second; ++index) {
+      if (!values_.IsNull(physical_range_.first + index)) {
+        break;
+      }
+      ++null_run_count;
+    }
+    return null_run_count;
+  }
+
+  int64_t TrailingNullRunCount() const {
+    int64_t null_run_count = 0;
+    for (int64_t index = physical_range_.second; index > 0; --index) {
+      if (!values_.IsNull(physical_range_.first + index - 1)) {
+        break;
+      }
+      ++null_run_count;
+    }
+    return null_run_count;
+  }
+

Review Comment:
   done



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to