edponce commented on a change in pull request #11882:
URL: https://github.com/apache/arrow/pull/11882#discussion_r776597556



##########
File path: cpp/src/arrow/compute/api_scalar.h
##########
@@ -1240,5 +1255,22 @@ ARROW_EXPORT Result<Datum> AssumeTimezone(const Datum& 
values,
                                           AssumeTimezoneOptions options,
                                           ExecContext* ctx = NULLPTR);
 
+/// \brief Between compares each element in `values`
+/// with `left` as a lower bound and 'right' as an upperbound

Review comment:
       `upperbound` --> `upper bound`

##########
File path: cpp/src/arrow/compute/api_scalar.h
##########
@@ -1240,5 +1255,22 @@ ARROW_EXPORT Result<Datum> AssumeTimezone(const Datum& 
values,
                                           AssumeTimezoneOptions options,
                                           ExecContext* ctx = NULLPTR);
 
+/// \brief Between compares each element in `values`
+/// with `left` as a lower bound and 'right' as an upperbound
+///
+/// \param[in] values input to compare between left and right
+/// \param[in] left used as the lower bound for comparison
+/// \param[in] right used as the upper bound for comparison
+//  \param[in] options between options, optional
+/// \param[in] ctx the function execution context, optional
+///
+/// \return the resulting datum
+///
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Between(const Datum& values, const Datum& left, const Datum& 
right,
+                      BetweenOptions options = BetweenOptions(),

Review comment:
       `BetweenOptions()` --> `BetweenOptions::Defaults()`

##########
File path: cpp/src/arrow/compute/api_scalar.cc
##########
@@ -506,6 +537,12 @@ void RegisterScalarOptions(FunctionRegistry* registry) {
     return CallFunction(REGISTRY_NAME, {left, right}, ctx);                    
 \
   }
 
+#define SCALAR_EAGER_TERNARY(NAME, REGISTRY_NAME)                              
 \
+  Result<Datum> NAME(const Datum& value, const Datum& left, const Datum& 
right, \
+                     ExecContext* ctx) {                                       
 \
+    return CallFunction(REGISTRY_NAME, {value, left, right}, ctx);             
 \
+  }
+

Review comment:
       Remove this macro as it is not used and at the moment there are no other 
ternary functions for a general macro to be too useful.

##########
File path: cpp/src/arrow/compute/api_scalar.h
##########
@@ -316,6 +316,21 @@ struct ARROW_EXPORT CompareOptions {
   enum CompareOperator op;
 };
 
+enum class BetweenMode : int8_t {
+  LESS_EQUAL_LESS_EQUAL,
+  LESS_EQUAL_LESS,
+  LESS_LESS_EQUAL,
+  LESS_LESS,
+};

Review comment:
       I suggest to place this `enum` inside `BetweenOptions` as it is not 
necessary to have it in global space. Also rename to `Operator`, so that it 
would be invoked as `BetweenOptions::Operator::LESS_LESS` or 
`BetweenOptions::LESS_LESS`.
   
   I recognize there are some inconsistencies wrt to the visibility and 
scopeness of enums, and it is out-of-scope for this PR to discuss such details. 
I will ask to get feedback on whether we should revise the conventions for 
`FunctionOptions`.

##########
File path: cpp/src/arrow/compute/kernels/scalar_compare.cc
##########
@@ -786,6 +932,34 @@ const FunctionDoc max_element_wise_doc{
     {"*args"},
     "ElementWiseAggregateOptions"};
 
+const FunctionDoc between_doc{"Check if values are in a range, val betwen a 
and b",

Review comment:
       Remove `between_doc` because there is no `CallFunction("between", ...)`.

##########
File path: cpp/src/arrow/util/bit_block_counter.h
##########
@@ -87,6 +87,147 @@ struct BitBlockOrNot<bool> {
   static bool Call(bool left, bool right) { return left || !right; }
 };
 
+// Three Arguments
+template <typename T>
+struct BitBlockAndAnd {
+  static T Call(T left, T mid, T right) { return left & mid & right; }
+};
+
+template <>
+struct BitBlockAndAnd<bool> {
+  static bool Call(bool left, bool mid, bool right) { return left && mid && 
right; }

Review comment:
       There is no need to specialize all these bitwise operations for `bool` 
as the language guarantees that for boolean types: `(bool)(a & b) == a && b`. 
Note the explicit cast which corresponds to the return type `T` in these 
functions. This simplification also applies to the existing binary bitwise 
operations, so we can resolve this in a follow-up PR.

##########
File path: cpp/src/arrow/compute/kernels/scalar_compare.cc
##########
@@ -746,6 +787,111 @@ std::shared_ptr<ScalarFunction> 
MakeScalarMinMax(std::string name,
   return func;
 }
 
+template <typename Op>
+void AddIntegerBetween(const std::shared_ptr<DataType>& ty, ScalarFunction* 
func) {
+  auto exec = GeneratePhysicalInteger<ScalarTernaryEqualTypes, BooleanType, 
Op>(*ty);
+  DCHECK_OK(func->AddKernel({ty, ty, ty}, boolean(), std::move(exec)));
+}
+
+template <typename InType, typename Op>
+void AddGenericBetween(const std::shared_ptr<DataType>& ty, ScalarFunction* 
func) {
+  DCHECK_OK(func->AddKernel({ty, ty, ty}, boolean(),
+                            ScalarTernaryEqualTypes<BooleanType, InType, 
Op>::Exec));
+}
+
+template <typename OutType, typename ArgType, typename Op>
+struct BetweenTimestamps : public ScalarTernaryEqualTypes<OutType, ArgType, 
Op> {
+  using Base = ScalarTernaryEqualTypes<OutType, ArgType, Op>;
+
+  static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    const auto& var = checked_cast<const TimestampType&>(*batch[0].type());
+    const auto& lhs = checked_cast<const TimestampType&>(*batch[1].type());
+    const auto& rhs = checked_cast<const TimestampType&>(*batch[2].type());
+    if ((var.timezone().empty() != lhs.timezone().empty()) ||
+        (var.timezone().empty() != rhs.timezone().empty()) ||
+        (lhs.timezone().empty() != rhs.timezone().empty())) {
+      return Status::Invalid(
+          "Cannot use timestamps with timezone and timestamps without 
timezones, got: ",
+          var.timezone().empty(), " ", lhs.timezone().empty(), " and ",

Review comment:
       I would remove printing the values of `XXX.empty()` as they will output 
boolean values which are not that meaningful with the current wording. I 
suggest: `Cannot compare timestamps with and without timezones`.

##########
File path: cpp/src/arrow/compute/kernels/scalar_compare.cc
##########
@@ -746,6 +787,111 @@ std::shared_ptr<ScalarFunction> 
MakeScalarMinMax(std::string name,
   return func;
 }
 
+template <typename Op>
+void AddIntegerBetween(const std::shared_ptr<DataType>& ty, ScalarFunction* 
func) {
+  auto exec = GeneratePhysicalInteger<ScalarTernaryEqualTypes, BooleanType, 
Op>(*ty);
+  DCHECK_OK(func->AddKernel({ty, ty, ty}, boolean(), std::move(exec)));
+}
+
+template <typename InType, typename Op>
+void AddGenericBetween(const std::shared_ptr<DataType>& ty, ScalarFunction* 
func) {
+  DCHECK_OK(func->AddKernel({ty, ty, ty}, boolean(),
+                            ScalarTernaryEqualTypes<BooleanType, InType, 
Op>::Exec));
+}
+
+template <typename OutType, typename ArgType, typename Op>
+struct BetweenTimestamps : public ScalarTernaryEqualTypes<OutType, ArgType, 
Op> {
+  using Base = ScalarTernaryEqualTypes<OutType, ArgType, Op>;
+
+  static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    const auto& var = checked_cast<const TimestampType&>(*batch[0].type());
+    const auto& lhs = checked_cast<const TimestampType&>(*batch[1].type());
+    const auto& rhs = checked_cast<const TimestampType&>(*batch[2].type());
+    if ((var.timezone().empty() != lhs.timezone().empty()) ||
+        (var.timezone().empty() != rhs.timezone().empty()) ||
+        (lhs.timezone().empty() != rhs.timezone().empty())) {
+      return Status::Invalid(
+          "Cannot use timestamps with timezone and timestamps without 
timezones, got: ",
+          var.timezone().empty(), " ", lhs.timezone().empty(), " and ",
+          rhs.timezone().empty());
+    }
+    return Base::Exec(ctx, batch, out);
+  }
+};
+
+template <typename Op>
+std::shared_ptr<ScalarFunction> MakeBetweenFunction(std::string name,
+                                                    const FunctionDoc* doc) {
+  auto func = std::make_shared<CompareFunction>(name, Arity::Ternary(), doc);
+
+  DCHECK_OK(func->AddKernel(
+      {boolean(), boolean(), boolean()}, boolean(),
+      ScalarTernary<BooleanType, BooleanType, BooleanType, BooleanType, 
Op>::Exec));
+
+  for (const std::shared_ptr<DataType>& ty : IntTypes()) {
+    AddIntegerBetween<Op>(ty, func.get());
+  }
+
+  AddIntegerBetween<Op>(date32(), func.get());
+  AddIntegerBetween<Op>(date64(), func.get());
+
+  AddGenericBetween<FloatType, Op>(float32(), func.get());
+  AddGenericBetween<DoubleType, Op>(float64(), func.get());
+
+  // Add timestamp kernels
+  for (auto unit : TimeUnit::values()) {
+    InputType in_type(match::TimestampTypeUnit(unit));
+    DCHECK_OK(func->AddKernel({in_type, in_type, in_type}, boolean(),
+                              BetweenTimestamps<BooleanType, TimestampType, 
Op>::Exec));
+  }
+
+  // Duration
+  for (auto unit : TimeUnit::values()) {
+    InputType in_type(match::DurationTypeUnit(unit));
+    auto exec =
+        GeneratePhysicalInteger<ScalarTernaryEqualTypes, BooleanType, 
Op>(int64());
+    DCHECK_OK(func->AddKernel({in_type, in_type, in_type}, boolean(), 
std::move(exec)));
+  }
+
+  // Time32 and Time64
+  for (auto unit : {TimeUnit::SECOND, TimeUnit::MILLI}) {
+    InputType in_type(match::Time32TypeUnit(unit));
+    auto exec =
+        GeneratePhysicalInteger<ScalarTernaryEqualTypes, BooleanType, 
Op>(int32());
+    DCHECK_OK(func->AddKernel({in_type, in_type, in_type}, boolean(), 
std::move(exec)));
+  }
+  for (auto unit : {TimeUnit::MICRO, TimeUnit::NANO}) {
+    InputType in_type(match::Time64TypeUnit(unit));
+    auto exec =
+        GeneratePhysicalInteger<ScalarTernaryEqualTypes, BooleanType, 
Op>(int64());
+    DCHECK_OK(func->AddKernel({in_type, in_type, in_type}, boolean(), 
std::move(exec)));
+  }
+
+  for (const std::shared_ptr<DataType>& ty : BaseBinaryTypes()) {
+    auto exec = GenerateVarBinaryBase<ScalarTernaryEqualTypes, BooleanType, 
Op>(*ty);

Review comment:
       For consistency, do not dereference `ty`.

##########
File path: cpp/src/arrow/compute/kernels/scalar_compare.cc
##########
@@ -746,6 +787,111 @@ std::shared_ptr<ScalarFunction> 
MakeScalarMinMax(std::string name,
   return func;
 }
 
+template <typename Op>
+void AddIntegerBetween(const std::shared_ptr<DataType>& ty, ScalarFunction* 
func) {
+  auto exec = GeneratePhysicalInteger<ScalarTernaryEqualTypes, BooleanType, 
Op>(*ty);
+  DCHECK_OK(func->AddKernel({ty, ty, ty}, boolean(), std::move(exec)));
+}
+
+template <typename InType, typename Op>
+void AddGenericBetween(const std::shared_ptr<DataType>& ty, ScalarFunction* 
func) {
+  DCHECK_OK(func->AddKernel({ty, ty, ty}, boolean(),
+                            ScalarTernaryEqualTypes<BooleanType, InType, 
Op>::Exec));
+}

Review comment:
       Maybe inline `AddIntegerBetween` and `AddGenericBetween`, and remove 
these functions.
   There are already several explicit `GeneratePhysicalInteger` in 
`MakeBetweenFunction()`.
   You can use local lambdas too.

##########
File path: cpp/src/arrow/compute/kernels/scalar_compare.cc
##########
@@ -746,6 +787,111 @@ std::shared_ptr<ScalarFunction> 
MakeScalarMinMax(std::string name,
   return func;
 }
 
+template <typename Op>
+void AddIntegerBetween(const std::shared_ptr<DataType>& ty, ScalarFunction* 
func) {
+  auto exec = GeneratePhysicalInteger<ScalarTernaryEqualTypes, BooleanType, 
Op>(*ty);
+  DCHECK_OK(func->AddKernel({ty, ty, ty}, boolean(), std::move(exec)));
+}
+
+template <typename InType, typename Op>
+void AddGenericBetween(const std::shared_ptr<DataType>& ty, ScalarFunction* 
func) {
+  DCHECK_OK(func->AddKernel({ty, ty, ty}, boolean(),
+                            ScalarTernaryEqualTypes<BooleanType, InType, 
Op>::Exec));
+}
+
+template <typename OutType, typename ArgType, typename Op>
+struct BetweenTimestamps : public ScalarTernaryEqualTypes<OutType, ArgType, 
Op> {
+  using Base = ScalarTernaryEqualTypes<OutType, ArgType, Op>;
+
+  static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    const auto& var = checked_cast<const TimestampType&>(*batch[0].type());
+    const auto& lhs = checked_cast<const TimestampType&>(*batch[1].type());
+    const auto& rhs = checked_cast<const TimestampType&>(*batch[2].type());
+    if ((var.timezone().empty() != lhs.timezone().empty()) ||
+        (var.timezone().empty() != rhs.timezone().empty()) ||
+        (lhs.timezone().empty() != rhs.timezone().empty())) {
+      return Status::Invalid(
+          "Cannot use timestamps with timezone and timestamps without 
timezones, got: ",
+          var.timezone().empty(), " ", lhs.timezone().empty(), " and ",
+          rhs.timezone().empty());
+    }
+    return Base::Exec(ctx, batch, out);
+  }
+};
+
+template <typename Op>
+std::shared_ptr<ScalarFunction> MakeBetweenFunction(std::string name,
+                                                    const FunctionDoc* doc) {
+  auto func = std::make_shared<CompareFunction>(name, Arity::Ternary(), doc);
+
+  DCHECK_OK(func->AddKernel(
+      {boolean(), boolean(), boolean()}, boolean(),
+      ScalarTernary<BooleanType, BooleanType, BooleanType, BooleanType, 
Op>::Exec));
+
+  for (const std::shared_ptr<DataType>& ty : IntTypes()) {
+    AddIntegerBetween<Op>(ty, func.get());
+  }
+
+  AddIntegerBetween<Op>(date32(), func.get());
+  AddIntegerBetween<Op>(date64(), func.get());
+
+  AddGenericBetween<FloatType, Op>(float32(), func.get());
+  AddGenericBetween<DoubleType, Op>(float64(), func.get());
+
+  // Add timestamp kernels
+  for (auto unit : TimeUnit::values()) {
+    InputType in_type(match::TimestampTypeUnit(unit));
+    DCHECK_OK(func->AddKernel({in_type, in_type, in_type}, boolean(),
+                              BetweenTimestamps<BooleanType, TimestampType, 
Op>::Exec));
+  }
+
+  // Duration
+  for (auto unit : TimeUnit::values()) {
+    InputType in_type(match::DurationTypeUnit(unit));
+    auto exec =
+        GeneratePhysicalInteger<ScalarTernaryEqualTypes, BooleanType, 
Op>(int64());
+    DCHECK_OK(func->AddKernel({in_type, in_type, in_type}, boolean(), 
std::move(exec)));
+  }
+
+  // Time32 and Time64
+  for (auto unit : {TimeUnit::SECOND, TimeUnit::MILLI}) {
+    InputType in_type(match::Time32TypeUnit(unit));
+    auto exec =
+        GeneratePhysicalInteger<ScalarTernaryEqualTypes, BooleanType, 
Op>(int32());
+    DCHECK_OK(func->AddKernel({in_type, in_type, in_type}, boolean(), 
std::move(exec)));
+  }
+  for (auto unit : {TimeUnit::MICRO, TimeUnit::NANO}) {
+    InputType in_type(match::Time64TypeUnit(unit));
+    auto exec =
+        GeneratePhysicalInteger<ScalarTernaryEqualTypes, BooleanType, 
Op>(int64());
+    DCHECK_OK(func->AddKernel({in_type, in_type, in_type}, boolean(), 
std::move(exec)));
+  }
+
+  for (const std::shared_ptr<DataType>& ty : BaseBinaryTypes()) {
+    auto exec = GenerateVarBinaryBase<ScalarTernaryEqualTypes, BooleanType, 
Op>(*ty);
+    DCHECK_OK(func->AddKernel(
+        {InputType::Array(ty), InputType::Scalar(ty), InputType::Scalar(ty)}, 
boolean(),
+        exec));
+    DCHECK_OK(func->AddKernel(
+        {InputType::Array(ty), InputType::Array(ty), InputType::Array(ty)}, 
boolean(),
+        std::move(exec)));
+  }

Review comment:
       I am confused here. Why only `BaseBinaryTypes` have 
`Array-Scalar-Scalar` and `Array-Array-Array`, and others types do not? Also, 
what about mixed shapes `XXX-Scalar-Array` and `XXX-Array-Scalar?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to