bkmgit commented on a change in pull request #11882:
URL: https://github.com/apache/arrow/pull/11882#discussion_r776936853
##########
File path: cpp/src/arrow/compute/kernels/scalar_compare.cc
##########
@@ -746,6 +787,155 @@ std::shared_ptr<ScalarFunction>
MakeScalarMinMax(std::string name,
return func;
}
+template <typename Op>
+void AddIntegerBetween(const std::shared_ptr<DataType>& ty, ScalarFunction*
func) {
+ auto exec = GeneratePhysicalInteger<ScalarTernaryEqualTypes, BooleanType,
Op>(*ty);
+ DCHECK_OK(func->AddKernel({ty, ty, ty}, boolean(), std::move(exec)));
+}
+
+template <typename InType, typename Op>
+void AddGenericBetween(const std::shared_ptr<DataType>& ty, ScalarFunction*
func) {
+ DCHECK_OK(func->AddKernel({ty, ty, ty}, boolean(),
+ ScalarTernaryEqualTypes<BooleanType, InType,
Op>::Exec));
+}
+
+template <typename OutType, typename ArgType, typename Op>
+struct BetweenTimestamps : public ScalarTernaryEqualTypes<OutType, ArgType,
Op> {
+ using Base = ScalarTernaryEqualTypes<OutType, ArgType, Op>;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const auto& var = checked_cast<const TimestampType&>(*batch[0].type());
+ const auto& lhs = checked_cast<const TimestampType&>(*batch[1].type());
+ const auto& rhs = checked_cast<const TimestampType&>(*batch[2].type());
+ if ((var.timezone().empty() != lhs.timezone().empty()) ||
+ (var.timezone().empty() != rhs.timezone().empty()) ||
+ (lhs.timezone().empty() != rhs.timezone().empty())) {
+ return Status::Invalid("Cannot compare timestamps with and without
timezones.");
+ }
+ return Base::Exec(ctx, batch, out);
+ }
+};
+
+template <typename Op>
+std::shared_ptr<ScalarFunction> MakeBetweenFunction(std::string name,
+ const FunctionDoc* doc) {
+ auto func = std::make_shared<CompareFunction>(name, Arity::Ternary(), doc);
+
+ DCHECK_OK(func->AddKernel(
+ {boolean(), boolean(), boolean()}, boolean(),
+ ScalarTernary<BooleanType, BooleanType, BooleanType, BooleanType,
Op>::Exec));
+
+ for (const std::shared_ptr<DataType>& ty : IntTypes()) {
+ AddIntegerBetween<Op>(ty, func.get());
+ }
+
+ AddIntegerBetween<Op>(date32(), func.get());
+ AddIntegerBetween<Op>(date64(), func.get());
+
+ AddGenericBetween<FloatType, Op>(float32(), func.get());
+ AddGenericBetween<DoubleType, Op>(float64(), func.get());
+
+ // Add timestamp kernels
+ for (auto unit : TimeUnit::values()) {
+ InputType in_type(match::TimestampTypeUnit(unit));
+ DCHECK_OK(func->AddKernel({in_type, in_type, in_type}, boolean(),
+ BetweenTimestamps<BooleanType, TimestampType,
Op>::Exec));
+ }
+
+ // Duration
+ for (auto unit : TimeUnit::values()) {
+ InputType in_type(match::DurationTypeUnit(unit));
+ auto exec =
+ GeneratePhysicalInteger<ScalarTernaryEqualTypes, BooleanType,
Op>(int64());
+ DCHECK_OK(func->AddKernel({in_type, in_type, in_type}, boolean(),
std::move(exec)));
+ }
+
+ // Time32 and Time64
+ for (auto unit : {TimeUnit::SECOND, TimeUnit::MILLI}) {
+ InputType in_type(match::Time32TypeUnit(unit));
+ auto exec =
+ GeneratePhysicalInteger<ScalarTernaryEqualTypes, BooleanType,
Op>(int32());
+ DCHECK_OK(func->AddKernel({in_type, in_type, in_type}, boolean(),
std::move(exec)));
+ }
+ for (auto unit : {TimeUnit::MICRO, TimeUnit::NANO}) {
+ InputType in_type(match::Time64TypeUnit(unit));
+ auto exec =
+ GeneratePhysicalInteger<ScalarTernaryEqualTypes, BooleanType,
Op>(int64());
+ DCHECK_OK(func->AddKernel({in_type, in_type, in_type}, boolean(),
std::move(exec)));
+ }
+
+ for (const std::shared_ptr<DataType>& ty : BaseBinaryTypes()) {
+ auto exec = GenerateVarBinaryBase<ScalarTernaryEqualTypes, BooleanType,
Op>(*ty);
+ DCHECK_OK(func->AddKernel(
+ {InputType::Array(ty), InputType::Scalar(ty), InputType::Scalar(ty)},
boolean(),
+ exec));
+ DCHECK_OK(func->AddKernel(
+ {InputType::Array(ty), InputType::Array(ty), InputType::Array(ty)},
boolean(),
+ std::move(exec)));
+ }
+
+ for (const auto id : {Type::DECIMAL128, Type::DECIMAL256}) {
+ auto exec = GenerateDecimal<ScalarTernaryEqualTypes, BooleanType, Op>(id);
+ DCHECK_OK(func->AddKernel({InputType(id), InputType(id), InputType(id)},
boolean(),
+ std::move(exec)));
+ }
+
+ {
+ auto exec = ScalarTernaryEqualTypes<BooleanType, FixedSizeBinaryType,
Op>::Exec;
+ auto ty = InputType(Type::FIXED_SIZE_BINARY);
+ DCHECK_OK(func->AddKernel({ty, ty, ty}, boolean(), std::move(exec)));
+ }
+
+ return func;
+}
+
+const BetweenOptions* GetDefaultBetweenOptions() {
+ static const auto kBetweenOptions = BetweenOptions::Defaults();
+ return &kBetweenOptions;
+}
+
+const FunctionDoc between_doc{"Check if values are in a range, val betwen a
and b",
+ ("A null on either side emits a null comparison
result.\n"
+ "options are used to specify if the endpoints
are\n"
+ "inclusive, possible values are NEITHER
(a<val<b),\n"
+ "LEFT (a<=val<b), RIGHT (a<val<=b), and the
default\n"
+ "if not specified BOTH (a<=val<=b)"),
+ {"val", "a", "b"}};
+
+class BetweenMetaFunction : public MetaFunction {
+ public:
+ BetweenMetaFunction()
+ : MetaFunction("between", Arity::Ternary(), &between_doc,
+ GetDefaultBetweenOptions()) {}
+
+ Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
+ const FunctionOptions* options,
+ ExecContext* ctx) const override {
+ const BetweenOptions& between_options = static_cast<const
BetweenOptions&>(*options);
+ Datum result;
Review comment:
Thanks for your feedback. Ok, on moving the functions to within the
dispatch for between. Underlying APIs should support natural integration into
other programming languages and not force Pandas/Python approach, but also be
easy to maintain and at least consistent for related functions. Would it make
sense to then also use one compare function, but have functions on top of this
for `greater`, `less` etc.? The `not_between` function could then be
implemented by calling `between` and reversing the result.
I understand that Pandas has lot of influence. In addition to databases,
there are other libraries such as
- [Ruby Daru](https://github.com/ankane/rover)
- [R Dplyr](https://dplyr.tidyverse.org/reference/between.html)
- [Hive Between](https://sqlandhadoop.com/hive-between/)
- [Ruby Rover](https://github.com/ankane/rover)
- [Ruby CSV
table](https://ruby-doc.org/stdlib-3.1.0/libdoc/csv/rdoc/CSV/Table.html)
- [Link
compare](https://docs.microsoft.com/en-us/dotnet/visual-basic/language-reference/operators/comparison-operators)
- [Apache Calcite Algebra](https://calcite.apache.org/docs/algebra.html)
For many of these, chained operations is the way to implement between. It is
unclear if one can write a Kernel generator that would minimize loading data
from memory for chained operations - Calcite approach seems useful, but need to
understand it better. Will ask on mailing list for thoughts on these kind of
approaches since they would be something new.
Finally, I have not done much performance measurement and optimization of
these kernels, but this can probably be done once tests and relevant
functionality are working.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]