bkietz commented on code in PR #43256:
URL: https://github.com/apache/arrow/pull/43256#discussion_r1722187527


##########
cpp/src/arrow/compute/expression.cc:
##########
@@ -1148,6 +1150,71 @@ Result<Expression> Canonicalize(Expression expr, 
compute::ExecContext* exec_cont
 
 namespace {
 
+/// Context for expression simplification.
+struct SimplificationContext {
+  /// Mapping from `is_in` calls to simplified value sets.
+  ///
+  /// `is_in` predicates with large value sets can be expensive to bind, so we
+  /// accumulate simplifications in the context and defer binding until all
+  /// inequalities have been processed.
+  std::unordered_map<const Expression::Call*, std::shared_ptr<Array>> 
is_in_value_sets;
+};
+
+/// Preprocess an `is_in` predicate value set for simplification.
+/// \return the value set sorted with duplicate and null values removed
+Result<std::shared_ptr<Array>> PrepareIsInValueSet(std::shared_ptr<Array> 
value_set) {
+  ARROW_ASSIGN_OR_RAISE(value_set, Unique(std::move(value_set)));
+  ARROW_ASSIGN_OR_RAISE(
+      std::shared_ptr<Array> sort_indices,
+      SortIndices(value_set, SortOptions({}, NullPlacement::AtEnd)));
+  ARROW_ASSIGN_OR_RAISE(
+      value_set,
+      Take(*value_set, *sort_indices, TakeOptions(/*bounds_check=*/false)));
+  if (value_set->length() > 0 && value_set->IsNull(value_set->length() - 1)) {
+    // If the last one is null we know it's the only
+    // one because of the call to `Unique` above.
+    value_set = value_set->Slice(0, value_set->length() - 1);
+  }
+  return value_set;
+}
+
+/// Get the value set for an `is_in` predicate for simplification.
+///
+/// If the `is_in` predicate is being simplified, then we look up the value set
+/// from the simplification context. Otherwise, we return a prepared value set,
+/// memoized in the `is_in_call` kernel state.
+///
+/// \pre `is_in_call` is a call to the `is_in` function
+/// \return the value set to be simplified, guaranteed to be sorted with no
+///         duplicate or null values and cast to the given type
+Result<std::shared_ptr<Array>> GetIsInValueSetForSimplification(
+    const Expression::Call* is_in_call,
+    const TypeHolder& type,
+    SimplificationContext& context) {
+  DCHECK_EQ(is_in_call->function_name, "is_in");
+  std::shared_ptr<Array>& value_set = context.is_in_value_sets[is_in_call];
+  if (!value_set) {
+    // Simplification for `is_in` requires that the value set is preprocessed.
+    // We store the prepared value set in the kernel state to avoid repeated
+    // preprocessing across calls to `SimplifyWithGuarantee`.
+    auto state = checked_pointer_cast<internal::SetLookupStateBase>(
+        is_in_call->kernel_state);
+    if (!state->sorted_and_unique_value_set) {
+      auto options = 
checked_pointer_cast<SetLookupOptions>(is_in_call->options);
+      auto unprepared_value_set = options->value_set.make_array();
+      ARROW_ASSIGN_OR_RAISE(state->sorted_and_unique_value_set,
+                            PrepareIsInValueSet(unprepared_value_set));
+    }
+    if (!state->sorted_and_unique_value_set->type()->Equals(*type)) {
+      ARROW_ASSIGN_OR_RAISE(
+          state->sorted_and_unique_value_set,

Review Comment:
   I think that this should at least initially be simplified to avoid 
sorting/uniquing sets altogether. Initially winnowing of value sets should be 
accomplished by filtering with the guarantee.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to