wesm commented on code in PR #13521: URL: https://github.com/apache/arrow/pull/13521#discussion_r916212441
########## cpp/src/arrow/compute/cast.cc: ########## @@ -213,36 +201,48 @@ Result<const Kernel*> CastFunction::DispatchExact( return candidate_kernels[0]; } +Result<std::shared_ptr<CastFunction>> GetCastFunction(const TypeHolder& to_type) { + return internal::GetCastFunctionInternal(to_type); Review Comment: done ########## cpp/src/arrow/compute/function.cc: ########## @@ -186,87 +170,106 @@ const Kernel* DispatchExactImpl(const Function* func, } // namespace detail Result<const Kernel*> Function::DispatchExact( - const std::vector<ValueDescr>& values) const { + const std::vector<TypeHolder>& values) const { if (kind_ == Function::META) { return Status::NotImplemented("Dispatch for a MetaFunction's Kernels"); } - RETURN_NOT_OK(CheckArity(values)); + RETURN_NOT_OK(CheckArity(values.size())); if (auto kernel = detail::DispatchExactImpl(this, values)) { return kernel; } return detail::NoMatchingKernel(this, values); } -Result<const Kernel*> Function::DispatchBest(std::vector<ValueDescr>* values) const { +Result<const Kernel*> Function::DispatchBest(std::vector<TypeHolder>* values) const { // TODO(ARROW-11508) permit generic conversions here return DispatchExact(*values); } -Result<Datum> Function::Execute(const std::vector<Datum>& args, - const FunctionOptions* options, ExecContext* ctx) const { - return ExecuteInternal(args, /*passed_length=*/-1, options, ctx); +namespace { + +/// \brief Check that each Datum is of a "value" type, which means either +/// SCALAR, ARRAY, or CHUNKED_ARRAY. +Status CheckAllValues(const std::vector<Datum>& values) { + for (const auto& value : values) { + if (!value.is_value()) { + return Status::Invalid("Tried executing function with non-value type: ", + value.ToString()); + } + } + return Status::OK(); } -Result<Datum> Function::Execute(const ExecBatch& batch, const FunctionOptions* options, - ExecContext* ctx) const { - return ExecuteInternal(batch.values, batch.length, options, ctx); +Status CheckOptions(const Function& function, const FunctionOptions* options) { + if (options == nullptr && function.doc().options_required) { + return Status::Invalid("Function '", function.name(), + "' cannot be called without options"); + } + return Status::OK(); } -Result<Datum> Function::ExecuteInternal(const std::vector<Datum>& args, - int64_t passed_length, - const FunctionOptions* options, - ExecContext* ctx) const { +Result<Datum> ExecuteInternal(const Function& func, std::vector<Datum> args, Review Comment: We should investigate later, especially for the overly common cases where execution has 1, 2, or 3 arguments. ########## cpp/src/arrow/compute/cast.cc: ########## @@ -69,9 +69,9 @@ void EnsureInitCastTable() { std::call_once(cast_table_initialized, InitCastTabl // Private version of GetCastFunction with better error reporting // if the input type is known. Result<std::shared_ptr<CastFunction>> GetCastFunctionInternal( - const std::shared_ptr<DataType>& to_type, const DataType* from_type = nullptr) { + const TypeHolder& to_type, const DataType* from_type = nullptr) { Review Comment: done ########## cpp/src/arrow/compute/exec.cc: ########## @@ -995,14 +996,24 @@ class ScalarExecutor : public KernelExecutorImpl<ScalarKernel> { ExecSpanIterator span_iterator_; }; +Status CheckCanExecuteChunked(const VectorKernel* kernel) { Review Comment: done ########## cpp/src/arrow/scalar.h: ########## @@ -479,37 +480,52 @@ struct ARROW_EXPORT StructScalar : public Scalar { Result<std::shared_ptr<Scalar>> field(FieldRef ref) const; - StructScalar(ValueType value, std::shared_ptr<DataType> type) - : Scalar(std::move(type), true), value(std::move(value)) {} + StructScalar(ValueType value, std::shared_ptr<DataType> type, bool is_valid = true) + : Scalar(std::move(type), is_valid), value(std::move(value)) {} static Result<std::shared_ptr<StructScalar>> Make(ValueType value, std::vector<std::string> field_names); - - explicit StructScalar(std::shared_ptr<DataType> type) : Scalar(std::move(type)) {} }; struct ARROW_EXPORT UnionScalar : public Scalar { - using Scalar::Scalar; - using ValueType = std::shared_ptr<Scalar>; - - ValueType value; int8_t type_code; - UnionScalar(int8_t type_code, std::shared_ptr<DataType> type) - : Scalar(std::move(type), false), type_code(type_code) {} - - UnionScalar(ValueType value, int8_t type_code, std::shared_ptr<DataType> type) - : Scalar(std::move(type), true), value(std::move(value)), type_code(type_code) {} + protected: + UnionScalar(std::shared_ptr<DataType> type, int8_t type_code, bool is_valid) + : Scalar(std::move(type), is_valid), type_code(type_code) {} }; struct ARROW_EXPORT SparseUnionScalar : public UnionScalar { - using UnionScalar::UnionScalar; using TypeClass = SparseUnionType; + + // Even though only one of the union values is relevant for this scalar, we + // nonetheless construct a vector of scalars, one per union value, to have + // enough data to reconstruct a valid ArraySpan of length 1 from this scalar + using ValueType = std::vector<std::shared_ptr<Scalar>>; + ValueType value; + + // The value index corresponding to the active type code + int child_id; + + SparseUnionScalar(ValueType value, int8_t type_code, std::shared_ptr<DataType> type); + + /// \brief Construct a SparseUnionScalar from a single value, versus having + /// to construct a vector of scalars + static std::shared_ptr<Scalar> FromValue(std::shared_ptr<Scalar> value, int field_index, + std::shared_ptr<DataType> type); }; struct ARROW_EXPORT DenseUnionScalar : public UnionScalar { - using UnionScalar::UnionScalar; using TypeClass = DenseUnionType; + + // For DenseUnionScalar, we can make a valid ArraySpan of length 1 from this + // scalar + using ValueType = std::shared_ptr<Scalar>; + ValueType value; Review Comment: done ########## cpp/src/arrow/scalar.h: ########## @@ -479,37 +480,52 @@ struct ARROW_EXPORT StructScalar : public Scalar { Result<std::shared_ptr<Scalar>> field(FieldRef ref) const; - StructScalar(ValueType value, std::shared_ptr<DataType> type) - : Scalar(std::move(type), true), value(std::move(value)) {} + StructScalar(ValueType value, std::shared_ptr<DataType> type, bool is_valid = true) + : Scalar(std::move(type), is_valid), value(std::move(value)) {} static Result<std::shared_ptr<StructScalar>> Make(ValueType value, std::vector<std::string> field_names); - - explicit StructScalar(std::shared_ptr<DataType> type) : Scalar(std::move(type)) {} }; struct ARROW_EXPORT UnionScalar : public Scalar { - using Scalar::Scalar; - using ValueType = std::shared_ptr<Scalar>; - - ValueType value; int8_t type_code; - UnionScalar(int8_t type_code, std::shared_ptr<DataType> type) - : Scalar(std::move(type), false), type_code(type_code) {} - - UnionScalar(ValueType value, int8_t type_code, std::shared_ptr<DataType> type) - : Scalar(std::move(type), true), value(std::move(value)), type_code(type_code) {} + protected: + UnionScalar(std::shared_ptr<DataType> type, int8_t type_code, bool is_valid) + : Scalar(std::move(type), is_valid), type_code(type_code) {} }; struct ARROW_EXPORT SparseUnionScalar : public UnionScalar { - using UnionScalar::UnionScalar; using TypeClass = SparseUnionType; + + // Even though only one of the union values is relevant for this scalar, we + // nonetheless construct a vector of scalars, one per union value, to have + // enough data to reconstruct a valid ArraySpan of length 1 from this scalar + using ValueType = std::vector<std::shared_ptr<Scalar>>; + ValueType value; + + // The value index corresponding to the active type code + int child_id; Review Comment: yes, and adding it as virtual on UnionScalar ########## cpp/src/arrow/compute/kernel.h: ########## @@ -227,21 +208,16 @@ class ARROW_EXPORT InputType { /// \brief Render a human-readable string representation. std::string ToString() const; - /// \brief Return true if the value matches this argument kind in type - /// and shape. + /// \brief Return true if the Datum matches this argument kind in + /// type (and only allows scalar or array-like Datums). bool Matches(const Datum& value) const; - /// \brief Return true if the value descriptor matches this argument kind in - /// type and shape. - bool Matches(const ValueDescr& value) const; + /// \brief Return true if the type matches this InputType + bool Matches(const TypeHolder& type) const; Review Comment: yes definitely, will fix ########## cpp/src/arrow/compute/exec.cc: ########## @@ -328,8 +321,17 @@ bool ExecBatchIterator::Next(ExecBatch* batch) { // ---------------------------------------------------------------------- // ExecSpanIterator; to eventually replace ExecBatchIterator -Status ExecSpanIterator::Init(const ExecBatch& batch, ValueDescr::Shape output_shape, - int64_t max_chunksize) { +bool CheckIfAllScalar(const ExecBatch& batch) { Review Comment: done ########## cpp/src/arrow/compute/exec.cc: ########## @@ -784,30 +811,28 @@ class ScalarExecutor : public KernelExecutorImpl<ScalarKernel> { Datum WrapResults(const std::vector<Datum>& inputs, const std::vector<Datum>& outputs) override { - if (output_descr_.shape == ValueDescr::SCALAR) { - // TODO(wesm): to remove, see ARROW-16757 - DCHECK_EQ(outputs.size(), 1); - // Return as SCALAR - return outputs[0]; + // If execution yielded multiple chunks (because large arrays were split + // based on the ExecContext parameters, then the result is a ChunkedArray + if (HaveChunkedArray(inputs) || outputs.size() > 1) { + return ToChunkedArray(outputs, output_type_); } else { - // If execution yielded multiple chunks (because large arrays were split - // based on the ExecContext parameters, then the result is a ChunkedArray - if (HaveChunkedArray(inputs) || outputs.size() > 1) { - return ToChunkedArray(outputs, output_descr_.type); - } else if (outputs.size() == 1) { - // Outputs have just one element - return outputs[0]; - } else { - // XXX: In the case where no outputs are omitted, is returning a 0-length - // array always the correct move? - return MakeArrayOfNull(output_descr_.type, /*length=*/0, - exec_context()->memory_pool()) - .ValueOrDie(); - } + // Outputs have just one element + return outputs[0]; } } protected: + Status EmitResult(std::shared_ptr<ArrayData> out, ExecListener* listener) { Review Comment: It's move-d elsewhere, though I'm guessing that an `std::move` operation is never going to show up in any profile output ########## cpp/src/arrow/compute/function.cc: ########## @@ -186,87 +170,106 @@ const Kernel* DispatchExactImpl(const Function* func, } // namespace detail Result<const Kernel*> Function::DispatchExact( - const std::vector<ValueDescr>& values) const { + const std::vector<TypeHolder>& values) const { if (kind_ == Function::META) { return Status::NotImplemented("Dispatch for a MetaFunction's Kernels"); } - RETURN_NOT_OK(CheckArity(values)); + RETURN_NOT_OK(CheckArity(values.size())); if (auto kernel = detail::DispatchExactImpl(this, values)) { return kernel; } return detail::NoMatchingKernel(this, values); } -Result<const Kernel*> Function::DispatchBest(std::vector<ValueDescr>* values) const { +Result<const Kernel*> Function::DispatchBest(std::vector<TypeHolder>* values) const { // TODO(ARROW-11508) permit generic conversions here return DispatchExact(*values); } -Result<Datum> Function::Execute(const std::vector<Datum>& args, - const FunctionOptions* options, ExecContext* ctx) const { - return ExecuteInternal(args, /*passed_length=*/-1, options, ctx); +namespace { + +/// \brief Check that each Datum is of a "value" type, which means either +/// SCALAR, ARRAY, or CHUNKED_ARRAY. Review Comment: This comment predates this PR. I'll change it though ########## cpp/src/arrow/compute/function.cc: ########## @@ -275,9 +278,9 @@ Result<Datum> Function::ExecuteInternal(const std::vector<Datum>& args, bool all_same_length = false; int64_t inferred_length = detail::InferBatchLength(input.values, &all_same_length); input.length = inferred_length; - if (kind() == Function::SCALAR) { + if (func.kind() == Function::SCALAR) { DCHECK(passed_length == -1 || passed_length == inferred_length); Review Comment: done -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org