bkietz commented on a change in pull request #8088: URL: https://github.com/apache/arrow/pull/8088#discussion_r483815010
########## File path: cpp/src/arrow/util/converter.h ########## @@ -0,0 +1,281 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <string> +#include <utility> +#include <vector> + +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/chunked_array.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/checked_cast.h" + +#include "arrow/visitor_inline.h" + +namespace arrow { + +using internal::checked_cast; +using internal::checked_pointer_cast; + +template <typename Input, typename Options> +class ArrayConverter { + public: + using InputType = Input; + using OptionsType = Options; + + ArrayConverter(const std::shared_ptr<DataType>& type, + std::shared_ptr<ArrayBuilder> builder, Options options) + : sp_type_(type), sp_builder_(builder), options_(options) {} + + virtual ~ArrayConverter() = default; + const std::shared_ptr<ArrayBuilder>& builder() const { return sp_builder_; } + const std::shared_ptr<DataType>& type() const { return sp_type_; } + Options options() const { return options_; } + + virtual Status Init() { return Status::OK(); } + virtual Status Reserve(int64_t additional_capacity) = 0; + virtual Status Append(InputType value) = 0; + virtual Status AppendNull() = 0; + virtual Status Extend(Input seq, int64_t size) = 0; + virtual Result<std::shared_ptr<Array>> Finish() = 0; + + protected: + const std::shared_ptr<DataType> sp_type_; + std::shared_ptr<ArrayBuilder> sp_builder_; + Options options_; +}; + +template <typename T, typename BaseConverter, + typename BuilderType = typename TypeTraits<T>::BuilderType> +class TypedArrayConverter : public BaseConverter { + public: + TypedArrayConverter(const std::shared_ptr<DataType>& type, + std::shared_ptr<ArrayBuilder> builder, + typename BaseConverter::OptionsType options) + : BaseConverter(type, builder, options), + type_(checked_cast<const T&>(*type)), + builder_(checked_cast<BuilderType*>(builder.get())) {} + + Status Reserve(int64_t additional_capacity) override { + return this->builder_->Reserve(additional_capacity); + } + + Status AppendNull() override { return this->builder_->AppendNull(); } + + Result<std::shared_ptr<Array>> Finish() override { return builder_->Finish(); }; + + protected: + const T& type_; + BuilderType* builder_; +}; + +template <typename T, typename BaseConverter> +class PrimitiveArrayConverter : public TypedArrayConverter<T, BaseConverter> { + public: + using TypedArrayConverter<T, BaseConverter>::TypedArrayConverter; +}; + +template <typename T, typename BaseConverter> +class DictionaryArrayConverter + : public TypedArrayConverter<DictionaryType, BaseConverter, DictionaryBuilder<T>> { + public: + DictionaryArrayConverter(const std::shared_ptr<DataType>& type, + std::shared_ptr<ArrayBuilder> builder, + typename BaseConverter::OptionsType options) + : TypedArrayConverter<DictionaryType, BaseConverter, DictionaryBuilder<T>>( + type, builder, options), + value_type_(checked_cast<const T&>( + *checked_cast<const DictionaryType&>(*type).value_type())) {} + + protected: + const T& value_type_; +}; + +template <typename T, typename BaseConverter> +class ListArrayConverter : public TypedArrayConverter<T, BaseConverter> { + public: + ListArrayConverter(const std::shared_ptr<DataType>& type, + std::shared_ptr<ArrayBuilder> builder, + std::shared_ptr<BaseConverter> value_converter, + typename BaseConverter::OptionsType options) + : TypedArrayConverter<T, BaseConverter>(type, builder, options), + value_converter_(std::move(value_converter)) {} + + protected: + std::shared_ptr<BaseConverter> value_converter_; +}; + +template <typename T, typename BaseConverter> +class StructArrayConverter : public TypedArrayConverter<T, BaseConverter> { + public: + StructArrayConverter(const std::shared_ptr<DataType>& type, + std::shared_ptr<ArrayBuilder> builder, + std::vector<std::shared_ptr<BaseConverter>> child_converters, + typename BaseConverter::OptionsType options) + : TypedArrayConverter<T, BaseConverter>(type, builder, options), + child_converters_(std::move(child_converters)) {} + + protected: + std::vector<std::shared_ptr<BaseConverter>> child_converters_; +}; + +#define DICTIONARY_CASE(TYPE_ENUM, TYPE_CLASS) \ + case Type::TYPE_ENUM: \ + out->reset(new DictionaryConverter<TYPE_CLASS>(type, std::move(builder), options)); \ + break; + +template <typename Options, typename BaseConverter, + template <typename...> class PrimitiveConverter, + template <typename...> class DictionaryConverter, + template <typename...> class ListConverter, + template <typename...> class StructConverter> Review comment: Instead of requiring concrete converters be class templates like this, please let generic construction of converters wait for a follow up. Construct ArrayBuilders using `MakeBuilder` and move any other converter construction code from ArrayConverterBuilder into `python_to_arrow.cc`. ########## File path: cpp/src/arrow/util/converter.h ########## @@ -0,0 +1,281 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <string> +#include <utility> +#include <vector> + +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/chunked_array.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/checked_cast.h" + +#include "arrow/visitor_inline.h" + +namespace arrow { + +using internal::checked_cast; +using internal::checked_pointer_cast; + +template <typename Input, typename Options> +class ArrayConverter { + public: + using InputType = Input; + using OptionsType = Options; + + ArrayConverter(const std::shared_ptr<DataType>& type, + std::shared_ptr<ArrayBuilder> builder, Options options) + : sp_type_(type), sp_builder_(builder), options_(options) {} + + virtual ~ArrayConverter() = default; + const std::shared_ptr<ArrayBuilder>& builder() const { return sp_builder_; } + const std::shared_ptr<DataType>& type() const { return sp_type_; } + Options options() const { return options_; } + + virtual Status Init() { return Status::OK(); } + virtual Status Reserve(int64_t additional_capacity) = 0; + virtual Status Append(InputType value) = 0; + virtual Status AppendNull() = 0; + virtual Status Extend(Input seq, int64_t size) = 0; Review comment: I don't think it makes sense to make `Extend()` a method of this interface: I wouldn't expect that all converted `InputType`s are unboxable as sequences or scalars in the way that `PyObject*` happens to be. Even in the specific case of `PyObject*` which has this property, Extend() needs to be supplemented with ExtendMasked() if a mask is present in addition to the sequence. I think Extend() should be moved to PyArrayConverter for now ########## File path: cpp/src/arrow/util/hashing.h ########## @@ -851,6 +851,11 @@ struct HashTraits<T, enable_if_t<has_string_view<T>::value && using MemoTableType = BinaryMemoTable<BinaryBuilder>; }; +template <> +struct HashTraits<Decimal128Type> { + using MemoTableType = BinaryMemoTable<BinaryBuilder>; Review comment: `Decimal128` is pretty small, I think this would work: ```suggestion using MemoTableType = ScalarMemoTable<Decimal128, HashTable>; ``` ########## File path: cpp/src/arrow/util/converter.h ########## @@ -0,0 +1,281 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <string> +#include <utility> +#include <vector> + +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/chunked_array.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/checked_cast.h" + +#include "arrow/visitor_inline.h" + +namespace arrow { + +using internal::checked_cast; +using internal::checked_pointer_cast; + +template <typename Input, typename Options> +class ArrayConverter { + public: + using InputType = Input; + using OptionsType = Options; + + ArrayConverter(const std::shared_ptr<DataType>& type, + std::shared_ptr<ArrayBuilder> builder, Options options) + : sp_type_(type), sp_builder_(builder), options_(options) {} + + virtual ~ArrayConverter() = default; + const std::shared_ptr<ArrayBuilder>& builder() const { return sp_builder_; } + const std::shared_ptr<DataType>& type() const { return sp_type_; } + Options options() const { return options_; } + + virtual Status Init() { return Status::OK(); } + virtual Status Reserve(int64_t additional_capacity) = 0; + virtual Status Append(InputType value) = 0; + virtual Status AppendNull() = 0; + virtual Status Extend(Input seq, int64_t size) = 0; + virtual Result<std::shared_ptr<Array>> Finish() = 0; + + protected: + const std::shared_ptr<DataType> sp_type_; + std::shared_ptr<ArrayBuilder> sp_builder_; + Options options_; +}; + +template <typename T, typename BaseConverter, + typename BuilderType = typename TypeTraits<T>::BuilderType> +class TypedArrayConverter : public BaseConverter { Review comment: I don't think the TypedArrayConverter and its subclasses need to be public. This framework would be more understandable if (for example) instead of ```c++ template <typename T> class PyStructArrayConverter : public StructArrayConverter<T, PyArrayConverter> { ``` we had ```c++ class PyStructArrayConverter : public PyArrayConverter ``` It seems the `TypedArrayConverter`s and base classes are mainly defined to enforce regular construction of the various converter types and to allow child builders to be wrapped with the corresponding child converters. Instead of introducing ArrayConverterBuilder, I think it'd be better to reuse `MakeBuilder` for consistency then visit the resulting builder to hydrate the converter hierarchy. For example, we could make `sp_builder_` an argument to `Init()`. In any case, IMHO the number and complexity of base classes should be kept to a minimum to simplify the implementation process; for example it's surprising that PyStructArrayConverter must be a template which is only instantiated with `argument=StructType` ########## File path: cpp/src/arrow/python/python_test.cc ########## @@ -422,17 +421,15 @@ TEST_F(DecimalTest, TestNoneAndNaN) { ASSERT_EQ(0, PyList_SetItem(list, 2, missing_value2)); ASSERT_EQ(0, PyList_SetItem(list, 3, missing_value3)); - std::shared_ptr<ChunkedArray> arr, arr_from_pandas; PyConversionOptions options; - ASSERT_RAISES(TypeError, ConvertPySequence(list, options, &arr)); + ASSERT_RAISES(TypeError, ConvertPySequence(list, nullptr, options)); options.from_pandas = true; - ASSERT_OK(ConvertPySequence(list, options, &arr_from_pandas)); - auto c0 = arr_from_pandas->chunk(0); - ASSERT_TRUE(c0->IsValid(0)); - ASSERT_TRUE(c0->IsNull(1)); - ASSERT_TRUE(c0->IsNull(2)); - ASSERT_TRUE(c0->IsNull(3)); + auto arr = ConvertPySequence(list, nullptr, options).ValueOrDie(); Review comment: ```suggestion ASSERT_OK_AND_ASSIGN(auto arr, ConvertPySequence(list, nullptr, options)); ``` ########## File path: cpp/src/arrow/python/python_to_arrow.cc ########## @@ -1347,64 +841,49 @@ Status ConvertToSequenceAndInferSize(PyObject* obj, PyObject** seq, int64_t* siz return Status::OK(); } -Status ConvertPySequence(PyObject* sequence_source, PyObject* mask, - const PyConversionOptions& options, - std::shared_ptr<ChunkedArray>* out) { +Result<std::shared_ptr<Array>> ConvertPySequence(PyObject* obj, PyObject* mask, + const PyConversionOptions& opts) { PyAcquireGIL lock; PyObject* seq; OwnedRef tmp_seq_nanny; + PyConversionOptions options = opts; // copy options struct since we modify it below std::shared_ptr<DataType> real_type; int64_t size = options.size; - RETURN_NOT_OK(ConvertToSequenceAndInferSize(sequence_source, &seq, &size)); + RETURN_NOT_OK(ConvertToSequenceAndInferSize(obj, &seq, &size)); tmp_seq_nanny.reset(seq); // In some cases, type inference may be "loose", like strings. If the user // passed pa.string(), then we will error if we encounter any non-UTF8 // value. If not, then we will allow the result to be a BinaryArray - bool strict_conversions = false; + auto copied_options = options; + options.strict = false; if (options.type == nullptr) { RETURN_NOT_OK(InferArrowType(seq, mask, options.from_pandas, &real_type)); - if (options.ignore_timezone && real_type->id() == Type::TIMESTAMP) { - const auto& ts_type = checked_cast<const TimestampType&>(*real_type); - real_type = timestamp(ts_type.unit()); - } + // TODO(kszucs): remove this + // if (options.ignore_timezone && real_type->id() == Type::TIMESTAMP) { + // const auto& ts_type = checked_cast<const TimestampType&>(*real_type); + // real_type = timestamp(ts_type.unit()); + // } Review comment: Is this tested? ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [email protected]
