[ https://issues.apache.org/jira/browse/ARROW-2073?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16356878#comment-16356878 ]
ASF GitHub Bot commented on ARROW-2073: --------------------------------------- xhochy closed pull request #1572: ARROW-2073: [Python] Create struct array from sequence of tuples URL: https://github.com/apache/arrow/pull/1572 This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index 1e431c29f..f0e5449b6 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -771,18 +771,21 @@ class StructConverter : public TypedConverterVisitor<StructBuilder, StructConver // Append a non-missing item Status AppendItem(PyObject* obj) { RETURN_NOT_OK(typed_builder_->Append()); - if (!PyDict_Check(obj)) { - return Status::TypeError("dict value expected for struct type"); + // Note heterogenous sequences are not allowed + if (ARROW_PREDICT_FALSE(source_kind_ == UNKNOWN)) { + if (PyDict_Check(obj)) { + source_kind_ = DICTS; + } else if (PyTuple_Check(obj)) { + source_kind_ = TUPLES; + } } - // NOTE we're ignoring any extraneous dict items - for (int i = 0; i < num_fields_; i++) { - PyObject* nameobj = PyList_GET_ITEM(field_name_list_.obj(), i); - PyObject* valueobj = PyDict_GetItem(obj, nameobj); // borrowed - RETURN_IF_PYERROR(); - RETURN_NOT_OK(value_converters_[i]->AppendSingle(valueobj ? 
valueobj : Py_None)); + if (PyDict_Check(obj) && source_kind_ == DICTS) { + return AppendDictItem(obj); + } else if (PyTuple_Check(obj) && source_kind_ == TUPLES) { + return AppendTupleItem(obj); + } else { + return Status::TypeError("Expected sequence of dicts or tuples for struct type"); } - - return Status::OK(); } // Append a missing item @@ -797,9 +800,33 @@ class StructConverter : public TypedConverterVisitor<StructBuilder, StructConver } protected: + Status AppendDictItem(PyObject* obj) { + // NOTE we're ignoring any extraneous dict items + for (int i = 0; i < num_fields_; i++) { + PyObject* nameobj = PyList_GET_ITEM(field_name_list_.obj(), i); + PyObject* valueobj = PyDict_GetItem(obj, nameobj); // borrowed + RETURN_IF_PYERROR(); + RETURN_NOT_OK(value_converters_[i]->AppendSingle(valueobj ? valueobj : Py_None)); + } + return Status::OK(); + } + + Status AppendTupleItem(PyObject* obj) { + if (PyTuple_GET_SIZE(obj) != num_fields_) { + return Status::Invalid("Tuple size must be equal to number of struct fields"); + } + for (int i = 0; i < num_fields_; i++) { + PyObject* valueobj = PyTuple_GET_ITEM(obj, i); + RETURN_NOT_OK(value_converters_[i]->AppendSingle(valueobj)); + } + return Status::OK(); + } + std::vector<std::unique_ptr<SeqConverter>> value_converters_; OwnedRef field_name_list_; int num_fields_; + // Whether we're converting from a sequence of dicts or tuples + enum { UNKNOWN, DICTS, TUPLES } source_kind_ = UNKNOWN; }; class DecimalConverter diff --git a/python/benchmarks/convert_builtins.py b/python/benchmarks/convert_builtins.py index 92b2b850f..a4dc9f262 100644 --- a/python/benchmarks/convert_builtins.py +++ b/python/benchmarks/convert_builtins.py @@ -144,11 +144,21 @@ def generate_int_list_list(self, n, min_size, max_size, partial(self.generate_int_list, none_prob=none_prob), n, min_size, max_size, none_prob) + def generate_tuple_list(self, n, none_prob=DEFAULT_NONE_PROB): + """ + Generate a list of tuples with random values. 
+ Each tuple has the form `(int value, float value, bool value)` + """ + dicts = self.generate_dict_list(n, none_prob=none_prob) + tuples = [(d.get('u'), d.get('v'), d.get('w')) + if d is not None else None + for d in dicts] + assert len(tuples) == n + return tuples def generate_dict_list(self, n, none_prob=DEFAULT_NONE_PROB): """ - Generate a list of dicts with a random size between *min_size* and - *max_size*. + Generate a list of dicts with random values. Each dict has the form `{'u': int value, 'v': float value, 'w': bool value}` """ ints = self.generate_int_list(n, none_prob=none_prob) @@ -179,12 +189,14 @@ def get_type_and_builtins(self, n, type_name): """ size = None - if type_name in ('bool', 'ascii', 'unicode', 'int64 list', 'struct'): + if type_name in ('bool', 'ascii', 'unicode', 'int64 list'): kind = type_name elif type_name.startswith(('int', 'uint')): kind = 'int' elif type_name.startswith('float'): kind = 'float' + elif type_name.startswith('struct'): + kind = 'struct' elif type_name == 'binary': kind = 'varying binary' elif type_name.startswith('binary'): @@ -226,6 +238,7 @@ def get_type_and_builtins(self, n, type_name): 'int64 list': partial(self.generate_int_list_list, min_size=0, max_size=20), 'struct': self.generate_dict_list, + 'struct from tuples': self.generate_tuple_list, } data = factories[kind](n) return ty, data @@ -239,7 +252,7 @@ class ConvertPyListToArray(object): types = ('int32', 'uint32', 'int64', 'uint64', 'float32', 'float64', 'bool', 'binary', 'binary10', 'ascii', 'unicode', - 'int64 list', 'struct') + 'int64 list', 'struct', 'struct from tuples') param_names = ['type'] params = [types] diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index ce54f23eb..5cd4a52a2 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -531,6 +531,45 @@ def test_struct_from_dicts(): assert arr.to_pylist() == expected +def 
test_struct_from_tuples(): + ty = pa.struct([pa.field('a', pa.int32()), + pa.field('b', pa.string()), + pa.field('c', pa.bool_())]) + + data = [(5, 'foo', True), + (6, 'bar', False)] + expected = [{'a': 5, 'b': 'foo', 'c': True}, + {'a': 6, 'b': 'bar', 'c': False}] + arr = pa.array(data, type=ty) + assert arr.to_pylist() == expected + + # With omitted values + data = [(5, 'foo', None), + None, + (6, None, False)] + expected = [{'a': 5, 'b': 'foo', 'c': None}, + None, + {'a': 6, 'b': None, 'c': False}] + arr = pa.array(data, type=ty) + assert arr.to_pylist() == expected + + # Invalid tuple size + for tup in [(5, 'foo'), (), ('5', 'foo', True, None)]: + with pytest.raises(ValueError, match="(?i)tuple size"): + pa.array([tup], type=ty) + + +def test_struct_from_mixed_sequence(): + # It is forbidden to mix dicts and tuples when initializing a struct array + ty = pa.struct([pa.field('a', pa.int32()), + pa.field('b', pa.string()), + pa.field('c', pa.bool_())]) + data = [(5, 'foo', True), + {'a': 6, 'b': 'bar', 'c': False}] + with pytest.raises(TypeError): + pa.array(data, type=ty) + + def test_structarray_from_arrays_coerce(): # ARROW-1706 ints = [None, 2, 3] ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

> [Python] Create StructArray from sequence of tuples given a known data type
> ---------------------------------------------------------------------------
>
> Key: ARROW-2073
> URL: https://issues.apache.org/jira/browse/ARROW-2073
> Project: Apache Arrow
> Issue Type: Improvement
> Components: Python
> Reporter: Antoine Pitrou
> Assignee: Antoine Pitrou
> Priority: Major
> Labels: pull-request-available
>
> Following ARROW-1705, we should support calling {{pa.array}} with a sequence
> of tuples, presuming a struct type is passed for the {{type}} parameter.
> We also probably want to disallow mixed inputs, e.g. a sequence of both dicts
> and tuples. The user should use only one idiom at a time.

--
This message was sent by Atlassian JIRA (v7.6.3#76005)