[jira] [Commented] (ARROW-2135) [Python] NaN values silently casted to int64 when passing explicit schema for conversion in Table.from_pandas

ASF GitHub Bot (JIRA) Mon, 12 Mar 2018 12:04:38 -0700

    [ 
https://issues.apache.org/jira/browse/ARROW-2135?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16395731#comment-16395731
 ]


ASF GitHub Bot commented on ARROW-2135:
---------------------------------------

wesm closed pull request #1681: ARROW-2135: [Python] Fix NaN conversion when 
casting from Numpy array
URL: https://github.com/apache/arrow/pull/1681
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/cpp/src/arrow/python/numpy-internal.h 
b/cpp/src/arrow/python/numpy-internal.h
index 8d4308065..7672861d4 100644
--- a/cpp/src/arrow/python/numpy-internal.h
+++ b/cpp/src/arrow/python/numpy-internal.h
@@ -68,6 +68,9 @@ class Ndarray1DIndexer {
   int64_t stride_;
 };
 
+// Handling of Numpy Types by their static numbers
+// (the NPY_TYPES enum and related defines)
+
 static inline std::string GetNumPyTypeName(int npy_type) {
 #define TYPE_CASE(TYPE, NAME) \
   case NPY_##TYPE:            \
@@ -79,14 +82,20 @@ static inline std::string GetNumPyTypeName(int npy_type) {
     TYPE_CASE(INT16, "int16")
     TYPE_CASE(INT32, "int32")
     TYPE_CASE(INT64, "int64")
-#if (NPY_INT64 != NPY_LONGLONG)
+#if !NPY_INT32_IS_INT
+    TYPE_CASE(INT, "intc")
+#endif
+#if !NPY_INT64_IS_LONG_LONG
     TYPE_CASE(LONGLONG, "longlong")
 #endif
     TYPE_CASE(UINT8, "uint8")
     TYPE_CASE(UINT16, "uint16")
     TYPE_CASE(UINT32, "uint32")
     TYPE_CASE(UINT64, "uint64")
-#if (NPY_UINT64 != NPY_ULONGLONG)
+#if !NPY_INT32_IS_INT
+    TYPE_CASE(UINT, "uintc")
+#endif
+#if !NPY_INT64_IS_LONG_LONG
     TYPE_CASE(ULONGLONG, "ulonglong")
 #endif
     TYPE_CASE(FLOAT16, "float16")
@@ -100,9 +109,48 @@ static inline std::string GetNumPyTypeName(int npy_type) {
   }
 
 #undef TYPE_CASE
-  return "unrecognized type in GetNumPyTypeName";
+  std::stringstream ss;
+  ss << "unrecognized type (" << npy_type << ") in GetNumPyTypeName";
+  return ss.str();
 }
 
+#define TYPE_VISIT_INLINE(TYPE) \
+  case NPY_##TYPE:              \
+    return visitor->template Visit<NPY_##TYPE>(arr);
+
+template <typename VISITOR>
+inline Status VisitNumpyArrayInline(PyArrayObject* arr, VISITOR* visitor) {
+  switch (PyArray_TYPE(arr)) {
+    TYPE_VISIT_INLINE(BOOL);
+    TYPE_VISIT_INLINE(INT8);
+    TYPE_VISIT_INLINE(UINT8);
+    TYPE_VISIT_INLINE(INT16);
+    TYPE_VISIT_INLINE(UINT16);
+    TYPE_VISIT_INLINE(INT32);
+    TYPE_VISIT_INLINE(UINT32);
+    TYPE_VISIT_INLINE(INT64);
+    TYPE_VISIT_INLINE(UINT64);
+#if !NPY_INT32_IS_INT
+    TYPE_VISIT_INLINE(INT);
+    TYPE_VISIT_INLINE(UINT);
+#endif
+#if !NPY_INT64_IS_LONG_LONG
+    TYPE_VISIT_INLINE(LONGLONG);
+    TYPE_VISIT_INLINE(ULONGLONG);
+#endif
+    TYPE_VISIT_INLINE(FLOAT16);
+    TYPE_VISIT_INLINE(FLOAT32);
+    TYPE_VISIT_INLINE(FLOAT64);
+    TYPE_VISIT_INLINE(DATETIME);
+    TYPE_VISIT_INLINE(OBJECT);
+  }
+  std::stringstream ss;
+  ss << "NumPy type not implemented: " << GetNumPyTypeName(PyArray_TYPE(arr));
+  return Status::NotImplemented(ss.str());
+}
+
+#undef TYPE_VISIT_INLINE
+
 }  // namespace py
 }  // namespace arrow
 
diff --git a/cpp/src/arrow/python/numpy_interop.h 
b/cpp/src/arrow/python/numpy_interop.h
index 8c569e232..0715c66c5 100644
--- a/cpp/src/arrow/python/numpy_interop.h
+++ b/cpp/src/arrow/python/numpy_interop.h
@@ -43,6 +43,31 @@
 #include <numpy/arrayscalars.h>
 #include <numpy/ufuncobject.h>
 
+// A bit subtle. Numpy has 5 canonical integer types:
+// (or, rather, type pairs: signed and unsigned)
+//   NPY_BYTE, NPY_SHORT, NPY_INT, NPY_LONG, NPY_LONGLONG
+// It also has 4 fixed-width integer aliases.
+// When mapping Arrow integer types to these 4 fixed-width aliases,
+// we always miss one of the canonical types (even though it may
+// have the same width as one of the aliases).
+// Which one depends on the platform...
+// On a LP64 system, NPY_INT64 maps to NPY_LONG and
+// NPY_LONGLONG needs to be handled separately.
+// On a LLP64 system, NPY_INT32 maps to NPY_LONG and
+// NPY_INT needs to be handled separately.
+
+#if NPY_BITSOF_LONG == 32 && NPY_BITSOF_LONGLONG == 64
+#define NPY_INT64_IS_LONG_LONG 1
+#else
+#define NPY_INT64_IS_LONG_LONG 0
+#endif
+
+#if NPY_BITSOF_INT == 32 && NPY_BITSOF_LONG == 64
+#define NPY_INT32_IS_INT 1
+#else
+#define NPY_INT32_IS_INT 0
+#endif
+
 namespace arrow {
 namespace py {
 
diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc 
b/cpp/src/arrow/python/numpy_to_arrow.cc
index 04a71c1f6..6ddc4a7be 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -84,6 +84,38 @@ inline bool PyObject_is_integer(PyObject* obj) {
   return !PyBool_Check(obj) && PyArray_IsIntegerScalar(obj);
 }
 
+Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) {
+  if (PyArray_NDIM(numpy_array) != 1) {
+    return Status::Invalid("only handle 1-dimensional arrays");
+  }
+
+  const int received_type = PyArray_DESCR(numpy_array)->type_num;
+  if (received_type != np_type) {
+    std::stringstream ss;
+    ss << "trying to convert NumPy type " << GetNumPyTypeName(np_type) << " 
but got "
+       << GetNumPyTypeName(received_type);
+    return Status::Invalid(ss.str());
+  }
+
+  return Status::OK();
+}
+
+Status AllocateNullBitmap(MemoryPool* pool, int64_t length,
+                          std::shared_ptr<ResizableBuffer>* out) {
+  int64_t null_bytes = BitUtil::BytesForBits(length);
+  std::shared_ptr<ResizableBuffer> null_bitmap;
+
+  null_bitmap = std::make_shared<PoolBuffer>(pool);
+  RETURN_NOT_OK(null_bitmap->Resize(null_bytes));
+
+  memset(null_bitmap->mutable_data(), 0, static_cast<size_t>(null_bytes));
+  *out = null_bitmap;
+  return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// Conversion from NumPy-in-Pandas to Arrow null bitmap
+
 template <int TYPE>
 inline int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) {
   typedef internal::npy_traits<TYPE> traits;
@@ -103,6 +135,55 @@ inline int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* 
bitmap) {
   return null_count;
 }
 
+class NumPyNullsConverter {
+ public:
+  /// Convert the given array's null values to a null bitmap.
+  /// The null bitmap is only allocated if null values are ever possible.
+  static Status Convert(MemoryPool* pool, PyArrayObject* arr,
+                        bool use_pandas_null_sentinels,
+                        std::shared_ptr<ResizableBuffer>* out_null_bitmap_,
+                        int64_t* out_null_count) {
+    NumPyNullsConverter converter(pool, arr, use_pandas_null_sentinels);
+    RETURN_NOT_OK(VisitNumpyArrayInline(arr, &converter));
+    *out_null_bitmap_ = converter.null_bitmap_;
+    *out_null_count = converter.null_count_;
+    return Status::OK();
+  }
+
+  template <int TYPE>
+  Status Visit(PyArrayObject* arr) {
+    typedef internal::npy_traits<TYPE> traits;
+
+    const bool null_sentinels_possible =
+        // Always treat Numpy's NaT as null
+        TYPE == NPY_DATETIME ||
+        // Observing pandas's null sentinels
+        (use_pandas_null_sentinels_ && traits::supports_nulls);
+
+    if (null_sentinels_possible) {
+      RETURN_NOT_OK(AllocateNullBitmap(pool_, PyArray_SIZE(arr), 
&null_bitmap_));
+      null_count_ = ValuesToBitmap<TYPE>(arr, null_bitmap_->mutable_data());
+    }
+    return Status::OK();
+  }
+
+ protected:
+  NumPyNullsConverter(MemoryPool* pool, PyArrayObject* arr,
+                      bool use_pandas_null_sentinels)
+      : pool_(pool),
+        arr_(arr),
+        use_pandas_null_sentinels_(use_pandas_null_sentinels),
+        null_bitmap_data_(nullptr),
+        null_count_(0) {}
+
+  MemoryPool* pool_;
+  PyArrayObject* arr_;
+  bool use_pandas_null_sentinels_;
+  std::shared_ptr<ResizableBuffer> null_bitmap_;
+  uint8_t* null_bitmap_data_;
+  int64_t null_count_;
+};
+
 // Returns null count
 int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) {
   int64_t null_count = 0;
@@ -119,22 +200,6 @@ int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, 
uint8_t* bitmap) {
   return null_count;
 }
 
-Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) {
-  if (PyArray_NDIM(numpy_array) != 1) {
-    return Status::Invalid("only handle 1-dimensional arrays");
-  }
-
-  const int received_type = PyArray_DESCR(numpy_array)->type_num;
-  if (received_type != np_type) {
-    std::stringstream ss;
-    ss << "trying to convert NumPy type " << GetNumPyTypeName(np_type) << " 
but got "
-       << GetNumPyTypeName(received_type);
-    return Status::Invalid(ss.str());
-  }
-
-  return Status::OK();
-}
-
 }  // namespace
 
 /// Append as many string objects from NumPy arrays to a `StringBuilder` as we
@@ -301,7 +366,9 @@ class NumPyConverter {
         dtype_(PyArray_DESCR(arr_)),
         mask_(nullptr),
         use_pandas_null_sentinels_(use_pandas_null_sentinels),
-        decimal_type_() {
+        decimal_type_(),
+        null_bitmap_data_(nullptr),
+        null_count_(0) {
     if (mo != nullptr && mo != Py_None) {
       mask_ = reinterpret_cast<PyArrayObject*>(mo);
     }
@@ -356,14 +423,8 @@ class NumPyConverter {
 
  protected:
   Status InitNullBitmap() {
-    int64_t null_bytes = BitUtil::BytesForBits(length_);
-
-    null_bitmap_ = std::make_shared<PoolBuffer>(pool_);
-    RETURN_NOT_OK(null_bitmap_->Resize(null_bytes));
-
+    RETURN_NOT_OK(AllocateNullBitmap(pool_, length_, &null_bitmap_));
     null_bitmap_data_ = null_bitmap_->mutable_data();
-    memset(null_bitmap_data_, 0, static_cast<size_t>(null_bytes));
-
     return Status::OK();
   }
 
@@ -414,32 +475,18 @@ class NumPyConverter {
 
   template <typename ArrowType>
   Status VisitNative() {
-    using traits = internal::arrow_traits<ArrowType::type_id>;
-
-    const bool null_sentinels_possible =
-        // NumPy has a NaT type
-        (ArrowType::type_id == Type::TIMESTAMP || ArrowType::type_id == 
Type::DATE32) ||
-
-        // Observing pandas's null sentinels
-        ((use_pandas_null_sentinels_ && traits::supports_nulls));
-
-    if (mask_ != nullptr || null_sentinels_possible) {
+    if (mask_ != nullptr) {
       RETURN_NOT_OK(InitNullBitmap());
+      null_count_ = MaskToBitmap(mask_, length_, null_bitmap_data_);
+    } else {
+      RETURN_NOT_OK(NumPyNullsConverter::Convert(pool_, arr_, 
use_pandas_null_sentinels_,
+                                                 &null_bitmap_, &null_count_));
     }
 
     std::shared_ptr<Buffer> data;
     RETURN_NOT_OK(ConvertData<ArrowType>(&data));
 
-    int64_t null_count = 0;
-    if (mask_ != nullptr) {
-      null_count = MaskToBitmap(mask_, length_, null_bitmap_data_);
-    } else if (null_sentinels_possible) {
-      // TODO(wesm): this presumes the NumPy C type and arrow C type are the
-      // same
-      null_count = ValuesToBitmap<traits::npy_type>(arr_, null_bitmap_data_);
-    }
-
-    auto arr_data = ArrayData::Make(type_, length_, {null_bitmap_, data}, 
null_count, 0);
+    auto arr_data = ArrayData::Make(type_, length_, {null_bitmap_, data}, 
null_count_, 0);
     return PushArray(arr_data);
   }
 
@@ -493,6 +540,7 @@ class NumPyConverter {
 
   std::shared_ptr<ResizableBuffer> null_bitmap_;
   uint8_t* null_bitmap_data_;
+  int64_t null_count_;
 };
 
 Status NumPyConverter::Convert() {
@@ -659,12 +707,10 @@ inline Status 
NumPyConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>* d
       Status s = StaticCastBuffer<int64_t, int32_t>(**data, length_, pool_, 
data);
       RETURN_NOT_OK(s);
     } else {
-      // TODO(wesm): This is redundant, and recomputed in VisitNative()
-      const int64_t null_count = ValuesToBitmap<NPY_DATETIME>(arr_, 
null_bitmap_data_);
-
       RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), 
&input_type));
       if (!input_type->Equals(*type_)) {
-        RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, 
null_count,
+        // The null bitmap was already computed in VisitNative()
+        RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, 
null_count_,
                                  type_, pool_, data));
       }
     }
diff --git a/cpp/src/arrow/python/type_traits.h 
b/cpp/src/arrow/python/type_traits.h
index 587b27c9a..ff39aad67 100644
--- a/cpp/src/arrow/python/type_traits.h
+++ b/cpp/src/arrow/python/type_traits.h
@@ -34,6 +34,9 @@ namespace arrow {
 namespace py {
 namespace internal {
 
+//
+// Type traits for Numpy -> Arrow equivalence
+//
 template <int TYPE>
 struct npy_traits {};
 
@@ -68,7 +71,11 @@ NPY_INT_DECL(UINT16, UInt16, uint16_t);
 NPY_INT_DECL(UINT32, UInt32, uint32_t);
 NPY_INT_DECL(UINT64, UInt64, uint64_t);
 
-#if NPY_INT64 != NPY_LONGLONG
+#if !NPY_INT32_IS_INT && NPY_BITSOF_INT == 32
+NPY_INT_DECL(INT, Int32, int32_t);
+NPY_INT_DECL(UINT, UInt32, uint32_t);
+#endif
+#if !NPY_INT64_IS_LONG_LONG && NPY_BITSOF_LONGLONG == 64
 NPY_INT_DECL(LONGLONG, Int64, int64_t);
 NPY_INT_DECL(ULONGLONG, UInt64, uint64_t);
 #endif
@@ -127,8 +134,14 @@ template <>
 struct npy_traits<NPY_OBJECT> {
   typedef PyObject* value_type;
   static constexpr bool supports_nulls = true;
+
+  static inline bool isnull(PyObject* v) { return v == Py_None; }
 };
 
+//
+// Type traits for Arrow -> Numpy equivalence
+// Note *supports_nulls* means the equivalent Numpy type support nulls
+//
 template <int TYPE>
 struct arrow_traits {};
 
@@ -252,30 +265,27 @@ struct arrow_traits<Type::BINARY> {
 static inline int NumPyTypeSize(int npy_type) {
   switch (npy_type) {
     case NPY_BOOL:
-      return 1;
     case NPY_INT8:
-      return 1;
-    case NPY_INT16:
-      return 2;
-    case NPY_INT32:
-      return 4;
-    case NPY_INT64:
-      return 8;
-#if (NPY_INT64 != NPY_LONGLONG)
-    case NPY_LONGLONG:
-      return 8;
-#endif
     case NPY_UINT8:
       return 1;
+    case NPY_INT16:
     case NPY_UINT16:
       return 2;
+    case NPY_INT32:
     case NPY_UINT32:
       return 4;
+    case NPY_INT64:
     case NPY_UINT64:
       return 8;
-#if (NPY_UINT64 != NPY_ULONGLONG)
+#if !NPY_INT32_IS_INT
+    case NPY_INT:
+    case NPY_UINT:
+      return NPY_BITSOF_INT / 8;
+#endif
+#if !NPY_INT64_IS_LONG_LONG
+    case NPY_LONGLONG:
     case NPY_ULONGLONG:
-      return 8;
+      return NPY_BITSOF_LONGLONG / 8;
 #endif
     case NPY_FLOAT16:
       return 2;
diff --git a/python/pyarrow/tests/test_convert_pandas.py 
b/python/pyarrow/tests/test_convert_pandas.py
index 5abc026bf..aef3dca12 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -501,6 +501,14 @@ def test_float_nulls(self):
         result = table.to_pandas()
         tm.assert_frame_equal(result, ex_frame)
 
+    def test_float_nulls_to_ints(self):
+        # ARROW-2135
+        df = pd.DataFrame({"a": [1.0, 2.0, pd.np.NaN]})
+        schema = pa.schema([pa.field("a", pa.int16(), nullable=True)])
+        table = pa.Table.from_pandas(df, schema=schema)
+        assert table[0].to_pylist() == [1, 2, None]
+        tm.assert_frame_equal(df, table.to_pandas())
+
     def test_integer_no_nulls(self):
         data = OrderedDict()
         fields = []
@@ -526,6 +534,17 @@ def test_integer_no_nulls(self):
         schema = pa.schema(fields)
         _check_pandas_roundtrip(df, expected_schema=schema)
 
+    def test_all_integer_types(self):
+        # Test all Numpy integer aliases
+        data = OrderedDict()
+        numpy_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8',
+                        'byte', 'ubyte', 'short', 'ushort', 'intc', 'uintc',
+                        'int_', 'uint', 'longlong', 'ulonglong']
+        for dtype in numpy_dtypes:
+            data[dtype] = np.arange(12, dtype=dtype)
+        df = pd.DataFrame(data)
+        _check_pandas_roundtrip(df)
+
     def test_integer_with_nulls(self):
         # pandas requires upcast to float dtype
 


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> [Python] NaN values silently casted to int64 when passing explicit schema for 
> conversion in Table.from_pandas
> -------------------------------------------------------------------------------------------------------------
>
>                 Key: ARROW-2135
>                 URL: https://issues.apache.org/jira/browse/ARROW-2135
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: Python
>    Affects Versions: 0.8.0
>            Reporter: Matthew Gilbert
>            Assignee: Antoine Pitrou
>            Priority: Major
>              Labels: pull-request-available
>             Fix For: 0.9.0
>
>
> If you create a {{Table}} from a {{DataFrame}} of ints with a NaN value the 
> NaN is improperly cast. Since pandas casts these to floats, when converted to 
> a table the NaN is interpreted as an integer. This seems like a bug since a 
> known limitation in pandas (the inability to have null valued integers data) 
> is taking precedence over arrow's functionality to store these as an IntArray 
> with nulls.
>  
> {code}
> import pyarrow as pa
> import pandas as pd
> df = pd.DataFrame({"a":[1, 2, pd.np.NaN]})
> schema = pa.schema([pa.field("a", pa.int64(), nullable=True)])
> table = pa.Table.from_pandas(df, schema=schema)
> table[0]
> <pyarrow.lib.Column object at 0x7f2151d19c90>
> chunk 0: <pyarrow.lib.Int64Array object at 0x7f213bf356d8>
> [
>   1,
>   2,
>   -9223372036854775808
> ]{code}
>  



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

[jira] [Commented] (ARROW-2135) [Python] NaN values silently casted to int64 when passing explicit schema for conversion in Table.from_pandas

Reply via email to