[ 
https://issues.apache.org/jira/browse/ARROW-2135?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16381023#comment-16381023
 ] 

ASF GitHub Bot commented on ARROW-2135:
---------------------------------------

pitrou commented on a change in pull request #1681: ARROW-2135: [Python] Fix 
NaN conversion when casting from Numpy array
URL: https://github.com/apache/arrow/pull/1681#discussion_r171379107
 
 

 ##########
 File path: cpp/src/arrow/python/numpy_to_arrow.cc
 ##########
 @@ -113,6 +132,66 @@ inline int64_t ValuesToBitmap(PyArrayObject* arr, 
uint8_t* bitmap) {
   return null_count;
 }
 
+class NumPyNullsConverter {
+ public:
+  /// Convert the given array's null values to a null bitmap.
+  /// The null bitmap is only allocated if null values are ever possible.
+  static Status Convert(MemoryPool* pool, PyArrayObject* arr,
+                        bool use_pandas_null_sentinels,
+                        std::shared_ptr<ResizableBuffer>* out_null_bitmap_,
+                        int64_t* out_null_count) {
+    NumPyNullsConverter converter(pool, arr, use_pandas_null_sentinels);
+    RETURN_NOT_OK(VisitNumpyArrayInline(arr, &converter));
+    *out_null_bitmap_ = converter.null_bitmap_;
+    *out_null_count = converter.null_count_;
+    return Status::OK();
+  }
+
+  template <int TYPE>
+  Status Visit(PyArrayObject* arr) {
+    typedef internal::npy_traits<TYPE> traits;
+
+    const bool null_sentinels_possible =
+        // Observing pandas's null sentinels
+        (use_pandas_null_sentinels_ && traits::supports_nulls);
+
+    if (null_sentinels_possible) {
+      RETURN_NOT_OK(InitNullBitmap(PyArray_SIZE(arr)));
+      null_count_ = ValuesToBitmap<TYPE>(arr, null_bitmap_data_);
+    }
+    return Status::OK();
+  }
+
+  // XXX it's the same as NumPyConverter::InitNullBitmap()
+  Status InitNullBitmap(int64_t length) {
+    int64_t null_bytes = BitUtil::BytesForBits(length);
+
+    null_bitmap_ = std::make_shared<PoolBuffer>(pool_);
+    RETURN_NOT_OK(null_bitmap_->Resize(null_bytes));
+
+    null_bitmap_data_ = null_bitmap_->mutable_data();
+    memset(null_bitmap_data_, 0, static_cast<size_t>(null_bytes));
+
+    return Status::OK();
+  }
+
+ protected:
+  NumPyNullsConverter(MemoryPool* pool, PyArrayObject* arr,
+                      bool use_pandas_null_sentinels)
+      : pool_(pool),
+        arr_(arr),
+        use_pandas_null_sentinels_(use_pandas_null_sentinels),
+        null_bitmap_data_(nullptr),
+        null_count_(0) {}
+
+  MemoryPool* pool_;
+  PyArrayObject* arr_;
+  bool use_pandas_null_sentinels_;
+  std::shared_ptr<ResizableBuffer> null_bitmap_;
+  uint8_t* null_bitmap_data_;
 
 Review comment:
   Which iterators are you thinking about? Do you mean the ndarray 1d iterator?

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> [Python] NaN values silently casted to int64 when passing explicit schema for 
> conversion in Table.from_pandas
> -------------------------------------------------------------------------------------------------------------
>
>                 Key: ARROW-2135
>                 URL: https://issues.apache.org/jira/browse/ARROW-2135
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: Python
>    Affects Versions: 0.8.0
>            Reporter: Matthew Gilbert
>            Assignee: Antoine Pitrou
>            Priority: Major
>              Labels: pull-request-available
>             Fix For: 0.9.0
>
>
> If you create a {{Table}} from a {{DataFrame}} of ints with a NaN value, the 
> NaN is improperly cast. Since pandas casts such columns to floats, when the 
> DataFrame is converted to a table the NaN is interpreted as an integer. This 
> seems like a bug, since a known limitation in pandas (the inability to have 
> null-valued integer data) is taking precedence over Arrow's ability to store 
> these as an IntArray with nulls.
>  
> {code}
> import pyarrow as pa
> import pandas as pd
> df = pd.DataFrame({"a":[1, 2, pd.np.NaN]})
> schema = pa.schema([pa.field("a", pa.int64(), nullable=True)])
> table = pa.Table.from_pandas(df, schema=schema)
> table[0]
> <pyarrow.lib.Column object at 0x7f2151d19c90>
> chunk 0: <pyarrow.lib.Int64Array object at 0x7f213bf356d8>
> [
>   1,
>   2,
>   -9223372036854775808
> ]{code}
>  



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to