[ 
https://issues.apache.org/jira/browse/ARROW-1998?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16361570#comment-16361570
 ] 

ASF GitHub Bot commented on ARROW-1998:
---------------------------------------

wesm closed pull request #1594: ARROW-1998: [Python] fix crash on empty Numpy 
arrays
URL: https://github.com/apache/arrow/pull/1594
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc 
b/cpp/src/arrow/python/numpy_to_arrow.cc
index d487d9d9d..3dd5a79cb 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -850,16 +850,23 @@ Status NumPyConverter::ConvertObjectStrings() {
   RETURN_NOT_OK(builder.Resize(length_));
 
   bool global_have_bytes = false;
-  int64_t offset = 0;
-  while (offset < length_) {
-    bool chunk_have_bytes = false;
-    RETURN_NOT_OK(
-        AppendObjectStrings(arr_, mask_, offset, &builder, &offset, 
&chunk_have_bytes));
-
-    global_have_bytes = global_have_bytes | chunk_have_bytes;
+  if (length_ == 0) {
+    // Produce an empty chunk
     std::shared_ptr<Array> chunk;
     RETURN_NOT_OK(builder.Finish(&chunk));
     out_arrays_.emplace_back(std::move(chunk));
+  } else {
+    int64_t offset = 0;
+    while (offset < length_) {
+      bool chunk_have_bytes = false;
+      RETURN_NOT_OK(
+          AppendObjectStrings(arr_, mask_, offset, &builder, &offset, 
&chunk_have_bytes));
+
+      global_have_bytes = global_have_bytes | chunk_have_bytes;
+      std::shared_ptr<Array> chunk;
+      RETURN_NOT_OK(builder.Finish(&chunk));
+      out_arrays_.emplace_back(std::move(chunk));
+    }
   }
 
   // If we saw PyBytes, convert everything to BinaryArray
@@ -954,14 +961,21 @@ Status NumPyConverter::ConvertObjectFixedWidthBytes(
   FixedSizeBinaryBuilder builder(type, pool_);
   RETURN_NOT_OK(builder.Resize(length_));
 
-  int64_t offset = 0;
-  while (offset < length_) {
-    RETURN_NOT_OK(
-        AppendObjectFixedWidthBytes(arr_, mask_, byte_width, offset, &builder, 
&offset));
-
+  if (length_ == 0) {
+    // Produce an empty chunk
     std::shared_ptr<Array> chunk;
     RETURN_NOT_OK(builder.Finish(&chunk));
     out_arrays_.emplace_back(std::move(chunk));
+  } else {
+    int64_t offset = 0;
+    while (offset < length_) {
+      RETURN_NOT_OK(AppendObjectFixedWidthBytes(arr_, mask_, byte_width, 
offset, &builder,
+                                                &offset));
+
+      std::shared_ptr<Array> chunk;
+      RETURN_NOT_OK(builder.Finish(&chunk));
+      out_arrays_.emplace_back(std::move(chunk));
+    }
   }
   return Status::OK();
 }
@@ -1567,7 +1581,6 @@ Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, 
PyObject* mo,
   if (!PyArray_Check(ao)) {
     return Status::Invalid("Input object was not a NumPy array");
   }
-
   NumPyConverter converter(pool, ao, mo, type, use_pandas_null_sentinels);
   RETURN_NOT_OK(converter.Convert());
   const auto& output_arrays = converter.result();
diff --git a/python/pyarrow/tests/test_convert_pandas.py 
b/python/pyarrow/tests/test_convert_pandas.py
index 026cd2507..2e2f66540 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -119,6 +119,26 @@ def _check_array_from_pandas_roundtrip(np_array):
 
 class TestPandasConversion(object):
 
+    type_pairs = [
+        (np.int8, pa.int8()),
+        (np.int16, pa.int16()),
+        (np.int32, pa.int32()),
+        (np.int64, pa.int64()),
+        (np.uint8, pa.uint8()),
+        (np.uint16, pa.uint16()),
+        (np.uint32, pa.uint32()),
+        (np.uint64, pa.uint64()),
+        # (np.float16, pa.float16()),  # XXX unsupported
+        (np.float32, pa.float32()),
+        (np.float64, pa.float64()),
+        # XXX unsupported
+        # (np.dtype([('a', 'i2')]), pa.struct([pa.field('a', pa.int16())])),
+        (np.object, pa.string()),
+        # (np.object, pa.binary()),  # XXX unsupported
+        (np.object, pa.binary(10)),
+        (np.object, pa.list_(pa.int64())),
+        ]
+
     def test_all_none_objects(self):
         df = pd.DataFrame({'a': [None, None, None]})
         _check_pandas_roundtrip(df)
@@ -128,6 +148,11 @@ def test_all_none_category(self):
         df['a'] = df['a'].astype('category')
         _check_pandas_roundtrip(df)
 
+    def test_empty_arrays(self):
+        for dtype, pa_type in self.type_pairs:
+            arr = np.array([], dtype=dtype)
+            _check_array_roundtrip(arr, type=pa_type)
+
     def test_non_string_columns(self):
         df = pd.DataFrame({0: [1, 2, 3]})
         table = pa.Table.from_pandas(df)


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> [Python] Table.from_pandas crashes when data frame is empty
> -----------------------------------------------------------
>
>                 Key: ARROW-1998
>                 URL: https://issues.apache.org/jira/browse/ARROW-1998
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: Python
>    Affects Versions: 0.8.0
>         Environment: Windows 10 Build 15063.850
> Python: 3.6.3
> Numpy: 1.14.0
> Pandas: 0.22.0
>            Reporter: Victor Jimenez
>            Assignee: Antoine Pitrou
>            Priority: Major
>              Labels: pull-request-available
>             Fix For: 0.9.0
>
>
> Loading an empty CSV file and then attempting to create a PyArrow Table 
> from it makes the application crash. The following code should 
> reproduce the issue:
> {code}
> import numpy as np
> import pandas as pd
> import pyarrow as pa
> FIELDS = ['id', 'name']
> NUMPY_TYPES = {
>     'id': np.int64,
>     'name': np.unicode
> }
> PYARROW_SCHEMA = pa.schema([
>     pa.field('id', pa.int64()),
>     pa.field('name', pa.string())
> ])
> file = open('input.csv', 'w')
> file.close()
> df = pd.read_csv(
>     'input.csv',
>     header=None,
>     names=FIELDS,
>     dtype=NUMPY_TYPES,
>     engine='c',
> )
> pa.Table.from_pandas(df, schema=PYARROW_SCHEMA)
> {code}



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to