[ 
https://issues.apache.org/jira/browse/ARROW-1732?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16220363#comment-16220363
 ] 

ASF GitHub Bot commented on ARROW-1732:
---------------------------------------

xhochy closed pull request #1252: ARROW-1732: [Python] Permit creating record 
batches with no columns, test pandas roundtrips
URL: https://github.com/apache/arrow/pull/1252
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 694fe9190..eb1911592 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -308,8 +308,8 @@ cdef shared_ptr[const CKeyValueMetadata] 
unbox_metadata(dict metadata):
             make_shared[CKeyValueMetadata](unordered_metadata))
 
 
-cdef int _schema_from_arrays(
-        arrays, names, dict metadata, shared_ptr[CSchema]* schema) except -1:
+cdef _schema_from_arrays(arrays, names, dict metadata,
+                         shared_ptr[CSchema]* schema):
     cdef:
         Column col
         c_string c_name
@@ -317,10 +317,11 @@ cdef int _schema_from_arrays(
         shared_ptr[CDataType] type_
         Py_ssize_t K = len(arrays)
 
-    fields.resize(K)
+    if K == 0:
+        schema.reset(new CSchema(fields, unbox_metadata(metadata)))
+        return
 
-    if not K:
-        raise ValueError('Must pass at least one array')
+    fields.resize(K)
 
     if isinstance(arrays[0], Column):
         for i in range(K):
@@ -346,7 +347,6 @@ cdef int _schema_from_arrays(
             fields[i].reset(new CField(c_name, type_, True))
 
     schema.reset(new CSchema(fields, unbox_metadata(metadata)))
-    return 0
 
 
 cdef class RecordBatch:
@@ -613,10 +613,10 @@ cdef class RecordBatch:
             int64_t i
             int64_t number_of_arrays = len(arrays)
 
-        if not number_of_arrays:
-            raise ValueError('Record batch cannot contain no arrays (for now)')
-
-        num_rows = len(arrays[0])
+        if len(arrays) > 0:
+            num_rows = len(arrays[0])
+        else:
+            num_rows = 0
         _schema_from_arrays(arrays, names, metadata, &schema)
 
         c_arrays.reserve(len(arrays))
diff --git a/python/pyarrow/tests/test_convert_pandas.py 
b/python/pyarrow/tests/test_convert_pandas.py
index 527466e6e..6d146f977 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -71,11 +71,11 @@ def tearDown(self):
     def _check_pandas_roundtrip(self, df, expected=None, nthreads=1,
                                 expected_schema=None,
                                 check_dtype=True, schema=None,
-                                check_index=False,
+                                preserve_index=False,
                                 as_batch=False):
         klass = pa.RecordBatch if as_batch else pa.Table
         table = klass.from_pandas(df, schema=schema,
-                                  preserve_index=check_index,
+                                  preserve_index=preserve_index,
                                   nthreads=nthreads)
 
         result = table.to_pandas(nthreads=nthreads)
@@ -83,7 +83,9 @@ def _check_pandas_roundtrip(self, df, expected=None, 
nthreads=1,
             assert table.schema.equals(expected_schema)
         if expected is None:
             expected = df
-        tm.assert_frame_equal(result, expected, check_dtype=check_dtype)
+        tm.assert_frame_equal(result, expected, check_dtype=check_dtype,
+                              check_index_type=('equiv' if preserve_index
+                                                else False))
 
     def _check_series_roundtrip(self, s, type_=None):
         arr = pa.array(s, from_pandas=True, type=type_)
@@ -131,14 +133,14 @@ def test_non_string_columns(self):
     def test_column_index_names_are_preserved(self):
         df = pd.DataFrame({'data': [1, 2, 3]})
         df.columns.names = ['a']
-        self._check_pandas_roundtrip(df, check_index=True)
+        self._check_pandas_roundtrip(df, preserve_index=True)
 
     def test_multiindex_columns(self):
         columns = pd.MultiIndex.from_arrays([
             ['one', 'two'], ['X', 'Y']
         ])
         df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns)
-        self._check_pandas_roundtrip(df, check_index=True)
+        self._check_pandas_roundtrip(df, preserve_index=True)
 
     def test_multiindex_columns_with_dtypes(self):
         columns = pd.MultiIndex.from_arrays(
@@ -149,11 +151,11 @@ def test_multiindex_columns_with_dtypes(self):
             names=['level_1', 'level_2'],
         )
         df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns)
-        self._check_pandas_roundtrip(df, check_index=True)
+        self._check_pandas_roundtrip(df, preserve_index=True)
 
     def test_integer_index_column(self):
         df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')])
-        self._check_pandas_roundtrip(df, check_index=True)
+        self._check_pandas_roundtrip(df, preserve_index=True)
 
     def test_categorical_column_index(self):
         # I *really* hope no one uses category dtypes for single level column
@@ -1095,6 +1097,15 @@ def test_table_str_to_categorical(self):
         expected = pd.DataFrame({'strings': pd.Categorical(values)})
         tm.assert_frame_equal(result, expected, check_dtype=True)
 
+    def test_table_batch_empty_dataframe(self):
+        df = pd.DataFrame({})
+        self._check_pandas_roundtrip(df)
+        self._check_pandas_roundtrip(df, as_batch=True)
+
+        df2 = pd.DataFrame({}, index=[0, 1, 2])
+        self._check_pandas_roundtrip(df2, preserve_index=True)
+        self._check_pandas_roundtrip(df2, as_batch=True, preserve_index=True)
+
     def test_array_from_pandas_date_with_mask(self):
         m = np.array([True, False, True])
         data = pd.Series([
diff --git a/python/pyarrow/tests/test_table.py 
b/python/pyarrow/tests/test_table.py
index 50190f597..428222466 100644
--- a/python/pyarrow/tests/test_table.py
+++ b/python/pyarrow/tests/test_table.py
@@ -82,6 +82,14 @@ def test_recordbatch_basics():
         batch[2]
 
 
+def test_recordbatch_no_fields():
+    batch = pa.RecordBatch.from_arrays([], [])
+
+    assert len(batch) == 0
+    assert batch.num_rows == 0
+    assert batch.num_columns == 0
+
+
 def test_recordbatch_from_arrays_invalid_names():
     data = [
         pa.array(range(5)),


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> [Python] RecordBatch.from_pandas fails on DataFrame with no columns when 
> preserve_index=False
> ---------------------------------------------------------------------------------------------
>
>                 Key: ARROW-1732
>                 URL: https://issues.apache.org/jira/browse/ARROW-1732
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: Python
>            Reporter: Wes McKinney
>            Assignee: Wes McKinney
>              Labels: pull-request-available
>             Fix For: 0.8.0
>
>
> I believe this should have well-defined behavior and not raise an error:
> {code}
> In [5]: pa.RecordBatch.from_pandas(pd.DataFrame({}), preserve_index=False)
> ---------------------------------------------------------------------------
> ValueError                                Traceback (most recent call last)
> <ipython-input-5-4dda72b47dbd> in <module>()
> ----> 1 pa.RecordBatch.from_pandas(pd.DataFrame({}), preserve_index=False)
> ~/code/arrow/python/pyarrow/table.pxi in pyarrow.lib.RecordBatch.from_pandas 
> (/home/wesm/code/arrow/python/build/temp.linux-x86_64-3.5/lib.cxx:39957)()
>     586             df, schema, preserve_index, nthreads=nthreads
>     587         )
> --> 588         return cls.from_arrays(arrays, names, metadata)
>     589 
>     590     @staticmethod
> ~/code/arrow/python/pyarrow/table.pxi in pyarrow.lib.RecordBatch.from_arrays 
> (/home/wesm/code/arrow/python/build/temp.linux-x86_64-3.5/lib.cxx:40130)()
>     615 
>     616         if not number_of_arrays:
> --> 617             raise ValueError('Record batch cannot contain no arrays 
> (for now)')
>     618 
>     619         num_rows = len(arrays[0])
> ValueError: Record batch cannot contain no arrays (for now)
> {code}



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

Reply via email to