This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new e751999 ARROW-2799: [Python] Add safe option to Table.from_pandas to avoid unsafe casts
e751999 is described below
commit e751999ba8f76a966d57c9226279ea05de26112b
Author: Krisztián Szűcs <[email protected]>
AuthorDate: Sat Sep 8 14:18:53 2018 -0400
ARROW-2799: [Python] Add safe option to Table.from_pandas to avoid unsafe casts
Depends on both #2497 and #2503
Author: Krisztián Szűcs <[email protected]>
Closes #2504 from kszucs/ARROW-2799 and squashes the following commits:
7bf9efd36 <Krisztián Szűcs> unsafe table creation during parquet dataset partitioning
38dfea3e6 <Krisztián Szűcs> disallow float truncation by default
515a3935d <Krisztián Szűcs> Table.from_pandas safe option
2c8207400 <Krisztián Szűcs> check-format
80e14784a <Krisztián Szűcs> lint
e0838ceb2 <Krisztián Szűcs> wire CastOptions through the API
296109425 <Krisztián Szűcs> set allow_float_truncate true by default
194e4764a <Krisztián Szűcs> allow truncate float option and its implementation
---
cpp/src/arrow/compute/kernels/cast.h | 2 +-
python/pyarrow/array.pxi | 14 +++--------
python/pyarrow/includes/libarrow.pxd | 5 ++++
python/pyarrow/pandas_compat.py | 5 ++--
python/pyarrow/parquet.py | 33 ++++++++++++------------
python/pyarrow/table.pxi | 39 ++++++++++++-----------------
python/pyarrow/tests/test_convert_pandas.py | 20 ++++++++++++++-
python/pyarrow/tests/test_parquet.py | 4 +--
8 files changed, 65 insertions(+), 57 deletions(-)
diff --git a/cpp/src/arrow/compute/kernels/cast.h b/cpp/src/arrow/compute/kernels/cast.h
index 8392c18..49b12b9 100644
--- a/cpp/src/arrow/compute/kernels/cast.h
+++ b/cpp/src/arrow/compute/kernels/cast.h
@@ -38,7 +38,7 @@ struct ARROW_EXPORT CastOptions {
CastOptions()
: allow_int_overflow(false),
allow_time_truncate(false),
- allow_float_truncate(true) {}
+ allow_float_truncate(false) {}
explicit CastOptions(bool safe)
: allow_int_overflow(!safe),
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index f9a16a3..362ebc6 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -53,10 +53,7 @@ cdef _ndarray_to_array(object values, object mask, DataType
type,
cdef:
shared_ptr[CChunkedArray] chunked_out
shared_ptr[CDataType] c_type
- CCastOptions cast_options
-
- cast_options.allow_int_overflow = not safe
- cast_options.allow_time_truncate = not safe
+ CCastOptions cast_options = CCastOptions(safe)
dtype = values.dtype
@@ -406,14 +403,9 @@ cdef class Array:
casted : Array
"""
cdef:
- CCastOptions options
+ CCastOptions options = CCastOptions(safe)
+ DataType type = _ensure_type(target_type)
shared_ptr[CArray] result
- DataType type
-
- type = _ensure_type(target_type)
-
- options.allow_int_overflow = not safe
- options.allow_time_truncate = not safe
with nogil:
check_status(Cast(_context(), self.ap[0], type.sp_type,
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 8bbbfcf..8a91bf5 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -903,8 +903,13 @@ cdef extern from "arrow/compute/api.h" namespace
"arrow::compute" nogil:
CFunctionContext(CMemoryPool* pool)
cdef cppclass CCastOptions" arrow::compute::CastOptions":
+ CCastOptions()
+ CCastOptions(c_bool safe)
+ CCastOptions Safe()
+ CCastOptions Unsafe()
c_bool allow_int_overflow
c_bool allow_time_truncate
+ c_bool allow_float_truncate
enum DatumType" arrow::compute::Datum::type":
DatumType_NONE" arrow::compute::Datum::NONE"
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 5ba1702..6a43fe2 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -316,7 +316,8 @@ def _index_level_name(index, i, column_names):
return '__index_level_{:d}__'.format(i)
-def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None):
+def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None,
+ safe=True):
if columns is None:
columns = df.columns
column_names = []
@@ -366,7 +367,7 @@ def dataframe_to_arrays(df, schema, preserve_index,
nthreads=1, columns=None):
def convert_column(col, ty):
try:
- return pa.array(col, from_pandas=True, type=ty)
+ return pa.array(col, type=ty, from_pandas=True, safe=safe)
except (pa.ArrowInvalid,
pa.ArrowNotImplementedError,
pa.ArrowTypeError) as e:
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index 6c2539c..9fa97b4 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -24,12 +24,14 @@ import re
import numpy as np
import pyarrow as pa
-import pyarrow._parquet as _parquet
import pyarrow.lib as lib
+import pyarrow._parquet as _parquet
+
from pyarrow._parquet import (ParquetReader, RowGroupStatistics, # noqa
FileMetaData, RowGroupMetaData,
ColumnChunkMetaData,
ParquetSchema, ColumnSchema)
+from pyarrow.compat import guid
from pyarrow.filesystem import (LocalFileSystem, _ensure_filesystem,
_get_fs_from_path)
from pyarrow.util import _is_path_like, _stringify_path, _deprecate_nthreads
@@ -53,6 +55,7 @@ class ParquetFile(object):
Will be used in reads for pandas schema metadata if not found in the
main file's metadata, no other uses at the moment
"""
+
def __init__(self, source, metadata=None, common_metadata=None):
self.reader = ParquetReader()
self.reader.open(source, metadata=metadata)
@@ -1124,11 +1127,6 @@ def write_to_dataset(table, root_path,
partition_cols=None,
Parameter for instantiating Table; preserve pandas index or not.
**kwargs : dict, kwargs for write_table function.
"""
- from pyarrow import (
- Table,
- compat
- )
-
if filesystem is None:
fs = _get_fs_from_path(root_path)
else:
@@ -1142,7 +1140,7 @@ def write_to_dataset(table, root_path,
partition_cols=None,
data_df = df.drop(partition_cols, axis='columns')
data_cols = df.columns.drop(partition_cols)
if len(data_cols) == 0:
- raise ValueError("No data left to save outside partition columns")
+ raise ValueError('No data left to save outside partition columns')
subschema = table.schema
# ARROW-2891: Ensure the output_schema is preserved when writing a
# partitioned dataset
@@ -1152,21 +1150,22 @@ def write_to_dataset(table, root_path,
partition_cols=None,
for keys, subgroup in data_df.groupby(partition_keys):
if not isinstance(keys, tuple):
keys = (keys,)
- subdir = "/".join(
- ["{colname}={value}".format(colname=name, value=val)
+ subdir = '/'.join(
+ ['{colname}={value}'.format(colname=name, value=val)
for name, val in zip(partition_cols, keys)])
- subtable = Table.from_pandas(subgroup,
- preserve_index=preserve_index,
- schema=subschema)
- prefix = "/".join([root_path, subdir])
+ subtable = pa.Table.from_pandas(subgroup,
+ preserve_index=preserve_index,
+ schema=subschema,
+ safe=False)
+ prefix = '/'.join([root_path, subdir])
_mkdir_if_not_exists(fs, prefix)
- outfile = compat.guid() + ".parquet"
- full_path = "/".join([prefix, outfile])
+ outfile = guid() + '.parquet'
+ full_path = '/'.join([prefix, outfile])
with fs.open(full_path, 'wb') as f:
write_table(subtable, f, **kwargs)
else:
- outfile = compat.guid() + ".parquet"
- full_path = "/".join([root_path, outfile])
+ outfile = guid() + '.parquet'
+ full_path = '/'.join([root_path, outfile])
with fs.open(full_path, 'wb') as f:
write_table(table, f, **kwargs)
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 513da28..4780eff 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -139,10 +139,8 @@ cdef class ChunkedArray:
return result
- def to_pandas(self,
- c_bool strings_to_categorical=False,
- c_bool zero_copy_only=False,
- c_bool integer_object_nulls=False):
+ def to_pandas(self, bint strings_to_categorical=False,
+ bint zero_copy_only=False, bint integer_object_nulls=False):
"""
Convert the arrow::ChunkedArray to an array object suitable for use
in pandas
@@ -411,7 +409,7 @@ cdef class Column:
def from_array(*args):
return column(*args)
- def cast(self, object target_type, safe=True):
+ def cast(self, object target_type, bint safe=True):
"""
Cast column values to another data type
@@ -427,16 +425,11 @@ cdef class Column:
casted : Column
"""
cdef:
- CCastOptions options
+ CCastOptions options = CCastOptions(safe)
+ DataType type = _ensure_type(target_type)
shared_ptr[CArray] result
- DataType type
CDatum out
- type = _ensure_type(target_type)
-
- options.allow_int_overflow = not safe
- options.allow_time_truncate = not safe
-
with nogil:
check_status(Cast(_context(), CDatum(self.column.data()),
type.sp_type, options, &out))
@@ -489,10 +482,8 @@ cdef class Column:
return [pyarrow_wrap_column(col) for col in flattened]
- def to_pandas(self,
- c_bool strings_to_categorical=False,
- c_bool zero_copy_only=False,
- c_bool integer_object_nulls=False):
+ def to_pandas(self, bint strings_to_categorical=False,
+ bint zero_copy_only=False, bint integer_object_nulls=False):
"""
Convert the arrow::Column to a pandas.Series
@@ -863,7 +854,7 @@ cdef class RecordBatch:
entries.append((name, column))
return OrderedDict(entries)
- def to_pandas(self, use_threads=True):
+ def to_pandas(self, bint use_threads=True):
"""
Convert the arrow::RecordBatch to a pandas DataFrame
@@ -1089,7 +1080,7 @@ cdef class Table:
@classmethod
def from_pandas(cls, df, Schema schema=None, bint preserve_index=True,
- nthreads=None, columns=None):
+ nthreads=None, columns=None, bint safe=True):
"""
Convert pandas.DataFrame to an Arrow Table.
@@ -1120,7 +1111,8 @@ cdef class Table:
indicated number of threads
columns : list, optional
List of column to be converted. If None, use all columns.
-
+ safe : boolean, default True
+ Check for overflows or other unsafe conversions
Returns
-------
@@ -1143,7 +1135,8 @@ cdef class Table:
schema=schema,
preserve_index=preserve_index,
nthreads=nthreads,
- columns=columns
+ columns=columns,
+ safe=safe
)
return cls.from_arrays(arrays, names=names, metadata=metadata)
@@ -1291,9 +1284,9 @@ cdef class Table:
return result
- def to_pandas(self, strings_to_categorical=False,
- memory_pool=None, zero_copy_only=False, categories=None,
- integer_object_nulls=False, use_threads=True):
+ def to_pandas(self, bint strings_to_categorical=False,
+ memory_pool=None, bint zero_copy_only=False, categories=None,
+ bint integer_object_nulls=False, bint use_threads=True):
"""
Convert the arrow::Table to a pandas DataFrame
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 4f65547..3fa7cf4 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -530,7 +530,7 @@ class TestConvertPrimitiveTypes(object):
# ARROW-2135
df = pd.DataFrame({"a": [1.0, 2.0, pd.np.NaN]})
schema = pa.schema([pa.field("a", pa.int16(), nullable=True)])
- table = pa.Table.from_pandas(df, schema=schema)
+ table = pa.Table.from_pandas(df, schema=schema, safe=False)
assert table[0].to_pylist() == [1, 2, None]
tm.assert_frame_equal(df, table.to_pandas())
@@ -2056,6 +2056,24 @@ class TestConvertMisc(object):
expected_df = pd.DataFrame(data=[row], columns=['foo', '123'])
_check_pandas_roundtrip(df, expected=expected_df, preserve_index=True)
+ def test_safe_unsafe_casts(self):
+ # ARROW-2799
+ df = pd.DataFrame({
+ 'A': list('abc'),
+ 'B': np.linspace(0, 1, 3)
+ })
+
+ schema = pa.schema([
+ pa.field('A', pa.string()),
+ pa.field('B', pa.int32())
+ ])
+
+ with pytest.raises(ValueError):
+ pa.Table.from_pandas(df, schema=schema)
+
+ table = pa.Table.from_pandas(df, schema=schema, safe=False)
+ assert table.column('B').type == pa.int32()
+
def _fully_loaded_dataframe_example():
from distutils.version import LooseVersion
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 556b155..64fd82d 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -643,7 +643,7 @@ def test_parquet_column_statistics_api(data, type,
physical_type, min_value,
distinct_count):
df = pd.DataFrame({'data': data})
schema = pa.schema([pa.field('data', type)])
- table = pa.Table.from_pandas(df, schema=schema)
+ table = pa.Table.from_pandas(df, schema=schema, safe=False)
fileh = make_sample_file(table)
meta = fileh.metadata
@@ -1812,7 +1812,7 @@ def _test_write_to_dataset_with_partitions(base_path,
dtype='datetime64[D]')})
cols = output_df.columns.tolist()
partition_by = ['group1', 'group2']
- output_table = pa.Table.from_pandas(output_df, schema=schema)
+ output_table = pa.Table.from_pandas(output_df, schema=schema, safe=False)
pq.write_to_dataset(output_table, base_path, partition_by,
filesystem=filesystem)