This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new e751999 ARROW-2799: [Python] Add safe option to Table.from_pandas to avoid unsafe casts
e751999 is described below
commit e751999ba8f76a966d57c9226279ea05de26112b
Author: Krisztián Szűcs <[email protected]>
AuthorDate: Sat Sep 8 14:18:53 2018 -0400
ARROW-2799: [Python] Add safe option to Table.from_pandas to avoid unsafe casts
Depends on both #2497 and #2503
Author: Krisztián Szűcs <[email protected]>
Closes #2504 from kszucs/ARROW-2799 and squashes the following commits:
7bf9efd36 <Krisztián Szűcs> unsafe table creation during parquet dataset partitioning
38dfea3e6 <Krisztián Szűcs> disallow float truncation by default
515a3935d <Krisztián Szűcs> Table.from_pandas safe option
2c8207400 <Krisztián Szűcs> check-format
80e14784a <Krisztián Szűcs> lint
e0838ceb2 <Krisztián Szűcs> wire CastOptions through the API
296109425 <Krisztián Szűcs> set allow_float_truncate true by default
194e4764a <Krisztián Szűcs> allow truncate float option and its implementation
---
cpp/src/arrow/compute/kernels/cast.h | 2 +-
python/pyarrow/array.pxi | 14 +++--------
python/pyarrow/includes/libarrow.pxd | 5 ++++
python/pyarrow/pandas_compat.py | 5 ++--
python/pyarrow/parquet.py | 33 ++++++++++++------------
python/pyarrow/table.pxi | 39 ++++++++++++-----------------
python/pyarrow/tests/test_convert_pandas.py | 20 ++++++++++++++-
python/pyarrow/tests/test_parquet.py | 4 +--
8 files changed, 65 insertions(+), 57 deletions(-)
diff --git a/cpp/src/arrow/compute/kernels/cast.h b/cpp/src/arrow/compute/kernels/cast.h
index 8392c18..49b12b9 100644
--- a/cpp/src/arrow/compute/kernels/cast.h
+++ b/cpp/src/arrow/compute/kernels/cast.h
@@ -38,7 +38,7 @@ struct ARROW_EXPORT CastOptions {
CastOptions()
: allow_int_overflow(false),
allow_time_truncate(false),
- allow_float_truncate(true) {}
+ allow_float_truncate(false) {}
explicit CastOptions(bool safe)
: allow_int_overflow(!safe),
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index f9a16a3..362ebc6 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -53,10 +53,7 @@ cdef _ndarray_to_array(object values, object mask, DataType
type,
cdef:
shared_ptr[CChunkedArray] chunked_out
shared_ptr[CDataType] c_type
- CCastOptions cast_options
-
- cast_options.allow_int_overflow = not safe
- cast_options.allow_time_truncate = not safe
+ CCastOptions cast_options = CCastOptions(safe)
dtype = values.dtype
@@ -406,14 +403,9 @@ cdef class Array:
casted : Array
"""
cdef:
- CCastOptions options
+ CCastOptions options = CCastOptions(safe)
+ DataType type = _ensure_type(target_type)
shared_ptr[CArray] result
- DataType type
-
- type = _ensure_type(target_type)
-
- options.allow_int_overflow = not safe
- options.allow_time_truncate = not safe
with nogil:
check_status(Cast(_context(), self.ap[0], type.sp_type,
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 8bbbfcf..8a91bf5 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -903,8 +903,13 @@ cdef extern from "arrow/compute/api.h" namespace
"arrow::compute" nogil:
CFunctionContext(CMemoryPool* pool)
cdef cppclass CCastOptions" arrow::compute::CastOptions":
+ CCastOptions()
+ CCastOptions(c_bool safe)
+ CCastOptions Safe()
+ CCastOptions Unsafe()
c_bool allow_int_overflow
c_bool allow_time_truncate
+ c_bool allow_float_truncate
enum DatumType" arrow::compute::Datum::type":
DatumType_NONE" arrow::compute::Datum::NONE"
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 5ba1702..6a43fe2 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -316,7 +316,8 @@ def _index_level_name(index, i, column_names):
return '__index_level_{:d}__'.format(i)
-def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None):
+def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None,
+ safe=True):
if columns is None:
columns = df.columns
column_names = []
@@ -366,7 +367,7 @@ def dataframe_to_arrays(df, schema, preserve_index,
nthreads=1, columns=None):
def convert_column(col, ty):
try:
- return pa.array(col, from_pandas=True, type=ty)
+ return pa.array(col, type=ty, from_pandas=True, safe=safe)
except (pa.ArrowInvalid,
pa.ArrowNotImplementedError,
pa.ArrowTypeError) as e:
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index 6c2539c..9fa97b4 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -24,12 +24,14 @@ import re
import numpy as np
import pyarrow as pa
-import pyarrow._parquet as _parquet
import pyarrow.lib as lib
+import pyarrow._parquet as _parquet
+
from pyarrow._parquet import (ParquetReader, RowGroupStatistics, # noqa
FileMetaData, RowGroupMetaData,
ColumnChunkMetaData,
ParquetSchema, ColumnSchema)
+from pyarrow.compat import guid
from pyarrow.filesystem import (LocalFileSystem, _ensure_filesystem,
_get_fs_from_path)
from pyarrow.util import _is_path_like, _stringify_path, _deprecate_nthreads
@@ -53,6 +55,7 @@ class ParquetFile(object):
Will be used in reads for pandas schema metadata if not found in the
main file's metadata, no other uses at the moment
"""
+
def __init__(self, source, metadata=None, common_metadata=None):
self.reader = ParquetReader()
self.reader.open(source, metadata=metadata)
@@ -1124,11 +1127,6 @@ def write_to_dataset(table, root_path,
partition_cols=None,
Parameter for instantiating Table; preserve pandas index or not.
**kwargs : dict, kwargs for write_table function.
"""
- from pyarrow import (
- Table,
- compat
- )
-
if filesystem is None:
fs = _get_fs_from_path(root_path)
else:
@@ -1142,7 +1140,7 @@ def write_to_dataset(table, root_path,
partition_cols=None,
data_df = df.drop(partition_cols, axis='columns')
data_cols = df.columns.drop(partition_cols)
if len(data_cols) == 0:
- raise ValueError("No data left to save outside partition columns")
+ raise ValueError('No data left to save outside partition columns')
subschema = table.schema
# ARROW-2891: Ensure the output_schema is preserved when writing a
# partitioned dataset
@@ -1152,21 +1150,22 @@ def write_to_dataset(table, root_path,
partition_cols=None,
for keys, subgroup in data_df.groupby(partition_keys):
if not isinstance(keys, tuple):
keys = (keys,)
- subdir = "/".join(
- ["{colname}={value}".format(colname=name, value=val)
+ subdir = '/'.join(
+ ['{colname}={value}'.format(colname=name, value=val)
for name, val in zip(partition_cols, keys)])
- subtable = Table.from_pandas(subgroup,
- preserve_index=preserve_index,
- schema=subschema)
- prefix = "/".join([root_path, subdir])
+ subtable = pa.Table.from_pandas(subgroup,
+ preserve_index=preserve_index,
+ schema=subschema,
+ safe=False)
+ prefix = '/'.join([root_path, subdir])
_mkdir_if_not_exists(fs, prefix)
- outfile = compat.guid() + ".parquet"
- full_path = "/".join([prefix, outfile])
+ outfile = guid() + '.parquet'
+ full_path = '/'.join([prefix, outfile])
with fs.open(full_path, 'wb') as f:
write_table(subtable, f, **kwargs)
else:
- outfile = compat.guid() + ".parquet"
- full_path = "/".join([root_path, outfile])
+ outfile = guid() + '.parquet'
+ full_path = '/'.join([root_path, outfile])
with fs.open(full_path, 'wb') as f:
write_table(table, f, **kwargs)
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 513da28..4780eff 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -139,10 +139,8 @@ cdef class ChunkedArray:
return result
- def to_pandas(self,
- c_bool strings_to_categorical=False,
- c_bool zero_copy_only=False,
- c_bool integer_object_nulls=False):
+ def to_pandas(self, bint strings_to_categorical=False,
+ bint zero_copy_only=False, bint integer_object_nulls=False):
"""
Convert the arrow::ChunkedArray to an array object suitable for use
in pandas
@@ -411,7 +409,7 @@ cdef class Column:
def from_array(*args):
return column(*args)
- def cast(self, object target_type, safe=True):
+ def cast(self, object target_type, bint safe=True):
"""
Cast column values to another data type
@@ -427,16 +425,11 @@ cdef class Column:
casted : Column
"""
cdef:
- CCastOptions options
+ CCastOptions options = CCastOptions(safe)
+ DataType type = _ensure_type(target_type)
shared_ptr[CArray] result
- DataType type
CDatum out
- type = _ensure_type(target_type)
-
- options.allow_int_overflow = not safe
- options.allow_time_truncate = not safe
-
with nogil:
check_status(Cast(_context(), CDatum(self.column.data()),
type.sp_type, options, &out))
@@ -489,10 +482,8 @@ cdef class Column:
return [pyarrow_wrap_column(col) for col in flattened]
- def to_pandas(self,
- c_bool strings_to_categorical=False,
- c_bool zero_copy_only=False,
- c_bool integer_object_nulls=False):
+ def to_pandas(self, bint strings_to_categorical=False,
+ bint zero_copy_only=False, bint integer_object_nulls=False):
"""
Convert the arrow::Column to a pandas.Series
@@ -863,7 +854,7 @@ cdef class RecordBatch:
entries.append((name, column))
return OrderedDict(entries)
- def to_pandas(self, use_threads=True):
+ def to_pandas(self, bint use_threads=True):
"""
Convert the arrow::RecordBatch to a pandas DataFrame
@@ -1089,7 +1080,7 @@ cdef class Table:
@classmethod
def from_pandas(cls, df, Schema schema=None, bint preserve_index=True,
- nthreads=None, columns=None):
+ nthreads=None, columns=None, bint safe=True):
"""
Convert pandas.DataFrame to an Arrow Table.
@@ -1120,7 +1111,8 @@ cdef class Table:
indicated number of threads
columns : list, optional
List of column to be converted. If None, use all columns.
-
+ safe : boolean, default True
+ Check for overflows or other unsafe conversions
Returns
-------
@@ -1143,7 +1135,8 @@ cdef class Table:
schema=schema,
preserve_index=preserve_index,
nthreads=nthreads,
- columns=columns
+ columns=columns,
+ safe=safe
)
return cls.from_arrays(arrays, names=names, metadata=metadata)
@@ -1291,9 +1284,9 @@ cdef class Table:
return result
- def to_pandas(self, strings_to_categorical=False,
- memory_pool=None, zero_copy_only=False, categories=None,
- integer_object_nulls=False, use_threads=True):
+ def to_pandas(self, bint strings_to_categorical=False,
+ memory_pool=None, bint zero_copy_only=False, categories=None,
+ bint integer_object_nulls=False, bint use_threads=True):
"""
Convert the arrow::Table to a pandas DataFrame
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 4f65547..3fa7cf4 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -530,7 +530,7 @@ class TestConvertPrimitiveTypes(object):
# ARROW-2135
df = pd.DataFrame({"a": [1.0, 2.0, pd.np.NaN]})
schema = pa.schema([pa.field("a", pa.int16(), nullable=True)])
- table = pa.Table.from_pandas(df, schema=schema)
+ table = pa.Table.from_pandas(df, schema=schema, safe=False)
assert table[0].to_pylist() == [1, 2, None]
tm.assert_frame_equal(df, table.to_pandas())
@@ -2056,6 +2056,24 @@ class TestConvertMisc(object):
expected_df = pd.DataFrame(data=[row], columns=['foo', '123'])
_check_pandas_roundtrip(df, expected=expected_df, preserve_index=True)
+ def test_safe_unsafe_casts(self):
+ # ARROW-2799
+ df = pd.DataFrame({
+ 'A': list('abc'),
+ 'B': np.linspace(0, 1, 3)
+ })
+
+ schema = pa.schema([
+ pa.field('A', pa.string()),
+ pa.field('B', pa.int32())
+ ])
+
+ with pytest.raises(ValueError):
+ pa.Table.from_pandas(df, schema=schema)
+
+ table = pa.Table.from_pandas(df, schema=schema, safe=False)
+ assert table.column('B').type == pa.int32()
+
def _fully_loaded_dataframe_example():
from distutils.version import LooseVersion
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 556b155..64fd82d 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -643,7 +643,7 @@ def test_parquet_column_statistics_api(data, type,
physical_type, min_value,
distinct_count):
df = pd.DataFrame({'data': data})
schema = pa.schema([pa.field('data', type)])
- table = pa.Table.from_pandas(df, schema=schema)
+ table = pa.Table.from_pandas(df, schema=schema, safe=False)
fileh = make_sample_file(table)
meta = fileh.metadata
@@ -1812,7 +1812,7 @@ def _test_write_to_dataset_with_partitions(base_path,
dtype='datetime64[D]')})
cols = output_df.columns.tolist()
partition_by = ['group1', 'group2']
- output_table = pa.Table.from_pandas(output_df, schema=schema)
+ output_table = pa.Table.from_pandas(output_df, schema=schema, safe=False)
pq.write_to_dataset(output_table, base_path, partition_by,
filesystem=filesystem)