This is an automated email from the ASF dual-hosted git repository.
jorisvandenbossche pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 4f56aba35b GH-33321: [Python] Support converting to non-nano
datetime64 for pandas >= 2.0 (#35656)
4f56aba35b is described below
commit 4f56aba35bced815453fdb18121000f4b875ef24
Author: Dane Pitkin <[email protected]>
AuthorDate: Fri Jul 7 10:45:58 2023 -0400
GH-33321: [Python] Support converting to non-nano datetime64 for pandas >=
2.0 (#35656)
Do not coerce temporal types to nanoseconds when pandas >= 2.0 is imported,
since pandas now supports s/ms/us time units.
This PR adds support for the following Arrow -> Pandas conversions, which
previously all defaulted to `datetime64[ns]` or `datetime64[ns, <TZ>]`:
```
date32 -> datetime64[ms]
date64 -> datetime64[ms]
timestamp[s] -> datetime64[s]
timestamp[ms] -> datetime64[ms]
timestamp[us] -> datetime64[us]
timestamp[s, <TZ>] -> datetime64[s, <TZ>]
timestamp[ms, <TZ>] -> datetime64[ms, <TZ>]
timestamp[us, <TZ>] -> datetime64[us, <TZ>]
```
### Rationale for this change
Pandas 2.0 introduces proper support for non-nanosecond temporal types
(s/ms/us resolutions for datetime64 and timedelta64).
### Are these changes tested?
Yes. Pytests added and updated.
### Are there any user-facing changes?
Yes, arrow-to-pandas default conversion behavior will change when users
have pandas >= 2.0, but a legacy option is exposed to provide backwards
compatibility.
* Closes: #33321
Lead-authored-by: Dane Pitkin <[email protected]>
Co-authored-by: Dane Pitkin <[email protected]>
Co-authored-by: Joris Van den Bossche <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
---
python/pyarrow/array.pxi | 31 +++-
python/pyarrow/includes/libarrow_python.pxd | 1 +
python/pyarrow/pandas-shim.pxi | 8 +
python/pyarrow/pandas_compat.py | 9 +-
python/pyarrow/src/arrow/python/arrow_to_pandas.cc | 143 +++++++++++----
python/pyarrow/src/arrow/python/arrow_to_pandas.h | 4 +
python/pyarrow/table.pxi | 7 +-
python/pyarrow/tests/parquet/common.py | 15 +-
python/pyarrow/tests/parquet/test_data_types.py | 2 +-
python/pyarrow/tests/parquet/test_dataset.py | 36 ++--
python/pyarrow/tests/parquet/test_datetime.py | 10 +-
python/pyarrow/tests/parquet/test_pandas.py | 13 +-
python/pyarrow/tests/parquet/test_parquet_file.py | 4 +-
python/pyarrow/tests/test_array.py | 10 +-
python/pyarrow/tests/test_pandas.py | 201 ++++++++++++++-------
python/pyarrow/tests/test_schema.py | 16 +-
python/pyarrow/types.pxi | 81 ++++++---
17 files changed, 416 insertions(+), 175 deletions(-)
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index b704da7360..2f8959cd72 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -700,7 +700,8 @@ cdef class _PandasConvertible(_Weakrefable):
bint split_blocks=False,
bint self_destruct=False,
str maps_as_pydicts=None,
- types_mapper=None
+ types_mapper=None,
+ bint coerce_temporal_nanoseconds=False
):
"""
Convert to a pandas-compatible NumPy array or DataFrame, as appropriate
@@ -721,12 +722,15 @@ cdef class _PandasConvertible(_Weakrefable):
integer_object_nulls : bool, default False
Cast integers with nulls to objects
date_as_object : bool, default True
- Cast dates to objects. If False, convert to datetime64[ns] dtype.
+ Cast dates to objects. If False, convert to datetime64 dtype with
+ the equivalent time unit (if supported). Note: in pandas version
+ < 2.0, only datetime64[ns] conversion is supported.
timestamp_as_object : bool, default False
Cast non-nanosecond timestamps (np.datetime64) to objects. This is
- useful if you have timestamps that don't fit in the normal date
- range of nanosecond timestamps (1678 CE-2262 CE).
- If False, all timestamps are converted to datetime64[ns] dtype.
+ useful in pandas version 1.x if you have timestamps that don't fit
+ in the normal date range of nanosecond timestamps (1678 CE-2262
CE).
+ Non-nanosecond timestamps are supported in pandas version 2.0.
+ If False, all timestamps are converted to datetime64 dtype.
use_threads : bool, default True
Whether to parallelize the conversion using multiple threads.
deduplicate_objects : bool, default True
@@ -775,6 +779,13 @@ cdef class _PandasConvertible(_Weakrefable):
expected to return a pandas ExtensionDtype or ``None`` if the
default conversion should be used for that type. If you have
a dictionary mapping, you can pass ``dict.get`` as function.
+ coerce_temporal_nanoseconds : bool, default False
+ Only applicable to pandas version >= 2.0.
+ A legacy option to coerce date32, date64, duration, and timestamp
+ time units to nanoseconds when converting to pandas. This is the
+ default behavior in pandas version 1.x. Set this option to True if
+ you'd like to use this coercion when using pandas version >= 2.0
+ for backwards compatibility (not recommended otherwise).
Returns
-------
@@ -850,7 +861,8 @@ cdef class _PandasConvertible(_Weakrefable):
safe=safe,
split_blocks=split_blocks,
self_destruct=self_destruct,
- maps_as_pydicts=maps_as_pydicts
+ maps_as_pydicts=maps_as_pydicts,
+ coerce_temporal_nanoseconds=coerce_temporal_nanoseconds
)
return self._to_pandas(options, categories=categories,
ignore_metadata=ignore_metadata,
@@ -870,6 +882,7 @@ cdef PandasOptions _convert_pandas_options(dict options):
result.safe_cast = options['safe']
result.split_blocks = options['split_blocks']
result.self_destruct = options['self_destruct']
+ result.coerce_temporal_nanoseconds = options['coerce_temporal_nanoseconds']
result.ignore_timezone = os.environ.get('PYARROW_IGNORE_TIMEZONE', False)
maps_as_pydicts = options['maps_as_pydicts']
@@ -1525,6 +1538,7 @@ cdef class Array(_PandasConvertible):
# so it can't be done if the user requested a zero_copy.
c_options.decode_dictionaries = not zero_copy_only
c_options.zero_copy_only = zero_copy_only
+ c_options.to_numpy = True
with nogil:
check_status(ConvertArrayToPandas(c_options, self.sp_array,
@@ -1689,8 +1703,9 @@ cdef _array_like_to_pandas(obj, options, types_mapper):
arr = dtype.__from_arrow__(obj)
return pandas_api.series(arr, name=name, copy=False)
- # ARROW-3789(wesm): Convert date/timestamp types to datetime64[ns]
- c_options.coerce_temporal_nanoseconds = True
+ if pandas_api.is_v1():
+ # ARROW-3789: Coerce date/timestamp types to datetime64[ns]
+ c_options.coerce_temporal_nanoseconds = True
if isinstance(obj, Array):
with nogil:
diff --git a/python/pyarrow/includes/libarrow_python.pxd
b/python/pyarrow/includes/libarrow_python.pxd
index 2052600c9f..f08fcaa40d 100644
--- a/python/pyarrow/includes/libarrow_python.pxd
+++ b/python/pyarrow/includes/libarrow_python.pxd
@@ -197,6 +197,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py"
nogil:
c_bool decode_dictionaries
unordered_set[c_string] categorical_columns
unordered_set[c_string] extension_columns
+ c_bool to_numpy
cdef cppclass CSerializedPyObject" arrow::py::SerializedPyObject":
shared_ptr[CRecordBatch] batch
diff --git a/python/pyarrow/pandas-shim.pxi b/python/pyarrow/pandas-shim.pxi
index 7dc5d590a7..a0c0cabf6d 100644
--- a/python/pyarrow/pandas-shim.pxi
+++ b/python/pyarrow/pandas-shim.pxi
@@ -37,6 +37,7 @@ cdef class _PandasAPIShim(object):
object _array_like_types, _is_extension_array_dtype
bint has_sparse
bint _pd024
+ bint _is_v1
def __init__(self):
self._tried_importing_pandas = False
@@ -58,6 +59,7 @@ cdef class _PandasAPIShim(object):
self._pd = pd
self._version = pd.__version__
self._loose_version = Version(pd.__version__)
+ self._is_v1 = False
if self._loose_version < Version('1.0.0'):
self._have_pandas = False
@@ -72,6 +74,8 @@ cdef class _PandasAPIShim(object):
"installed. Therefore, pandas-specific integration is not "
"used.".format(self._version), stacklevel=2)
return
+ elif self._loose_version < Version('2.0.0'):
+ self._is_v1 = True
self._compat_module = pdcompat
self._data_frame = pd.DataFrame
@@ -150,6 +154,10 @@ cdef class _PandasAPIShim(object):
self._check_import()
return self._version
+ def is_v1(self):
+ self._check_import()
+ return self._is_v1
+
@property
def categorical_type(self):
self._check_import()
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 5369677e87..12f1cc4312 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -714,7 +714,8 @@ def _reconstruct_block(item, columns=None,
extension_columns=None):
ordered=item['ordered'])
block = _int.make_block(cat, placement=placement)
elif 'timezone' in item:
- dtype = make_datetimetz(item['timezone'])
+ unit, _ = np.datetime_data(block_arr.dtype)
+ dtype = make_datetimetz(unit, item['timezone'])
block = _int.make_block(block_arr, placement=placement,
klass=_int.DatetimeTZBlock,
dtype=dtype)
@@ -738,9 +739,11 @@ def _reconstruct_block(item, columns=None,
extension_columns=None):
return block
-def make_datetimetz(tz):
+def make_datetimetz(unit, tz):
+ if _pandas_api.is_v1():
+ unit = 'ns' # ARROW-3789: Coerce date/timestamp types to
datetime64[ns]
tz = pa.lib.string_to_tzinfo(tz)
- return _pandas_api.datetimetz_type('ns', tz=tz)
+ return _pandas_api.datetimetz_type(unit, tz=tz)
def table_to_blockmanager(options, table, categories=None,
diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
index 2cd6f5c26d..91c7b8a457 100644
--- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
+++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
@@ -343,6 +343,9 @@ class PandasWriter {
DATETIME_MILLI,
DATETIME_MICRO,
DATETIME_NANO,
+ DATETIME_SECOND_TZ,
+ DATETIME_MILLI_TZ,
+ DATETIME_MICRO_TZ,
DATETIME_NANO_TZ,
TIMEDELTA_SECOND,
TIMEDELTA_MILLI,
@@ -1488,7 +1491,7 @@ class BoolWriter : public TypedPandasWriter<NPY_BOOL> {
// Date / timestamp types
template <typename T, int64_t SHIFT>
-inline void ConvertDatetimeLikeNanos(const ChunkedArray& data, int64_t*
out_values) {
+inline void ConvertDatetime(const ChunkedArray& data, int64_t* out_values) {
for (int c = 0; c < data.num_chunks(); c++) {
const auto& arr = *data.chunk(c);
const T* in_values = GetPrimitiveValues<T>(arr);
@@ -1570,7 +1573,30 @@ class DatetimeWriter : public
TypedPandasWriter<NPY_DATETIME> {
};
using DatetimeSecondWriter = DatetimeWriter<TimeUnit::SECOND>;
-using DatetimeMilliWriter = DatetimeWriter<TimeUnit::MILLI>;
+
+class DatetimeMilliWriter : public DatetimeWriter<TimeUnit::MILLI> {
+ public:
+ using DatetimeWriter<TimeUnit::MILLI>::DatetimeWriter;
+
+ Status CopyInto(std::shared_ptr<ChunkedArray> data, int64_t rel_placement)
override {
+ Type::type type = data->type()->id();
+ int64_t* out_values = this->GetBlockColumnStart(rel_placement);
+ if (type == Type::DATE32) {
+ // Convert from days since epoch to datetime64[ms]
+ ConvertDatetime<int32_t, 86400000L>(*data, out_values);
+ } else if (type == Type::DATE64) {
+ ConvertNumericNullable<int64_t>(*data, kPandasTimestampNull, out_values);
+ } else {
+ const auto& ts_type = checked_cast<const TimestampType&>(*data->type());
+ DCHECK_EQ(TimeUnit::MILLI, ts_type.unit())
+ << "Should only call instances of this writer "
+ << "with arrays of the correct unit";
+ ConvertNumericNullable<int64_t>(*data, kPandasTimestampNull, out_values);
+ }
+ return Status::OK();
+ }
+};
+
using DatetimeMicroWriter = DatetimeWriter<TimeUnit::MICRO>;
class DatetimeNanoWriter : public DatetimeWriter<TimeUnit::NANO> {
@@ -1592,11 +1618,11 @@ class DatetimeNanoWriter : public
DatetimeWriter<TimeUnit::NANO> {
if (type == Type::DATE32) {
// Convert from days since epoch to datetime64[ns]
- ConvertDatetimeLikeNanos<int32_t, kNanosecondsInDay>(*data, out_values);
+ ConvertDatetime<int32_t, kNanosecondsInDay>(*data, out_values);
} else if (type == Type::DATE64) {
// Date64Type is millisecond timestamp stored as int64_t
// TODO(wesm): Do we want to make sure to zero out the milliseconds?
- ConvertDatetimeLikeNanos<int64_t, 1000000L>(*data, out_values);
+ ConvertDatetime<int64_t, 1000000L>(*data, out_values);
} else if (type == Type::TIMESTAMP) {
const auto& ts_type = checked_cast<const TimestampType&>(*data->type());
@@ -1619,16 +1645,17 @@ class DatetimeNanoWriter : public
DatetimeWriter<TimeUnit::NANO> {
}
};
-class DatetimeTZWriter : public DatetimeNanoWriter {
+template <typename BASE>
+class DatetimeTZWriter : public BASE {
public:
DatetimeTZWriter(const PandasOptions& options, const std::string& timezone,
int64_t num_rows)
- : DatetimeNanoWriter(options, num_rows, 1), timezone_(timezone) {}
+ : BASE(options, num_rows, 1), timezone_(timezone) {}
protected:
Status GetResultBlock(PyObject** out) override {
- RETURN_NOT_OK(MakeBlock1D());
- *out = block_arr_.obj();
+ RETURN_NOT_OK(this->MakeBlock1D());
+ *out = this->block_arr_.obj();
return Status::OK();
}
@@ -1645,6 +1672,11 @@ class DatetimeTZWriter : public DatetimeNanoWriter {
std::string timezone_;
};
+using DatetimeSecondTZWriter = DatetimeTZWriter<DatetimeSecondWriter>;
+using DatetimeMilliTZWriter = DatetimeTZWriter<DatetimeMilliWriter>;
+using DatetimeMicroTZWriter = DatetimeTZWriter<DatetimeMicroWriter>;
+using DatetimeNanoTZWriter = DatetimeTZWriter<DatetimeNanoWriter>;
+
template <TimeUnit::type UNIT>
class TimedeltaWriter : public TypedPandasWriter<NPY_TIMEDELTA> {
public:
@@ -1690,11 +1722,11 @@ class TimedeltaNanoWriter : public
TimedeltaWriter<TimeUnit::NANO> {
if (ts_type.unit() == TimeUnit::NANO) {
ConvertNumericNullable<int64_t>(*data, kPandasTimestampNull,
out_values);
} else if (ts_type.unit() == TimeUnit::MICRO) {
- ConvertDatetimeLikeNanos<int64_t, 1000L>(*data, out_values);
+ ConvertDatetime<int64_t, 1000L>(*data, out_values);
} else if (ts_type.unit() == TimeUnit::MILLI) {
- ConvertDatetimeLikeNanos<int64_t, 1000000L>(*data, out_values);
+ ConvertDatetime<int64_t, 1000000L>(*data, out_values);
} else if (ts_type.unit() == TimeUnit::SECOND) {
- ConvertDatetimeLikeNanos<int64_t, 1000000000L>(*data, out_values);
+ ConvertDatetime<int64_t, 1000000000L>(*data, out_values);
} else {
return Status::NotImplemented("Unsupported time unit");
}
@@ -1945,6 +1977,12 @@ Status MakeWriter(const PandasOptions& options,
PandasWriter::type writer_type,
*writer = std::make_shared<CategoricalWriter<TYPE>>(options, num_rows); \
break;
+#define TZ_CASE(NAME, TYPE) \
+ case PandasWriter::NAME: { \
+ const auto& ts_type = checked_cast<const TimestampType&>(type); \
+ *writer = std::make_shared<TYPE>(options, ts_type.timezone(), num_rows); \
+ } break;
+
switch (writer_type) {
case PandasWriter::CATEGORICAL: {
const auto& index_type = *checked_cast<const
DictionaryType&>(type).index_type();
@@ -1991,10 +2029,10 @@ Status MakeWriter(const PandasOptions& options,
PandasWriter::type writer_type,
BLOCK_CASE(TIMEDELTA_MILLI, TimedeltaMilliWriter);
BLOCK_CASE(TIMEDELTA_MICRO, TimedeltaMicroWriter);
BLOCK_CASE(TIMEDELTA_NANO, TimedeltaNanoWriter);
- case PandasWriter::DATETIME_NANO_TZ: {
- const auto& ts_type = checked_cast<const TimestampType&>(type);
- *writer = std::make_shared<DatetimeTZWriter>(options,
ts_type.timezone(), num_rows);
- } break;
+ TZ_CASE(DATETIME_SECOND_TZ, DatetimeSecondTZWriter);
+ TZ_CASE(DATETIME_MILLI_TZ, DatetimeMilliTZWriter);
+ TZ_CASE(DATETIME_MICRO_TZ, DatetimeMicroTZWriter);
+ TZ_CASE(DATETIME_NANO_TZ, DatetimeNanoTZWriter);
default:
return Status::NotImplemented("Unsupported block type");
}
@@ -2057,13 +2095,25 @@ static Status GetPandasWriterType(const ChunkedArray&
data, const PandasOptions&
case Type::INTERVAL_MONTH_DAY_NANO: // fall through
*output_type = PandasWriter::OBJECT;
break;
- case Type::DATE32: // fall through
+ case Type::DATE32:
+ if (options.date_as_object) {
+ *output_type = PandasWriter::OBJECT;
+ } else if (options.coerce_temporal_nanoseconds) {
+ *output_type = PandasWriter::DATETIME_NANO;
+ } else if (options.to_numpy) {
+ // Numpy supports Day, but Pandas does not
+ *output_type = PandasWriter::DATETIME_DAY;
+ } else {
+ *output_type = PandasWriter::DATETIME_MILLI;
+ }
+ break;
case Type::DATE64:
if (options.date_as_object) {
*output_type = PandasWriter::OBJECT;
+ } else if (options.coerce_temporal_nanoseconds) {
+ *output_type = PandasWriter::DATETIME_NANO;
} else {
- *output_type = options.coerce_temporal_nanoseconds ?
PandasWriter::DATETIME_NANO
- :
PandasWriter::DATETIME_DAY;
+ *output_type = PandasWriter::DATETIME_MILLI;
}
break;
case Type::TIMESTAMP: {
@@ -2072,24 +2122,43 @@ static Status GetPandasWriterType(const ChunkedArray&
data, const PandasOptions&
// Nanoseconds are never out of bounds for pandas, so in that case
// we don't convert to object
*output_type = PandasWriter::OBJECT;
- } else if (!ts_type.timezone().empty()) {
- *output_type = PandasWriter::DATETIME_NANO_TZ;
} else if (options.coerce_temporal_nanoseconds) {
- *output_type = PandasWriter::DATETIME_NANO;
+ if (!ts_type.timezone().empty()) {
+ *output_type = PandasWriter::DATETIME_NANO_TZ;
+ } else {
+ *output_type = PandasWriter::DATETIME_NANO;
+ }
} else {
- switch (ts_type.unit()) {
- case TimeUnit::SECOND:
- *output_type = PandasWriter::DATETIME_SECOND;
- break;
- case TimeUnit::MILLI:
- *output_type = PandasWriter::DATETIME_MILLI;
- break;
- case TimeUnit::MICRO:
- *output_type = PandasWriter::DATETIME_MICRO;
- break;
- case TimeUnit::NANO:
- *output_type = PandasWriter::DATETIME_NANO;
- break;
+ if (!ts_type.timezone().empty()) {
+ switch (ts_type.unit()) {
+ case TimeUnit::SECOND:
+ *output_type = PandasWriter::DATETIME_SECOND_TZ;
+ break;
+ case TimeUnit::MILLI:
+ *output_type = PandasWriter::DATETIME_MILLI_TZ;
+ break;
+ case TimeUnit::MICRO:
+ *output_type = PandasWriter::DATETIME_MICRO_TZ;
+ break;
+ case TimeUnit::NANO:
+ *output_type = PandasWriter::DATETIME_NANO_TZ;
+ break;
+ }
+ } else {
+ switch (ts_type.unit()) {
+ case TimeUnit::SECOND:
+ *output_type = PandasWriter::DATETIME_SECOND;
+ break;
+ case TimeUnit::MILLI:
+ *output_type = PandasWriter::DATETIME_MILLI;
+ break;
+ case TimeUnit::MICRO:
+ *output_type = PandasWriter::DATETIME_MICRO;
+ break;
+ case TimeUnit::NANO:
+ *output_type = PandasWriter::DATETIME_NANO;
+ break;
+ }
}
}
} break;
@@ -2243,6 +2312,9 @@ class ConsolidatedBlockCreator : public
PandasBlockCreator {
int block_placement = 0;
std::shared_ptr<PandasWriter> writer;
if (output_type == PandasWriter::CATEGORICAL ||
+ output_type == PandasWriter::DATETIME_SECOND_TZ ||
+ output_type == PandasWriter::DATETIME_MILLI_TZ ||
+ output_type == PandasWriter::DATETIME_MICRO_TZ ||
output_type == PandasWriter::DATETIME_NANO_TZ ||
output_type == PandasWriter::EXTENSION) {
RETURN_NOT_OK(MakeWriter(options_, output_type, type, num_rows_,
@@ -2278,6 +2350,9 @@ class ConsolidatedBlockCreator : public
PandasBlockCreator {
PandasWriter::type output_type = this->column_types_[i];
switch (output_type) {
case PandasWriter::CATEGORICAL:
+ case PandasWriter::DATETIME_SECOND_TZ:
+ case PandasWriter::DATETIME_MILLI_TZ:
+ case PandasWriter::DATETIME_MICRO_TZ:
case PandasWriter::DATETIME_NANO_TZ:
case PandasWriter::EXTENSION: {
auto it = this->singleton_blocks_.find(i);
diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.h
b/python/pyarrow/src/arrow/python/arrow_to_pandas.h
index 1da88961d3..82e0a60051 100644
--- a/python/pyarrow/src/arrow/python/arrow_to_pandas.h
+++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.h
@@ -117,6 +117,10 @@ struct PandasOptions {
// Columns that should be passed through to be converted to
// ExtensionArray/Block
std::unordered_set<std::string> extension_columns;
+
+ // Used internally to decipher between to_numpy() and to_pandas() when
+ // the expected output differs
+ bool to_numpy = false;
};
ARROW_PYTHON_EXPORT
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index fd5ba263d2..238fdb86bc 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -480,6 +480,8 @@ cdef class ChunkedArray(_PandasConvertible):
PandasOptions c_options
object values
+ c_options.to_numpy = True
+
with nogil:
check_status(
ConvertChunkedArrayToPandas(
@@ -2981,8 +2983,9 @@ def table_to_blocks(options, Table table, categories,
extension_columns):
c_options.extension_columns = {tobytes(col)
for col in extension_columns}
- # ARROW-3789(wesm); Convert date/timestamp types to datetime64[ns]
- c_options.coerce_temporal_nanoseconds = True
+ if pandas_api.is_v1():
+ # ARROW-3789: Coerce date/timestamp types to datetime64[ns]
+ c_options.coerce_temporal_nanoseconds = True
if c_options.self_destruct:
# Move the shared_ptr, table is now unsafe to use further
diff --git a/python/pyarrow/tests/parquet/common.py
b/python/pyarrow/tests/parquet/common.py
index f4e609c6ff..4401d3ca6b 100644
--- a/python/pyarrow/tests/parquet/common.py
+++ b/python/pyarrow/tests/parquet/common.py
@@ -150,8 +150,7 @@ def make_sample_file(table_or_df):
a_table = pa.Table.from_pandas(table_or_df)
buf = io.BytesIO()
- _write_table(a_table, buf, compression='SNAPPY', version='2.6',
- coerce_timestamps='ms')
+ _write_table(a_table, buf, compression='SNAPPY', version='2.6')
buf.seek(0)
return pq.ParquetFile(buf)
@@ -173,11 +172,13 @@ def alltypes_sample(size=10000, seed=0,
categorical=False):
'float32': np.arange(size, dtype=np.float32),
'float64': np.arange(size, dtype=np.float64),
'bool': np.random.randn(size) > 0,
- # TODO(wesm): Test other timestamp resolutions now that arrow supports
- # them
- 'datetime': np.arange("2016-01-01T00:00:00.001", size,
- dtype='datetime64[ms]').astype('datetime64[ns]'),
- 'timedelta': np.arange(0, size, dtype="timedelta64[ns]"),
+ 'datetime_ms': np.arange("2016-01-01T00:00:00.001", size,
+ dtype='datetime64[ms]'),
+ 'datetime_us': np.arange("2016-01-01T00:00:00.000001", size,
+ dtype='datetime64[us]'),
+ 'datetime_ns': np.arange("2016-01-01T00:00:00.000000001", size,
+ dtype='datetime64[ns]'),
+ 'timedelta': np.arange(0, size, dtype="timedelta64[s]"),
'str': pd.Series([str(x) for x in range(size)]),
'empty_str': [''] * size,
'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
diff --git a/python/pyarrow/tests/parquet/test_data_types.py
b/python/pyarrow/tests/parquet/test_data_types.py
index 109d82831c..32fe128bba 100644
--- a/python/pyarrow/tests/parquet/test_data_types.py
+++ b/python/pyarrow/tests/parquet/test_data_types.py
@@ -64,7 +64,7 @@ def test_parquet_2_0_roundtrip(tempdir, chunk_size,
use_legacy_dataset):
assert arrow_table.schema.pandas_metadata is not None
_write_table(arrow_table, filename, version='2.6',
- coerce_timestamps='ms', chunk_size=chunk_size)
+ chunk_size=chunk_size)
table_read = pq.read_pandas(
filename, use_legacy_dataset=use_legacy_dataset)
assert table_read.schema.pandas_metadata is not None
diff --git a/python/pyarrow/tests/parquet/test_dataset.py
b/python/pyarrow/tests/parquet/test_dataset.py
index c9a0c63eb1..cd991617c9 100644
--- a/python/pyarrow/tests/parquet/test_dataset.py
+++ b/python/pyarrow/tests/parquet/test_dataset.py
@@ -1262,13 +1262,14 @@ def _test_write_to_dataset_with_partitions(base_path,
import pyarrow.parquet as pq
# ARROW-1400
- output_df = pd.DataFrame({'group1': list('aaabbbbccc'),
- 'group2': list('eefeffgeee'),
- 'num': list(range(10)),
- 'nan': [np.nan] * 10,
- 'date': np.arange('2017-01-01', '2017-01-11',
- dtype='datetime64[D]')})
- output_df["date"] = output_df["date"].astype('datetime64[ns]')
+ output_df = pd.DataFrame({
+ 'group1': list('aaabbbbccc'),
+ 'group2': list('eefeffgeee'),
+ 'num': list(range(10)),
+ 'nan': [np.nan] * 10,
+ 'date': np.arange('2017-01-01', '2017-01-11',
dtype='datetime64[D]').astype(
+ 'datetime64[ns]')
+ })
cols = output_df.columns.tolist()
partition_by = ['group1', 'group2']
output_table = pa.Table.from_pandas(output_df, schema=schema, safe=False,
@@ -1313,6 +1314,11 @@ def _test_write_to_dataset_with_partitions(base_path,
# Partitioned columns become 'categorical' dtypes
for col in partition_by:
output_df[col] = output_df[col].astype('category')
+
+ if schema:
+ expected_date_type =
schema.field_by_name('date').type.to_pandas_dtype()
+ output_df["date"] = output_df["date"].astype(expected_date_type)
+
tm.assert_frame_equal(output_df, input_df)
@@ -1324,12 +1330,13 @@ def _test_write_to_dataset_no_partitions(base_path,
import pyarrow.parquet as pq
# ARROW-1400
- output_df = pd.DataFrame({'group1': list('aaabbbbccc'),
- 'group2': list('eefeffgeee'),
- 'num': list(range(10)),
- 'date': np.arange('2017-01-01', '2017-01-11',
- dtype='datetime64[D]')})
- output_df["date"] = output_df["date"].astype('datetime64[ns]')
+ output_df = pd.DataFrame({
+ 'group1': list('aaabbbbccc'),
+ 'group2': list('eefeffgeee'),
+ 'num': list(range(10)),
+ 'date': np.arange('2017-01-01', '2017-01-11',
dtype='datetime64[D]').astype(
+ 'datetime64[ns]')
+ })
cols = output_df.columns.tolist()
output_table = pa.Table.from_pandas(output_df)
@@ -1355,7 +1362,7 @@ def _test_write_to_dataset_no_partitions(base_path,
input_df = input_table.to_pandas()
input_df = input_df.drop_duplicates()
input_df = input_df[cols]
- assert output_df.equals(input_df)
+ tm.assert_frame_equal(output_df, input_df)
@pytest.mark.pandas
@@ -1458,7 +1465,6 @@ def
test_write_to_dataset_with_partitions_and_custom_filenames(
'nan': [np.nan] * 10,
'date': np.arange('2017-01-01', '2017-01-11',
dtype='datetime64[D]')})
- output_df["date"] = output_df["date"].astype('datetime64[ns]')
partition_by = ['group1', 'group2']
output_table = pa.Table.from_pandas(output_df)
path = str(tempdir)
diff --git a/python/pyarrow/tests/parquet/test_datetime.py
b/python/pyarrow/tests/parquet/test_datetime.py
index 1cad82b839..f97c451df7 100644
--- a/python/pyarrow/tests/parquet/test_datetime.py
+++ b/python/pyarrow/tests/parquet/test_datetime.py
@@ -50,9 +50,11 @@ pytestmark = pytest.mark.parquet
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_pandas_parquet_datetime_tz(use_legacy_dataset):
- s = pd.Series([datetime.datetime(2017, 9, 6)])
+ # Pandas v2 defaults to [ns], but Arrow defaults to [us] time units
+ # so we need to cast the pandas dtype. Pandas v1 will always silently
+ # coerce to [ns] due to lack of non-[ns] support.
+ s = pd.Series([datetime.datetime(2017, 9, 6)], dtype='datetime64[us]')
s = s.dt.tz_localize('utc')
-
s.index = s
# Both a column and an index to hit both use cases
@@ -64,7 +66,7 @@ def test_pandas_parquet_datetime_tz(use_legacy_dataset):
arrow_table = pa.Table.from_pandas(df)
- _write_table(arrow_table, f, coerce_timestamps='ms')
+ _write_table(arrow_table, f)
f.seek(0)
table_read = pq.read_pandas(f, use_legacy_dataset=use_legacy_dataset)
@@ -153,7 +155,7 @@ def test_coerce_timestamps_truncated(tempdir):
df_ms = table_ms.to_pandas()
arrays_expected = {'datetime64': [dt_ms, dt_ms]}
- df_expected = pd.DataFrame(arrays_expected)
+ df_expected = pd.DataFrame(arrays_expected, dtype='datetime64[ms]')
tm.assert_frame_equal(df_expected, df_ms)
diff --git a/python/pyarrow/tests/parquet/test_pandas.py
b/python/pyarrow/tests/parquet/test_pandas.py
index 6bd68e08fc..0ed305bff1 100644
--- a/python/pyarrow/tests/parquet/test_pandas.py
+++ b/python/pyarrow/tests/parquet/test_pandas.py
@@ -59,7 +59,7 @@ def test_pandas_parquet_custom_metadata(tempdir):
arrow_table = pa.Table.from_pandas(df)
assert b'pandas' in arrow_table.schema.metadata
- _write_table(arrow_table, filename, version='2.6', coerce_timestamps='ms')
+ _write_table(arrow_table, filename)
metadata = pq.read_metadata(filename).metadata
assert b'pandas' in metadata
@@ -113,7 +113,7 @@ def test_pandas_parquet_column_multiindex(tempdir,
use_legacy_dataset):
arrow_table = pa.Table.from_pandas(df)
assert arrow_table.schema.pandas_metadata is not None
- _write_table(arrow_table, filename, version='2.6', coerce_timestamps='ms')
+ _write_table(arrow_table, filename)
table_read = pq.read_pandas(
filename, use_legacy_dataset=use_legacy_dataset)
@@ -136,7 +136,7 @@ def
test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_written(
# While index_columns should be empty, columns needs to be filled still.
assert js['columns']
- _write_table(arrow_table, filename, version='2.6', coerce_timestamps='ms')
+ _write_table(arrow_table, filename)
table_read = pq.read_pandas(
filename, use_legacy_dataset=use_legacy_dataset)
@@ -344,7 +344,12 @@ def test_index_column_name_duplicate(tempdir,
use_legacy_dataset):
}
}
path = str(tempdir / 'data.parquet')
- dfx = pd.DataFrame(data).set_index('time', drop=False)
+
+ # Pandas v2 defaults to [ns], but Arrow defaults to [us] time units
+ # so we need to cast the pandas dtype. Pandas v1 will always silently
+ # coerce to [ns] due to lack of non-[ns] support.
+ dfx = pd.DataFrame(data, dtype='datetime64[us]').set_index('time',
drop=False)
+
tdfx = pa.Table.from_pandas(dfx)
_write_table(tdfx, path)
arrow_table = _read_table(path, use_legacy_dataset=use_legacy_dataset)
diff --git a/python/pyarrow/tests/parquet/test_parquet_file.py
b/python/pyarrow/tests/parquet/test_parquet_file.py
index bd468949a8..9f920206a1 100644
--- a/python/pyarrow/tests/parquet/test_parquet_file.py
+++ b/python/pyarrow/tests/parquet/test_parquet_file.py
@@ -210,7 +210,7 @@ def test_iter_batches_columns_reader(tempdir, batch_size):
filename = tempdir / 'pandas_roundtrip.parquet'
arrow_table = pa.Table.from_pandas(df)
_write_table(arrow_table, filename, version='2.6',
- coerce_timestamps='ms', chunk_size=chunk_size)
+ chunk_size=chunk_size)
file_ = pq.ParquetFile(filename)
for columns in [df.columns[:10], df.columns[10:]]:
@@ -234,7 +234,7 @@ def test_iter_batches_reader(tempdir, chunk_size):
assert arrow_table.schema.pandas_metadata is not None
_write_table(arrow_table, filename, version='2.6',
- coerce_timestamps='ms', chunk_size=chunk_size)
+ chunk_size=chunk_size)
file_ = pq.ParquetFile(filename)
diff --git a/python/pyarrow/tests/test_array.py
b/python/pyarrow/tests/test_array.py
index a9e8f09d1b..ed29bf5cae 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -223,8 +223,9 @@ def test_to_numpy_writable():
@pytest.mark.parametrize('unit', ['s', 'ms', 'us', 'ns'])
-def test_to_numpy_datetime64(unit):
- arr = pa.array([1, 2, 3], pa.timestamp(unit))
[email protected]('tz', [None, "UTC"])
+def test_to_numpy_datetime64(unit, tz):
+ arr = pa.array([1, 2, 3], pa.timestamp(unit, tz=tz))
expected = np.array([1, 2, 3], dtype="datetime64[{}]".format(unit))
np_arr = arr.to_numpy()
np.testing.assert_array_equal(np_arr, expected)
@@ -2165,12 +2166,15 @@ def test_pandas_null_sentinels_index():
assert result.equals(expected)
-def test_array_from_numpy_datetimeD():
+def test_array_roundtrip_from_numpy_datetimeD():
arr = np.array([None, datetime.date(2017, 4, 4)], dtype='datetime64[D]')
result = pa.array(arr)
expected = pa.array([None, datetime.date(2017, 4, 4)], type=pa.date32())
assert result.equals(expected)
+ result = result.to_numpy(zero_copy_only=False)
+ np.testing.assert_array_equal(result, arr)
+ assert result.dtype == arr.dtype
def test_array_from_naive_datetimes():
diff --git a/python/pyarrow/tests/test_pandas.py
b/python/pyarrow/tests/test_pandas.py
index d6ff52d3e0..cf71b8b82d 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -67,10 +67,18 @@ def _alltypes_example(size=100):
'float32': np.arange(size, dtype=np.float32),
'float64': np.arange(size, dtype=np.float64),
'bool': np.random.randn(size) > 0,
- # TODO(wesm): Pandas only support ns resolution, Arrow supports s, ms,
- # us, ns
- 'datetime': np.arange("2016-01-01T00:00:00.001", size,
- dtype='datetime64[ms]').astype("datetime64[ns]"),
+ 'datetime[s]': np.arange("2016-01-01T00:00:00.001", size,
+ dtype='datetime64[s]'),
+ 'datetime[ms]': np.arange("2016-01-01T00:00:00.001", size,
+ dtype='datetime64[ms]'),
+ 'datetime[us]': np.arange("2016-01-01T00:00:00.001", size,
+ dtype='datetime64[us]'),
+ 'datetime[ns]': np.arange("2016-01-01T00:00:00.001", size,
+ dtype='datetime64[ns]'),
+ 'timedelta64[s]': np.arange(0, size, dtype='timedelta64[s]'),
+ 'timedelta64[ms]': np.arange(0, size, dtype='timedelta64[ms]'),
+ 'timedelta64[us]': np.arange(0, size, dtype='timedelta64[us]'),
+ 'timedelta64[ns]': np.arange(0, size, dtype='timedelta64[ns]'),
'str': [str(x) for x in range(size)],
'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
'empty_str': [''] * size
@@ -1017,27 +1025,30 @@ class TestConvertDateTimeLikeTypes:
expected_schema=schema,
)
- def test_timestamps_with_timezone(self):
+ @pytest.mark.parametrize('unit', ['s', 'ms', 'us', 'ns'])
+ def test_timestamps_with_timezone(self, unit):
+ if Version(pd.__version__) < Version("2.0.0") and unit != 'ns':
+ pytest.skip("pandas < 2.0 only supports nanosecond datetime64")
df = pd.DataFrame({
'datetime64': np.array([
'2007-07-13T01:23:34.123',
'2006-01-13T12:34:56.432',
'2010-08-13T05:46:57.437'],
- dtype='datetime64[ms]').astype("datetime64[ns]")
+ dtype=f'datetime64[{unit}]')
})
df['datetime64'] = df['datetime64'].dt.tz_localize('US/Eastern')
_check_pandas_roundtrip(df)
_check_series_roundtrip(df['datetime64'])
- # drop-in a null and ns instead of ms
+ # drop-in a null
df = pd.DataFrame({
'datetime64': np.array([
'2007-07-13T01:23:34.123456789',
None,
'2006-01-13T12:34:56.432539784',
'2010-08-13T05:46:57.437699912'],
- dtype='datetime64[ns]')
+ dtype=f'datetime64[{unit}]')
})
df['datetime64'] = df['datetime64'].dt.tz_localize('US/Eastern')
@@ -1054,8 +1065,11 @@ class TestConvertDateTimeLikeTypes:
assert isinstance(table[0].chunk(0), pa.TimestampArray)
result = table.to_pandas()
+ # Pandas v2 defaults to [ns], but Arrow defaults to [us] time units
+ # so we need to cast the pandas dtype. Pandas v1 will always silently
+ # coerce to [ns] due to lack of non-[ns] support.
expected_df = pd.DataFrame({
- 'datetime': date_array
+ 'datetime': pd.Series(date_array, dtype='datetime64[us]')
})
tm.assert_frame_equal(expected_df, result)
@@ -1108,7 +1122,12 @@ class TestConvertDateTimeLikeTypes:
assert isinstance(table[0].chunk(0), pa.TimestampArray)
result = table.to_pandas()
- expected_df = pd.DataFrame({"datetime": date_array})
+
+ # Pandas v2 defaults to [ns], but Arrow defaults to [us] time units
+ # so we need to cast the pandas dtype. Pandas v1 will always silently
+ # coerce to [ns] due to lack of non-[ns] support.
+ expected_df = pd.DataFrame(
+ {"datetime": pd.Series(date_array, dtype='datetime64[us]')})
# https://github.com/pandas-dev/pandas/issues/21142
expected_df["datetime"] = pd.to_datetime(expected_df["datetime"])
@@ -1169,31 +1188,42 @@ class TestConvertDateTimeLikeTypes:
assert arr.equals(expected)
- def test_array_types_date_as_object(self):
+ @pytest.mark.parametrize("coerce_to_ns,expected_dtype",
+ [(False, 'datetime64[ms]'),
+ (True, 'datetime64[ns]')])
+ def test_array_types_date_as_object(self, coerce_to_ns, expected_dtype):
data = [date(2000, 1, 1),
None,
date(1970, 1, 1),
date(2040, 2, 26)]
- expected_d = np.array(['2000-01-01', None, '1970-01-01',
- '2040-02-26'], dtype='datetime64[D]')
+ expected_days = np.array(['2000-01-01', None, '1970-01-01',
+ '2040-02-26'], dtype='datetime64[D]')
+
+ if Version(pd.__version__) < Version("2.0.0"):
+ # ARROW-3789: Coerce date/timestamp types to datetime64[ns]
+ expected_dtype = 'datetime64[ns]'
- expected_ns = np.array(['2000-01-01', None, '1970-01-01',
- '2040-02-26'], dtype='datetime64[ns]')
+ expected = np.array(['2000-01-01', None, '1970-01-01',
+ '2040-02-26'], dtype=expected_dtype)
objects = [pa.array(data),
pa.chunked_array([data])]
for obj in objects:
- result = obj.to_pandas()
- expected_obj = expected_d.astype(object)
+ result = obj.to_pandas(coerce_temporal_nanoseconds=coerce_to_ns)
+ expected_obj = expected_days.astype(object)
assert result.dtype == expected_obj.dtype
npt.assert_array_equal(result, expected_obj)
- result = obj.to_pandas(date_as_object=False)
- assert result.dtype == expected_ns.dtype
- npt.assert_array_equal(result, expected_ns)
+ result = obj.to_pandas(date_as_object=False,
+ coerce_temporal_nanoseconds=coerce_to_ns)
+ assert result.dtype == expected.dtype
+ npt.assert_array_equal(result, expected)
- def test_table_convert_date_as_object(self):
+ @pytest.mark.parametrize("coerce_to_ns,expected_type",
+ [(False, 'datetime64[ms]'),
+ (True, 'datetime64[ns]')])
+ def test_table_convert_date_as_object(self, coerce_to_ns, expected_type):
df = pd.DataFrame({
'date': [date(2000, 1, 1),
None,
@@ -1202,13 +1232,51 @@ class TestConvertDateTimeLikeTypes:
table = pa.Table.from_pandas(df, preserve_index=False)
- df_datetime = table.to_pandas(date_as_object=False)
+ df_datetime = table.to_pandas(date_as_object=False,
+ coerce_temporal_nanoseconds=coerce_to_ns)
df_object = table.to_pandas()
- tm.assert_frame_equal(df.astype('datetime64[ns]'), df_datetime,
+ tm.assert_frame_equal(df.astype(expected_type), df_datetime,
check_dtype=True)
tm.assert_frame_equal(df, df_object, check_dtype=True)
+ @pytest.mark.parametrize("arrow_type",
+ [pa.date32(), pa.date64(), pa.timestamp('s'),
+ pa.timestamp('ms'), pa.timestamp('us'),
+ pa.timestamp('ns'), pa.timestamp('s', 'UTC'),
+                              pa.timestamp('ms', 'UTC'), pa.timestamp('us', 'UTC'),
+ pa.timestamp('ns', 'UTC')])
+ def test_array_coerce_temporal_nanoseconds(self, arrow_type):
+ data = [date(2000, 1, 1), datetime(2001, 1, 1)]
+ expected = pd.Series(data)
+ arr = pa.array(data).cast(arrow_type)
+ result = arr.to_pandas(
+ coerce_temporal_nanoseconds=True, date_as_object=False)
+ expected_tz = None
+ if hasattr(arrow_type, 'tz') and arrow_type.tz is not None:
+ expected_tz = 'UTC'
+ expected_type = pa.timestamp('ns', expected_tz).to_pandas_dtype()
+ tm.assert_series_equal(result, expected.astype(expected_type))
+
+ @pytest.mark.parametrize("arrow_type",
+ [pa.date32(), pa.date64(), pa.timestamp('s'),
+ pa.timestamp('ms'), pa.timestamp('us'),
+ pa.timestamp('ns'), pa.timestamp('s', 'UTC'),
+                              pa.timestamp('ms', 'UTC'), pa.timestamp('us', 'UTC'),
+ pa.timestamp('ns', 'UTC')])
+ def test_table_coerce_temporal_nanoseconds(self, arrow_type):
+ data = [date(2000, 1, 1), datetime(2001, 1, 1)]
+ schema = pa.schema([pa.field('date', arrow_type)])
+ expected_df = pd.DataFrame({'date': data})
+ table = pa.table([pa.array(data)], schema=schema)
+ result_df = table.to_pandas(
+ coerce_temporal_nanoseconds=True, date_as_object=False)
+ expected_tz = None
+ if hasattr(arrow_type, 'tz') and arrow_type.tz is not None:
+ expected_tz = 'UTC'
+ expected_type = pa.timestamp('ns', expected_tz).to_pandas_dtype()
+ tm.assert_frame_equal(result_df, expected_df.astype(expected_type))
+
def test_date_infer(self):
df = pd.DataFrame({
'date': [date(2000, 1, 1),
@@ -1266,9 +1334,11 @@ class TestConvertDateTimeLikeTypes:
dtype='datetime64[D]'))
ex_values[1] = pd.NaT.value
- ex_datetime64ns = ex_values.astype('datetime64[ns]')
- expected_pandas = pd.DataFrame({'date32': ex_datetime64ns,
- 'date64': ex_datetime64ns},
+        # date32 and date64 convert to [ms] in pandas v2, but
+        # in pandas v1 they are silently coerced to [ns]
+ ex_datetime64ms = ex_values.astype('datetime64[ms]')
+ expected_pandas = pd.DataFrame({'date32': ex_datetime64ms,
+ 'date64': ex_datetime64ms},
columns=colnames)
table_pandas = table.to_pandas(date_as_object=False)
tm.assert_frame_equal(table_pandas, expected_pandas)
@@ -1428,8 +1498,11 @@ class TestConvertDateTimeLikeTypes:
dtype='datetime64[s]')
_check_array_from_pandas_roundtrip(datetime64_s)
- def test_timestamp_to_pandas_ns(self):
+ def test_timestamp_to_pandas_coerces_to_ns(self):
# non-ns timestamp gets cast to ns on conversion to pandas
+ if Version(pd.__version__) >= Version("2.0.0"):
+ pytest.skip("pandas >= 2.0 supports non-nanosecond datetime64")
+
arr = pa.array([1, 2, 3], pa.timestamp('ms'))
expected = pd.Series(pd.to_datetime([1, 2, 3], unit='ms'))
s = arr.to_pandas()
@@ -1440,13 +1513,7 @@ class TestConvertDateTimeLikeTypes:
def test_timestamp_to_pandas_out_of_bounds(self):
# ARROW-7758 check for out of bounds timestamps for non-ns timestamps
-
- if Version(pd.__version__) >= Version("2.1.0.dev"):
- # GH-35235: test fail due to __from_pyarrow__ being added to pandas
- # https://github.com/pandas-dev/pandas/pull/52201
- # Needs: https://github.com/apache/arrow/issues/33321
- pytest.skip(
-                "Need support converting to non-nano datetime64 for pandas >= 2.0")
+ # that end up getting coerced into ns timestamps.
for unit in ['s', 'ms', 'us']:
for tz in [None, 'America/New_York']:
@@ -1455,26 +1522,27 @@ class TestConvertDateTimeLikeTypes:
msg = "would result in out of bounds timestamp"
with pytest.raises(ValueError, match=msg):
- arr.to_pandas()
+ arr.to_pandas(coerce_temporal_nanoseconds=True)
with pytest.raises(ValueError, match=msg):
- table.to_pandas()
+ table.to_pandas(coerce_temporal_nanoseconds=True)
with pytest.raises(ValueError, match=msg):
# chunked array
- table.column('a').to_pandas()
+                table.column('a').to_pandas(coerce_temporal_nanoseconds=True)
# just ensure those don't give an error, but do not
# check actual garbage output
- arr.to_pandas(safe=False)
- table.to_pandas(safe=False)
- table.column('a').to_pandas(safe=False)
+ arr.to_pandas(safe=False, coerce_temporal_nanoseconds=True)
+ table.to_pandas(safe=False, coerce_temporal_nanoseconds=True)
+ table.column('a').to_pandas(
+ safe=False, coerce_temporal_nanoseconds=True)
def test_timestamp_to_pandas_empty_chunked(self):
# ARROW-7907 table with chunked array with 0 chunks
table = pa.table({'a': pa.chunked_array([], type=pa.timestamp('us'))})
result = table.to_pandas()
- expected = pd.DataFrame({'a': pd.Series([], dtype="datetime64[ns]")})
+ expected = pd.DataFrame({'a': pd.Series([], dtype="datetime64[us]")})
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize('dtype', [pa.date32(), pa.date64()])
@@ -1516,24 +1584,30 @@ class TestConvertDateTimeLikeTypes:
# TODO remove if https://github.com/apache/arrow/issues/15047 is fixed
_check_pandas_roundtrip(df, check_dtype=False)
- def test_timedeltas_no_nulls(self):
+ @pytest.mark.parametrize("unit", ['s', 'ms', 'us', 'ns'])
+ def test_timedeltas_no_nulls(self, unit):
+ if Version(pd.__version__) < Version("2.0.0"):
+ unit = 'ns'
df = pd.DataFrame({
'timedelta64': np.array([0, 3600000000000, 7200000000000],
- dtype='timedelta64[ns]')
+ dtype=f'timedelta64[{unit}]')
})
- field = pa.field('timedelta64', pa.duration('ns'))
+ field = pa.field('timedelta64', pa.duration(unit))
schema = pa.schema([field])
_check_pandas_roundtrip(
df,
expected_schema=schema,
)
- def test_timedeltas_nulls(self):
+ @pytest.mark.parametrize("unit", ['s', 'ms', 'us', 'ns'])
+ def test_timedeltas_nulls(self, unit):
+ if Version(pd.__version__) < Version("2.0.0"):
+ unit = 'ns'
df = pd.DataFrame({
'timedelta64': np.array([0, None, 7200000000000],
- dtype='timedelta64[ns]')
+ dtype=f'timedelta64[{unit}]')
})
- field = pa.field('timedelta64', pa.duration('ns'))
+ field = pa.field('timedelta64', pa.duration(unit))
schema = pa.schema([field])
_check_pandas_roundtrip(
df,
@@ -2855,7 +2929,7 @@ class TestConvertMisc:
cases.append(boolean_objects)
cases.append(np.arange("2016-01-01T00:00:00.001", N * K,
- dtype='datetime64[ms]').astype("datetime64[ns]")
+ dtype='datetime64[ms]')
.reshape(N, K).copy())
strided_mask = (random_numbers > 0).astype(bool)[:, 0]
@@ -3384,7 +3458,7 @@ def test_table_from_pandas_schema_with_custom_metadata():
assert table.schema.metadata.get(b'meta') == b'True'
-def test_table_from_pandas_schema_field_order_metadat():
+def test_table_from_pandas_schema_field_order_metadata():
# ARROW-10532
# ensure that a different field order in specified schema doesn't
# mangle metadata
@@ -3408,7 +3482,12 @@ def test_table_from_pandas_schema_field_order_metadat():
assert metadata_datetime["metadata"] == {'timezone': 'UTC'}
result = table.to_pandas()
- expected = df[["float", "datetime"]].astype({"float": "float32"})
+ coerce_cols_to_types = {"float": "float32"}
+    if Version(pd.__version__) >= Version("2.0.0"):
+        # Pandas v2 now supports non-nanosecond time units
+ coerce_cols_to_types["datetime"] = "datetime64[s, UTC]"
+ expected = df[["float", "datetime"]].astype(coerce_cols_to_types)
+
tm.assert_frame_equal(result, expected)
@@ -4179,20 +4258,20 @@ def test_to_pandas_extension_dtypes_mapping():
assert isinstance(result['a'].dtype, pd.PeriodDtype)
-def test_array_to_pandas():
[email protected]("arr",
+ [pd.period_range("2012-01-01", periods=3,
freq="D").array,
+ pd.interval_range(1, 4).array])
+def test_array_to_pandas(arr):
if Version(pd.__version__) < Version("1.1"):
pytest.skip("ExtensionDtype to_pandas method missing")
- for arr in [pd.period_range("2012-01-01", periods=3, freq="D").array,
- pd.interval_range(1, 4).array]:
- result = pa.array(arr).to_pandas()
- expected = pd.Series(arr)
- tm.assert_series_equal(result, expected)
-
- # TODO implement proper conversion for chunked array
- # result = pa.table({"col": arr})["col"].to_pandas()
- # expected = pd.Series(arr, name="col")
- # tm.assert_series_equal(result, expected)
+ result = pa.array(arr).to_pandas()
+ expected = pd.Series(arr)
+ tm.assert_series_equal(result, expected)
+
+ result = pa.table({"col": arr})["col"].to_pandas()
+ expected = pd.Series(arr, name="col")
+ tm.assert_series_equal(result, expected)
def test_roundtrip_empty_table_with_extension_dtype_index():
diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py
index 0c4dea673b..2f2417f590 100644
--- a/python/pyarrow/tests/test_schema.py
+++ b/python/pyarrow/tests/test_schema.py
@@ -25,6 +25,12 @@ import numpy as np
import pyarrow as pa
import pyarrow.tests.util as test_util
+from pyarrow.vendored.version import Version
+
+try:
+ import pandas as pd
+except ImportError:
+ pass
def test_schema_constructor_errors():
@@ -45,7 +51,9 @@ def test_type_integers():
def test_type_to_pandas_dtype():
- M8_ns = np.dtype('datetime64[ns]')
+ M8 = np.dtype('datetime64[ms]')
+ if Version(pd.__version__) < Version("2.0.0"):
+ M8 = np.dtype('datetime64[ns]')
cases = [
(pa.null(), np.object_),
(pa.bool_(), np.bool_),
@@ -60,9 +68,9 @@ def test_type_to_pandas_dtype():
(pa.float16(), np.float16),
(pa.float32(), np.float32),
(pa.float64(), np.float64),
- (pa.date32(), M8_ns),
- (pa.date64(), M8_ns),
- (pa.timestamp('ms'), M8_ns),
+ (pa.date32(), M8),
+ (pa.date64(), M8),
+ (pa.timestamp('ms'), M8),
(pa.binary(), np.object_),
(pa.binary(12), np.object_),
(pa.string(), np.object_),
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index a3311cbbcf..fbd4f8a94b 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -40,10 +40,21 @@ cdef dict _pandas_type_map = {
_Type_HALF_FLOAT: np.float16,
_Type_FLOAT: np.float32,
_Type_DOUBLE: np.float64,
- _Type_DATE32: np.dtype('datetime64[ns]'),
- _Type_DATE64: np.dtype('datetime64[ns]'),
- _Type_TIMESTAMP: np.dtype('datetime64[ns]'),
- _Type_DURATION: np.dtype('timedelta64[ns]'),
+ # Pandas does not support [D]ay, so default to [ms] for date32
+ _Type_DATE32: np.dtype('datetime64[ms]'),
+ _Type_DATE64: np.dtype('datetime64[ms]'),
+ _Type_TIMESTAMP: {
+ 's': np.dtype('datetime64[s]'),
+ 'ms': np.dtype('datetime64[ms]'),
+ 'us': np.dtype('datetime64[us]'),
+ 'ns': np.dtype('datetime64[ns]'),
+ },
+ _Type_DURATION: {
+ 's': np.dtype('timedelta64[s]'),
+ 'ms': np.dtype('timedelta64[ms]'),
+ 'us': np.dtype('timedelta64[us]'),
+ 'ns': np.dtype('timedelta64[ns]'),
+ },
_Type_BINARY: np.object_,
_Type_FIXED_SIZE_BINARY: np.object_,
_Type_STRING: np.object_,
@@ -115,6 +126,44 @@ def _is_primitive(Type type):
return is_primitive(type)
+def _get_pandas_type(arrow_type, coerce_to_ns=False):
+ cdef Type type_id = arrow_type.id
+ if type_id not in _pandas_type_map:
+ return None
+ if coerce_to_ns:
+ # ARROW-3789: Coerce date/timestamp types to datetime64[ns]
+ if type_id == _Type_DURATION:
+ return np.dtype('timedelta64[ns]')
+ return np.dtype('datetime64[ns]')
+ pandas_type = _pandas_type_map[type_id]
+ if isinstance(pandas_type, dict):
+ unit = getattr(arrow_type, 'unit', None)
+ pandas_type = pandas_type.get(unit, None)
+ return pandas_type
+
+
+def _get_pandas_tz_type(arrow_type, coerce_to_ns=False):
+ from pyarrow.pandas_compat import make_datetimetz
+ unit = 'ns' if coerce_to_ns else arrow_type.unit
+ return make_datetimetz(unit, arrow_type.tz)
+
+
+def _to_pandas_dtype(arrow_type, options=None):
+    coerce_to_ns = (options and options.get('coerce_temporal_nanoseconds', False)) or (
+ _pandas_api.is_v1() and arrow_type.id in
+ [_Type_DATE32, _Type_DATE64, _Type_TIMESTAMP, _Type_DURATION])
+
+ if getattr(arrow_type, 'tz', None):
+ dtype = _get_pandas_tz_type(arrow_type, coerce_to_ns)
+ else:
+ dtype = _get_pandas_type(arrow_type, coerce_to_ns)
+
+ if not dtype:
+ raise NotImplementedError(str(arrow_type))
+
+ return dtype
+
+
# Workaround for Cython parsing bug
# https://github.com/cython/cython/issues/2143
ctypedef CFixedWidthType* _CFixedWidthTypePtr
@@ -274,11 +323,7 @@ cdef class DataType(_Weakrefable):
>>> pa.int64().to_pandas_dtype()
<class 'numpy.int64'>
"""
- cdef Type type_id = self.type.id()
- if type_id in _pandas_type_map:
- return _pandas_type_map[type_id]
- else:
- raise NotImplementedError(str(self))
+ return _to_pandas_dtype(self)
def _export_to_c(self, out_ptr):
"""
@@ -1005,24 +1050,6 @@ cdef class TimestampType(DataType):
else:
return None
- def to_pandas_dtype(self):
- """
- Return the equivalent NumPy / Pandas dtype.
-
- Examples
- --------
- >>> import pyarrow as pa
- >>> t = pa.timestamp('s', tz='UTC')
- >>> t.to_pandas_dtype()
- datetime64[ns, UTC]
- """
- if self.tz is None:
- return _pandas_type_map[_Type_TIMESTAMP]
- else:
- # Return DatetimeTZ
- from pyarrow.pandas_compat import make_datetimetz
- return make_datetimetz(self.tz)
-
def __reduce__(self):
return timestamp, (self.unit, self.tz)