This is an automated email from the ASF dual-hosted git repository.
sgilmore pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 697f5013a8 GH-41476: [Python][C++] Impossible to specify
`is_adjusted_to_utc` for `Time` type when writing to Parquet (#47316)
697f5013a8 is described below
commit 697f5013a8037e422f6009ef9f216c1c170922c6
Author: Sarah Gilmore <[email protected]>
AuthorDate: Tue Aug 26 11:49:35 2025 -0400
GH-41476: [Python][C++] Impossible to specify `is_adjusted_to_utc` for
`Time` type when writing to Parquet (#47316)
### Rationale for this change
As of today, it's not possible to write Parquet `TIME` data whose
`isAdjustedToUTC` parameter is `false`. Instead, `isAdjustedToUTC` is
hard-coded to `true`
[here](https://github.com/apache/arrow/blob/2dd3ccda6437f79aa34641bd3197dd7392ae4aec/cpp/src/parquet/arrow/schema.cc#L431).
Unfortunately, some Parquet consumers only support `TIME` data if the
`isAdjustedToUTC` parameter is `false`, meaning they cannot import Parquet
`TIME` data generated by our Parquet Writer. For example, the apache/spark
Parquet reader only supports Parquet `TIME` columns if [`isAdjustedToUTC=false`
and
`units=MICROSECONDS`](https://github.com/apache/spark/blob/554f6b64f1e2b2346499f6d3340a3695244bfc84/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSc
[...]
Adding support for writing `TIME` data with the `isAdjustedToUTC` set to
`false` would unblock users who need to write Spark-compatible Parquet data.
### What changes are included in this PR?
1. Added a `write_time_adjusted_to_utc` property to
`parquet::ArrowWriterProperties`. If `true`, all `TIME` columns have their
`isAdjustedToUTC` parameter set to `true`. Otherwise, `isAdjustedToUTC` is set
to `false` for all `TIME` columns. This property is `false` by default.
2. Added a `set_time_adjusted_to_utc(bool)` method to
`parquet::ArrowWriterProperties::Builder` for configuring this property.
### Are these changes tested?
Yes. I added test case `ParquetTimeAdjustedToUTC` to test suite
`TestConvertArrowSchema`.
### Are there any user-facing changes?
Yes. Users can now configure the `isAdjustedToUTC` parameter for Parquet
`TIME` data.
NOTE: This change introduces an incompatibility. The default value for
`isAdjustedToUTC` parameter is now `false` instead of `true`.
### NOTE
1. I did not update the PyArrow interface because I am not familiar with
that code base. I was planning on creating a new GitHub issue to track that
work separately.
2. There already exists an open PR (#43268) for addressing this issue.
However, that PR was last active over a year ago and seems stale.
* GitHub Issue: #41476
Lead-authored-by: Sarah Gilmore <[email protected]>
Co-authored-by: Sarah Gilmore <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Sarah Gilmore <[email protected]>
---
cpp/src/parquet/arrow/arrow_schema_test.cc | 59 ++++++++++++++++++++++++++++--
cpp/src/parquet/arrow/schema.cc | 15 +++++---
cpp/src/parquet/properties.h | 28 ++++++++++++--
3 files changed, 89 insertions(+), 13 deletions(-)
diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc
b/cpp/src/parquet/arrow/arrow_schema_test.cc
index e1a4425149..73ce8ea69e 100644
--- a/cpp/src/parquet/arrow/arrow_schema_test.cc
+++ b/cpp/src/parquet/arrow/arrow_schema_test.cc
@@ -1352,11 +1352,11 @@ TEST_F(TestConvertArrowSchema, ArrowFields) {
{"float16", ::arrow::float16(), LogicalType::Float16(),
ParquetType::FIXED_LEN_BYTE_ARRAY, 2},
{"time32", ::arrow::time32(::arrow::TimeUnit::MILLI),
- LogicalType::Time(true, LogicalType::TimeUnit::MILLIS),
ParquetType::INT32, -1},
+ LogicalType::Time(false, LogicalType::TimeUnit::MILLIS),
ParquetType::INT32, -1},
{"time64(microsecond)", ::arrow::time64(::arrow::TimeUnit::MICRO),
- LogicalType::Time(true, LogicalType::TimeUnit::MICROS),
ParquetType::INT64, -1},
+ LogicalType::Time(false, LogicalType::TimeUnit::MICROS),
ParquetType::INT64, -1},
{"time64(nanosecond)", ::arrow::time64(::arrow::TimeUnit::NANO),
- LogicalType::Time(true, LogicalType::TimeUnit::NANOS),
ParquetType::INT64, -1},
+ LogicalType::Time(false, LogicalType::TimeUnit::NANOS),
ParquetType::INT64, -1},
{"timestamp(millisecond)", ::arrow::timestamp(::arrow::TimeUnit::MILLI),
LogicalType::Timestamp(false, LogicalType::TimeUnit::MILLIS,
/*is_from_converted_type=*/false,
@@ -1782,6 +1782,59 @@ TEST_F(TestConvertArrowSchema, ParquetFlatDecimals) {
ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields));
}
+TEST_F(TestConvertArrowSchema, ParquetTimeAdjustedToUTC) {
+ // Verify Parquet Time types have the appropriate isAdjustedToUTC value,
depending
+ // on the return value of ArrowWriterProperties::write_time_adjusted_to_utc()
+
+ struct FieldConstructionArguments {
+ std::string name;
+ std::shared_ptr<::arrow::DataType> datatype;
+ std::shared_ptr<const LogicalType> logical_type;
+ parquet::Type::type physical_type;
+ int physical_length;
+ };
+
+ auto run_test =
+ [this](const std::shared_ptr<ArrowWriterProperties>&
arrow_writer_properties,
+ bool time_adjusted_to_utc) {
+ std::vector<FieldConstructionArguments> cases = {
+ {"time32", ::arrow::time32(::arrow::TimeUnit::MILLI),
+ LogicalType::Time(time_adjusted_to_utc,
LogicalType::TimeUnit::MILLIS),
+ ParquetType::INT32, -1},
+ {"time64(microsecond)", ::arrow::time64(::arrow::TimeUnit::MICRO),
+ LogicalType::Time(time_adjusted_to_utc,
LogicalType::TimeUnit::MICROS),
+ ParquetType::INT64, -1},
+ {"time64(nanosecond)", ::arrow::time64(::arrow::TimeUnit::NANO),
+ LogicalType::Time(time_adjusted_to_utc,
LogicalType::TimeUnit::NANOS),
+ ParquetType::INT64, -1}};
+
+ std::vector<std::shared_ptr<Field>> arrow_fields;
+ std::vector<NodePtr> parquet_fields;
+ for (const FieldConstructionArguments& c : cases) {
+ arrow_fields.push_back(::arrow::field(c.name, c.datatype, false));
+ parquet_fields.push_back(PrimitiveNode::Make(c.name,
Repetition::REQUIRED,
+ c.logical_type,
c.physical_type,
+ c.physical_length));
+ }
+
+ EXPECT_EQ(arrow_writer_properties->write_time_adjusted_to_utc(),
+ time_adjusted_to_utc);
+ ASSERT_OK(ConvertSchema(arrow_fields, arrow_writer_properties));
+ CheckFlatSchema(parquet_fields);
+ };
+
+ // Verify write_time_adjusted_to_utc is false by default.
+ ArrowWriterProperties::Builder builder;
+ auto arrow_writer_properties = builder.build();
+ run_test(arrow_writer_properties, false);
+
+ arrow_writer_properties = builder.set_time_adjusted_to_utc(true)->build();
+ run_test(arrow_writer_properties, true);
+
+ arrow_writer_properties = builder.set_time_adjusted_to_utc(false)->build();
+ run_test(arrow_writer_properties, false);
+}
+
class TestConvertRoundTrip : public ::testing::Test {
public:
::arrow::Status RoundTripSchema(
diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc
index f0dc45b785..2beb2c66ef 100644
--- a/cpp/src/parquet/arrow/schema.cc
+++ b/cpp/src/parquet/arrow/schema.cc
@@ -420,18 +420,21 @@ Status FieldToNode(const std::string& name, const
std::shared_ptr<Field>& field,
break;
case ArrowTypeId::TIME32:
type = ParquetType::INT32;
- logical_type =
- LogicalType::Time(/*is_adjusted_to_utc=*/true,
LogicalType::TimeUnit::MILLIS);
+ logical_type = LogicalType::Time(
+ /*is_adjusted_to_utc=*/arrow_properties.write_time_adjusted_to_utc(),
+ LogicalType::TimeUnit::MILLIS);
break;
case ArrowTypeId::TIME64: {
type = ParquetType::INT64;
auto time_type = static_cast<::arrow::Time64Type*>(field->type().get());
if (time_type->unit() == ::arrow::TimeUnit::NANO) {
- logical_type =
- LogicalType::Time(/*is_adjusted_to_utc=*/true,
LogicalType::TimeUnit::NANOS);
+ logical_type = LogicalType::Time(
+
/*is_adjusted_to_utc=*/arrow_properties.write_time_adjusted_to_utc(),
+ LogicalType::TimeUnit::NANOS);
} else {
- logical_type =
- LogicalType::Time(/*is_adjusted_to_utc=*/true,
LogicalType::TimeUnit::MICROS);
+ logical_type = LogicalType::Time(
+
/*is_adjusted_to_utc=*/arrow_properties.write_time_adjusted_to_utc(),
+ LogicalType::TimeUnit::MICROS);
}
} break;
case ArrowTypeId::DURATION:
diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h
index bbaf6b5e71..27bf672f86 100644
--- a/cpp/src/parquet/properties.h
+++ b/cpp/src/parquet/properties.h
@@ -1161,7 +1161,8 @@ class PARQUET_EXPORT ArrowWriterProperties {
compliant_nested_types_(true),
engine_version_(V2),
use_threads_(kArrowDefaultUseThreads),
- executor_(NULLPTR) {}
+ executor_(NULLPTR),
+ write_time_adjusted_to_utc_(false) {}
virtual ~Builder() = default;
/// \brief Disable writing legacy int96 timestamps (default disabled).
@@ -1256,12 +1257,21 @@ class PARQUET_EXPORT ArrowWriterProperties {
return this;
}
+ /// \brief Set the value of isAdjustedToUTC when writing a TIME column
+ ///
+ /// Default is false because Arrow TIME data is expressed in an
unspecified timezone.
+ /// Note this setting doesn't affect TIMESTAMP data.
+ Builder* set_time_adjusted_to_utc(bool adjusted) {
+ write_time_adjusted_to_utc_ = adjusted;
+ return this;
+ }
+
/// Create the final properties.
std::shared_ptr<ArrowWriterProperties> build() {
return std::shared_ptr<ArrowWriterProperties>(new ArrowWriterProperties(
write_timestamps_as_int96_, coerce_timestamps_enabled_,
coerce_timestamps_unit_,
truncated_timestamps_allowed_, store_schema_,
compliant_nested_types_,
- engine_version_, use_threads_, executor_));
+ engine_version_, use_threads_, executor_,
write_time_adjusted_to_utc_));
}
private:
@@ -1277,6 +1287,8 @@ class PARQUET_EXPORT ArrowWriterProperties {
bool use_threads_;
::arrow::internal::Executor* executor_;
+
+ bool write_time_adjusted_to_utc_;
};
bool support_deprecated_int96_timestamps() const { return
write_timestamps_as_int96_; }
@@ -1310,6 +1322,11 @@ class PARQUET_EXPORT ArrowWriterProperties {
/// \brief Returns the executor used to write columns in parallel.
::arrow::internal::Executor* executor() const;
+ /// \brief The value of isAdjustedToUTC when writing a TIME column
+ ///
+ /// Note this setting doesn't affect TIMESTAMP data.
+ bool write_time_adjusted_to_utc() const { return
write_time_adjusted_to_utc_; }
+
private:
explicit ArrowWriterProperties(bool write_nanos_as_int96,
bool coerce_timestamps_enabled,
@@ -1317,7 +1334,8 @@ class PARQUET_EXPORT ArrowWriterProperties {
bool truncated_timestamps_allowed, bool
store_schema,
bool compliant_nested_types,
EngineVersion engine_version, bool
use_threads,
- ::arrow::internal::Executor* executor)
+ ::arrow::internal::Executor* executor,
+ bool write_time_adjusted_to_utc)
: write_timestamps_as_int96_(write_nanos_as_int96),
coerce_timestamps_enabled_(coerce_timestamps_enabled),
coerce_timestamps_unit_(coerce_timestamps_unit),
@@ -1326,7 +1344,8 @@ class PARQUET_EXPORT ArrowWriterProperties {
compliant_nested_types_(compliant_nested_types),
engine_version_(engine_version),
use_threads_(use_threads),
- executor_(executor) {}
+ executor_(executor),
+ write_time_adjusted_to_utc_(write_time_adjusted_to_utc) {}
const bool write_timestamps_as_int96_;
const bool coerce_timestamps_enabled_;
@@ -1337,6 +1356,7 @@ class PARQUET_EXPORT ArrowWriterProperties {
const EngineVersion engine_version_;
const bool use_threads_;
::arrow::internal::Executor* executor_;
+ const bool write_time_adjusted_to_utc_;
};
/// \brief State object used for writing Arrow data directly to a Parquet