This is an automated email from the ASF dual-hosted git repository.

sgilmore pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 697f5013a8 GH-41476: [Python][C++] Impossible to specify 
`is_adjusted_to_utc` for `Time` type when writing to Parquet (#47316)
697f5013a8 is described below

commit 697f5013a8037e422f6009ef9f216c1c170922c6
Author: Sarah Gilmore <[email protected]>
AuthorDate: Tue Aug 26 11:49:35 2025 -0400

    GH-41476: [Python][C++] Impossible to specify `is_adjusted_to_utc` for 
`Time` type when writing to Parquet (#47316)
    
    ### Rationale for this change
    
    As of today, it's not possible to write Parquet `TIME` data whose 
`isAdjustedToUTC` parameter is `false`. Instead, `isAdjustedToUTC` is 
hard-coded to `true` 
[here](https://github.com/apache/arrow/blob/2dd3ccda6437f79aa34641bd3197dd7392ae4aec/cpp/src/parquet/arrow/schema.cc#L431).
    
    Unfortunately, some Parquet consumers only support `TIME` data if the 
`isAdjustedToUTC` parameter is `false`, meaning they cannot import Parquet 
`TIME` data generated by our Parquet Writer.  For example, the apache/spark 
Parquet reader only supports Parquet `TIME` columns if [`isAdjustedToUTC=false` 
and 
`units=MICROSECONDS`](https://github.com/apache/spark/blob/554f6b64f1e2b2346499f6d3340a3695244bfc84/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSc
 [...]
    
    Adding support for writing `TIME` data with the `isAdjustedToUTC` set to 
`false` would unblock users who need to write Spark-compatible Parquet data.
    
    ### What changes are included in this PR?
    
    1. Added a `write_time_adjusted_to_utc` property to 
`parquet::ArrowWriterProperties`. If `true`, all `TIME` columns have their 
`isAdjustedToUTC` parameters set to `true`. Otherwise, `isAdjustedToUTC` is set 
to `false` for all `TIME` columns. This property is `false` by default.
    2. Added a `set_time_adjusted_to_utc(bool)` method to 
`parquet::ArrowWriterProperties::Builder`.
    
    ### Are these changes tested?
    
    Yes. I added test case `ParquetTimeAdjustedToUTC` to test suite 
`TestConvertArrowSchema`.
    
    ### Are there any user-facing changes?
    
    Yes. Users can now configure the `isAdjustedToUTC` parameter for Parquet 
`TIME` data.
    
    NOTE: This change introduces an incompatibility. The default value for 
`isAdjustedToUTC` parameter is now `false` instead of `true`.
    
    ### NOTE
    
    1. I did not update the PyArrow interface because I am not familiar with 
that code base. I was planning on creating a new GitHub issue to track that 
work separately.
    2. There already exists an open PR (#43268) for addressing this issue. 
However, that PR was last active over a year ago and seems stale.
    
    * GitHub Issue: #41476
    
    Lead-authored-by: Sarah Gilmore <[email protected]>
    Co-authored-by: Sarah Gilmore <[email protected]>
    Co-authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Sarah Gilmore <[email protected]>
---
 cpp/src/parquet/arrow/arrow_schema_test.cc | 59 ++++++++++++++++++++++++++++--
 cpp/src/parquet/arrow/schema.cc            | 15 +++++---
 cpp/src/parquet/properties.h               | 28 ++++++++++++--
 3 files changed, 89 insertions(+), 13 deletions(-)

diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc 
b/cpp/src/parquet/arrow/arrow_schema_test.cc
index e1a4425149..73ce8ea69e 100644
--- a/cpp/src/parquet/arrow/arrow_schema_test.cc
+++ b/cpp/src/parquet/arrow/arrow_schema_test.cc
@@ -1352,11 +1352,11 @@ TEST_F(TestConvertArrowSchema, ArrowFields) {
       {"float16", ::arrow::float16(), LogicalType::Float16(),
        ParquetType::FIXED_LEN_BYTE_ARRAY, 2},
       {"time32", ::arrow::time32(::arrow::TimeUnit::MILLI),
-       LogicalType::Time(true, LogicalType::TimeUnit::MILLIS), 
ParquetType::INT32, -1},
+       LogicalType::Time(false, LogicalType::TimeUnit::MILLIS), 
ParquetType::INT32, -1},
       {"time64(microsecond)", ::arrow::time64(::arrow::TimeUnit::MICRO),
-       LogicalType::Time(true, LogicalType::TimeUnit::MICROS), 
ParquetType::INT64, -1},
+       LogicalType::Time(false, LogicalType::TimeUnit::MICROS), 
ParquetType::INT64, -1},
       {"time64(nanosecond)", ::arrow::time64(::arrow::TimeUnit::NANO),
-       LogicalType::Time(true, LogicalType::TimeUnit::NANOS), 
ParquetType::INT64, -1},
+       LogicalType::Time(false, LogicalType::TimeUnit::NANOS), 
ParquetType::INT64, -1},
       {"timestamp(millisecond)", ::arrow::timestamp(::arrow::TimeUnit::MILLI),
        LogicalType::Timestamp(false, LogicalType::TimeUnit::MILLIS,
                               /*is_from_converted_type=*/false,
@@ -1782,6 +1782,59 @@ TEST_F(TestConvertArrowSchema, ParquetFlatDecimals) {
   ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(parquet_fields));
 }
 
+TEST_F(TestConvertArrowSchema, ParquetTimeAdjustedToUTC) {
+  // Verify Parquet Time types have the appropriate isAdjustedToUTC value, 
depending
+  // on the return value of ArrowWriterProperties::write_time_adjusted_to_utc()
+
+  struct FieldConstructionArguments {
+    std::string name;
+    std::shared_ptr<::arrow::DataType> datatype;
+    std::shared_ptr<const LogicalType> logical_type;
+    parquet::Type::type physical_type;
+    int physical_length;
+  };
+
+  auto run_test =
+      [this](const std::shared_ptr<ArrowWriterProperties>& 
arrow_writer_properties,
+             bool time_adjusted_to_utc) {
+        std::vector<FieldConstructionArguments> cases = {
+            {"time32", ::arrow::time32(::arrow::TimeUnit::MILLI),
+             LogicalType::Time(time_adjusted_to_utc, 
LogicalType::TimeUnit::MILLIS),
+             ParquetType::INT32, -1},
+            {"time64(microsecond)", ::arrow::time64(::arrow::TimeUnit::MICRO),
+             LogicalType::Time(time_adjusted_to_utc, 
LogicalType::TimeUnit::MICROS),
+             ParquetType::INT64, -1},
+            {"time64(nanosecond)", ::arrow::time64(::arrow::TimeUnit::NANO),
+             LogicalType::Time(time_adjusted_to_utc, 
LogicalType::TimeUnit::NANOS),
+             ParquetType::INT64, -1}};
+
+        std::vector<std::shared_ptr<Field>> arrow_fields;
+        std::vector<NodePtr> parquet_fields;
+        for (const FieldConstructionArguments& c : cases) {
+          arrow_fields.push_back(::arrow::field(c.name, c.datatype, false));
+          parquet_fields.push_back(PrimitiveNode::Make(c.name, 
Repetition::REQUIRED,
+                                                       c.logical_type, 
c.physical_type,
+                                                       c.physical_length));
+        }
+
+        EXPECT_EQ(arrow_writer_properties->write_time_adjusted_to_utc(),
+                  time_adjusted_to_utc);
+        ASSERT_OK(ConvertSchema(arrow_fields, arrow_writer_properties));
+        CheckFlatSchema(parquet_fields);
+      };
+
+  // Verify write_time_adjusted_to_utc is false by default.
+  ArrowWriterProperties::Builder builder;
+  auto arrow_writer_properties = builder.build();
+  run_test(arrow_writer_properties, false);
+
+  arrow_writer_properties = builder.set_time_adjusted_to_utc(true)->build();
+  run_test(arrow_writer_properties, true);
+
+  arrow_writer_properties = builder.set_time_adjusted_to_utc(false)->build();
+  run_test(arrow_writer_properties, false);
+}
+
 class TestConvertRoundTrip : public ::testing::Test {
  public:
   ::arrow::Status RoundTripSchema(
diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc
index f0dc45b785..2beb2c66ef 100644
--- a/cpp/src/parquet/arrow/schema.cc
+++ b/cpp/src/parquet/arrow/schema.cc
@@ -420,18 +420,21 @@ Status FieldToNode(const std::string& name, const 
std::shared_ptr<Field>& field,
       break;
     case ArrowTypeId::TIME32:
       type = ParquetType::INT32;
-      logical_type =
-          LogicalType::Time(/*is_adjusted_to_utc=*/true, 
LogicalType::TimeUnit::MILLIS);
+      logical_type = LogicalType::Time(
+          /*is_adjusted_to_utc=*/arrow_properties.write_time_adjusted_to_utc(),
+          LogicalType::TimeUnit::MILLIS);
       break;
     case ArrowTypeId::TIME64: {
       type = ParquetType::INT64;
       auto time_type = static_cast<::arrow::Time64Type*>(field->type().get());
       if (time_type->unit() == ::arrow::TimeUnit::NANO) {
-        logical_type =
-            LogicalType::Time(/*is_adjusted_to_utc=*/true, 
LogicalType::TimeUnit::NANOS);
+        logical_type = LogicalType::Time(
+            
/*is_adjusted_to_utc=*/arrow_properties.write_time_adjusted_to_utc(),
+            LogicalType::TimeUnit::NANOS);
       } else {
-        logical_type =
-            LogicalType::Time(/*is_adjusted_to_utc=*/true, 
LogicalType::TimeUnit::MICROS);
+        logical_type = LogicalType::Time(
+            
/*is_adjusted_to_utc=*/arrow_properties.write_time_adjusted_to_utc(),
+            LogicalType::TimeUnit::MICROS);
       }
     } break;
     case ArrowTypeId::DURATION:
diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h
index bbaf6b5e71..27bf672f86 100644
--- a/cpp/src/parquet/properties.h
+++ b/cpp/src/parquet/properties.h
@@ -1161,7 +1161,8 @@ class PARQUET_EXPORT ArrowWriterProperties {
           compliant_nested_types_(true),
           engine_version_(V2),
           use_threads_(kArrowDefaultUseThreads),
-          executor_(NULLPTR) {}
+          executor_(NULLPTR),
+          write_time_adjusted_to_utc_(false) {}
     virtual ~Builder() = default;
 
     /// \brief Disable writing legacy int96 timestamps (default disabled).
@@ -1256,12 +1257,21 @@ class PARQUET_EXPORT ArrowWriterProperties {
       return this;
     }
 
+    /// \brief Set the value of isAdjustedToUTC when writing a TIME column
+    ///
+    /// Default is false because Arrow TIME data is expressed in an 
unspecified timezone.
+    /// Note this setting doesn't affect TIMESTAMP data.
+    Builder* set_time_adjusted_to_utc(bool adjusted) {
+      write_time_adjusted_to_utc_ = adjusted;
+      return this;
+    }
+
     /// Create the final properties.
     std::shared_ptr<ArrowWriterProperties> build() {
       return std::shared_ptr<ArrowWriterProperties>(new ArrowWriterProperties(
           write_timestamps_as_int96_, coerce_timestamps_enabled_, 
coerce_timestamps_unit_,
           truncated_timestamps_allowed_, store_schema_, 
compliant_nested_types_,
-          engine_version_, use_threads_, executor_));
+          engine_version_, use_threads_, executor_, 
write_time_adjusted_to_utc_));
     }
 
    private:
@@ -1277,6 +1287,8 @@ class PARQUET_EXPORT ArrowWriterProperties {
 
     bool use_threads_;
     ::arrow::internal::Executor* executor_;
+
+    bool write_time_adjusted_to_utc_;
   };
 
   bool support_deprecated_int96_timestamps() const { return 
write_timestamps_as_int96_; }
@@ -1310,6 +1322,11 @@ class PARQUET_EXPORT ArrowWriterProperties {
   /// \brief Returns the executor used to write columns in parallel.
   ::arrow::internal::Executor* executor() const;
 
+  /// \brief The value of isAdjustedToUTC when writing a TIME column
+  ///
+  /// Note this setting doesn't affect TIMESTAMP data.
+  bool write_time_adjusted_to_utc() const { return 
write_time_adjusted_to_utc_; }
+
  private:
   explicit ArrowWriterProperties(bool write_nanos_as_int96,
                                  bool coerce_timestamps_enabled,
@@ -1317,7 +1334,8 @@ class PARQUET_EXPORT ArrowWriterProperties {
                                  bool truncated_timestamps_allowed, bool 
store_schema,
                                  bool compliant_nested_types,
                                  EngineVersion engine_version, bool 
use_threads,
-                                 ::arrow::internal::Executor* executor)
+                                 ::arrow::internal::Executor* executor,
+                                 bool write_time_adjusted_to_utc)
       : write_timestamps_as_int96_(write_nanos_as_int96),
         coerce_timestamps_enabled_(coerce_timestamps_enabled),
         coerce_timestamps_unit_(coerce_timestamps_unit),
@@ -1326,7 +1344,8 @@ class PARQUET_EXPORT ArrowWriterProperties {
         compliant_nested_types_(compliant_nested_types),
         engine_version_(engine_version),
         use_threads_(use_threads),
-        executor_(executor) {}
+        executor_(executor),
+        write_time_adjusted_to_utc_(write_time_adjusted_to_utc) {}
 
   const bool write_timestamps_as_int96_;
   const bool coerce_timestamps_enabled_;
@@ -1337,6 +1356,7 @@ class PARQUET_EXPORT ArrowWriterProperties {
   const EngineVersion engine_version_;
   const bool use_threads_;
   ::arrow::internal::Executor* executor_;
+  const bool write_time_adjusted_to_utc_;
 };
 
 /// \brief State object used for writing Arrow data directly to a Parquet

Reply via email to