This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new f1e1020e6fc [fix](parquet)fix parquet write timestamp int96 type.
(1/2) (#61760)
f1e1020e6fc is described below
commit f1e1020e6fc4e7eaae8b89fe2dbd1ce2a9e1e7da
Author: daidai <[email protected]>
AuthorDate: Sat Mar 28 09:15:03 2026 +0800
[fix](parquet)fix parquet write timestamp int96 type. (1/2) (#61760)
### What problem does this PR solve?
PR #60946
Problem Summary:
This pull request fixes a problem caused by the patch introduced in #60946,
which made Doris exports fail to write the Parquet INT96 timestamp type. The
issue is resolved by adding a new Arrow patch that introduces a parameter
that forces timestamps to be written as INT96.
This PR only updates thirdparty; the next PR will update the BE code.
---
thirdparty/download-thirdparty.sh | 4 +
...arrow-17.0.0-force-write-int96-timestamps.patch | 98 ++++++++++++++++++++++
2 files changed, 102 insertions(+)
diff --git a/thirdparty/download-thirdparty.sh
b/thirdparty/download-thirdparty.sh
index f57cab9de1a..75ba6313529 100755
--- a/thirdparty/download-thirdparty.sh
+++ b/thirdparty/download-thirdparty.sh
@@ -431,6 +431,10 @@ if [[ " ${TP_ARCHIVES[*]} " =~ " ARROW " ]]; then
# Paimon-cpp parquet patches: row-group-aware batch reader,
max_row_group_size,
# GetBufferedSize(), int96 NANO guard, and Thrift_VERSION empty
fix.
patch -p1 <"${TP_PATCH_DIR}/apache-arrow-17.0.0-paimon.patch"
+
+ # apache-arrow-17.0.0-force-write-int96-timestamps.patch :
+ # Introducing the parameter that forces writing int96 timestamps
for compatibility with Paimon cpp.
+ patch -p1
<"${TP_PATCH_DIR}/apache-arrow-17.0.0-force-write-int96-timestamps.patch"
touch "${PATCHED_MARK}"
fi
cd -
diff --git
a/thirdparty/patches/apache-arrow-17.0.0-force-write-int96-timestamps.patch
b/thirdparty/patches/apache-arrow-17.0.0-force-write-int96-timestamps.patch
new file mode 100644
index 00000000000..5a754247566
--- /dev/null
+++ b/thirdparty/patches/apache-arrow-17.0.0-force-write-int96-timestamps.patch
@@ -0,0 +1,98 @@
+diff -ruN
arrow-apache-arrow-17.0.0-after-paimon/cpp/src/parquet/arrow/schema.cc
arrow-apache-arrow-17.0.0/cpp/src/parquet/arrow/schema.cc
+--- arrow-apache-arrow-17.0.0-after-paimon/cpp/src/parquet/arrow/schema.cc
2026-03-27 01:23:23.651831424 +0800
++++ arrow-apache-arrow-17.0.0/cpp/src/parquet/arrow/schema.cc 2026-03-27
01:28:36.855281965 +0800
+@@ -178,7 +178,8 @@
+
+ // The user is explicitly asking for Impala int96 encoding, there is no
+ // logical type.
+- if (arrow_properties.support_deprecated_int96_timestamps() && target_unit
== ::arrow::TimeUnit::NANO) {
++ if (arrow_properties.force_write_int96_timestamps() ||
++ (arrow_properties.support_deprecated_int96_timestamps() && target_unit
== ::arrow::TimeUnit::NANO)) {
+ *physical_type = ParquetType::INT96;
+ return Status::OK();
+ }
+diff -ruN arrow-apache-arrow-17.0.0-after-paimon/cpp/src/parquet/properties.h
arrow-apache-arrow-17.0.0/cpp/src/parquet/properties.h
+--- arrow-apache-arrow-17.0.0-after-paimon/cpp/src/parquet/properties.h
2026-03-27 01:23:23.643831362 +0800
++++ arrow-apache-arrow-17.0.0/cpp/src/parquet/properties.h 2026-03-27
01:27:47.717897537 +0800
+@@ -980,6 +980,7 @@
+ public:
+ Builder()
+ : write_timestamps_as_int96_(false),
++ force_write_int96_timestamps_(false),
+ coerce_timestamps_enabled_(false),
+ coerce_timestamps_unit_(::arrow::TimeUnit::SECOND),
+ truncated_timestamps_allowed_(false),
+@@ -1005,6 +1006,21 @@
+ return this;
+ }
+
++ /// \brief Force writing legacy int96 timestamps.
++ ///
++ /// This bypasses unit-based guards and writes INT96 whenever timestamp
++ /// metadata is resolved.
++ Builder* enable_force_write_int96_timestamps() {
++ force_write_int96_timestamps_ = true;
++ return this;
++ }
++
++ /// \brief Disable forcing legacy int96 timestamps (default).
++ Builder* disable_force_write_int96_timestamps() {
++ force_write_int96_timestamps_ = false;
++ return this;
++ }
++
+ /// \brief Coerce all timestamps to the specified time unit.
+ /// \param unit time unit to truncate to.
+ /// For Parquet versions 1.0 and 2.4, nanoseconds are casted to
microseconds.
+@@ -1085,7 +1101,8 @@
+ /// Create the final properties.
+ std::shared_ptr<ArrowWriterProperties> build() {
+ return std::shared_ptr<ArrowWriterProperties>(new ArrowWriterProperties(
+- write_timestamps_as_int96_, coerce_timestamps_enabled_,
coerce_timestamps_unit_,
++ write_timestamps_as_int96_, force_write_int96_timestamps_,
++ coerce_timestamps_enabled_, coerce_timestamps_unit_,
+ truncated_timestamps_allowed_, store_schema_,
compliant_nested_types_,
+ engine_version_, use_threads_, executor_));
+ }
+@@ -1093,6 +1110,8 @@
+ private:
+ bool write_timestamps_as_int96_;
+
++ bool force_write_int96_timestamps_;
++
+ bool coerce_timestamps_enabled_;
+ ::arrow::TimeUnit::type coerce_timestamps_unit_;
+ bool truncated_timestamps_allowed_;
+@@ -1107,6 +1126,8 @@
+
+ bool support_deprecated_int96_timestamps() const { return
write_timestamps_as_int96_; }
+
++ bool force_write_int96_timestamps() const { return
force_write_int96_timestamps_; }
++
+ bool coerce_timestamps_enabled() const { return coerce_timestamps_enabled_;
}
+ ::arrow::TimeUnit::type coerce_timestamps_unit() const {
+ return coerce_timestamps_unit_;
+@@ -1138,6 +1159,7 @@
+
+ private:
+ explicit ArrowWriterProperties(bool write_nanos_as_int96,
++ bool force_write_int96_timestamps,
+ bool coerce_timestamps_enabled,
+ ::arrow::TimeUnit::type
coerce_timestamps_unit,
+ bool truncated_timestamps_allowed, bool
store_schema,
+@@ -1145,6 +1167,7 @@
+ EngineVersion engine_version, bool
use_threads,
+ ::arrow::internal::Executor* executor)
+ : write_timestamps_as_int96_(write_nanos_as_int96),
++ force_write_int96_timestamps_(force_write_int96_timestamps),
+ coerce_timestamps_enabled_(coerce_timestamps_enabled),
+ coerce_timestamps_unit_(coerce_timestamps_unit),
+ truncated_timestamps_allowed_(truncated_timestamps_allowed),
+@@ -1155,6 +1178,7 @@
+ executor_(executor) {}
+
+ const bool write_timestamps_as_int96_;
++ const bool force_write_int96_timestamps_;
+ const bool coerce_timestamps_enabled_;
+ const ::arrow::TimeUnit::type coerce_timestamps_unit_;
+ const bool truncated_timestamps_allowed_;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]