This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new b38c731ec7 fix: write hive partitions for any int/uint/float (#15337)
b38c731ec7 is described below
commit b38c731ec7bde6794322edf36d20855481192191
Author: christophermcdermott <[email protected]>
AuthorDate: Mon Mar 24 15:02:51 2025 -0400
fix: write hive partitions for any int/uint/float (#15337)
Co-authored-by: christophermcdermott <masked>
---
datafusion/datasource/src/write/demux.rs | 60 ++++++++++++++++++++++++++++-
datafusion/sqllogictest/test_files/copy.slt | 34 ++++++++++------
2 files changed, 81 insertions(+), 13 deletions(-)
diff --git a/datafusion/datasource/src/write/demux.rs b/datafusion/datasource/src/write/demux.rs
index 111d22060c..fc2e5daf92 100644
--- a/datafusion/datasource/src/write/demux.rs
+++ b/datafusion/datasource/src/write/demux.rs
@@ -33,8 +33,10 @@ use arrow::array::{
};
use arrow::datatypes::{DataType, Schema};
use datafusion_common::cast::{
-    as_boolean_array, as_date32_array, as_date64_array, as_int32_array, as_int64_array,
-    as_string_array, as_string_view_array,
+    as_boolean_array, as_date32_array, as_date64_array, as_float16_array,
+    as_float32_array, as_float64_array, as_int16_array, as_int32_array, as_int64_array,
+    as_int8_array, as_string_array, as_string_view_array, as_uint16_array,
+    as_uint32_array, as_uint64_array, as_uint8_array,
};
use datafusion_common::{exec_datafusion_err, not_impl_err, DataFusionError};
use datafusion_common_runtime::SpawnedTask;
@@ -407,6 +409,18 @@ fn compute_partition_keys_by_row<'a>(
partition_values.push(Cow::from(date));
}
}
+ DataType::Int8 => {
+ let array = as_int8_array(col_array)?;
+ for i in 0..rb.num_rows() {
+                    partition_values.push(Cow::from(array.value(i).to_string()));
+ }
+ }
+ DataType::Int16 => {
+ let array = as_int16_array(col_array)?;
+ for i in 0..rb.num_rows() {
+                    partition_values.push(Cow::from(array.value(i).to_string()));
+ }
+ }
DataType::Int32 => {
let array = as_int32_array(col_array)?;
for i in 0..rb.num_rows() {
@@ -419,6 +433,48 @@ fn compute_partition_keys_by_row<'a>(
partition_values.push(Cow::from(array.value(i).to_string()));
}
}
+ DataType::UInt8 => {
+ let array = as_uint8_array(col_array)?;
+ for i in 0..rb.num_rows() {
+                    partition_values.push(Cow::from(array.value(i).to_string()));
+ }
+ }
+ DataType::UInt16 => {
+ let array = as_uint16_array(col_array)?;
+ for i in 0..rb.num_rows() {
+                    partition_values.push(Cow::from(array.value(i).to_string()));
+ }
+ }
+ DataType::UInt32 => {
+ let array = as_uint32_array(col_array)?;
+ for i in 0..rb.num_rows() {
+                    partition_values.push(Cow::from(array.value(i).to_string()));
+ }
+ }
+ DataType::UInt64 => {
+ let array = as_uint64_array(col_array)?;
+ for i in 0..rb.num_rows() {
+                    partition_values.push(Cow::from(array.value(i).to_string()));
+ }
+ }
+ DataType::Float16 => {
+ let array = as_float16_array(col_array)?;
+ for i in 0..rb.num_rows() {
+                    partition_values.push(Cow::from(array.value(i).to_string()));
+ }
+ }
+ DataType::Float32 => {
+ let array = as_float32_array(col_array)?;
+ for i in 0..rb.num_rows() {
+                    partition_values.push(Cow::from(array.value(i).to_string()));
+ }
+ }
+ DataType::Float64 => {
+ let array = as_float64_array(col_array)?;
+ for i in 0..rb.num_rows() {
+                    partition_values.push(Cow::from(array.value(i).to_string()));
+ }
+ }
DataType::Dictionary(_, _) => {
downcast_dictionary_array!(
col_array => {
diff --git a/datafusion/sqllogictest/test_files/copy.slt b/datafusion/sqllogictest/test_files/copy.slt
index e2bb23e357..925f96bd4a 100644
--- a/datafusion/sqllogictest/test_files/copy.slt
+++ b/datafusion/sqllogictest/test_files/copy.slt
@@ -110,24 +110,36 @@ a
# Copy to directory as partitioned files
query I
-COPY (values (1::int, 2::bigint, 19968::date, arrow_cast(1725235200000, 'Date64'), false, 'x'),
-             (11::int, 22::bigint, 19969::date, arrow_cast(1725148800000, 'Date64'), true, 'y')
+COPY (values (arrow_cast(1, 'Int8'), arrow_cast(2, 'UInt8'), arrow_cast(3, 'Int16'), arrow_cast(4, 'UInt16'),
+              arrow_cast(5, 'Int32'), arrow_cast(6, 'UInt32'), arrow_cast(7, 'Int64'), arrow_cast(8, 'UInt64'),
+              arrow_cast(9.1015625, 'Float16'), arrow_cast(10.1, 'Float32'), arrow_cast(11.1, 'Float64'), 19968::date,
+              arrow_cast(1725235200000, 'Date64'), false, 'x'),
+             (arrow_cast(11, 'Int8'), arrow_cast(22, 'UInt8'), arrow_cast(33, 'Int16'), arrow_cast(44, 'UInt16'),
+              arrow_cast(55, 'Int32'), arrow_cast(66, 'UInt32'), arrow_cast(77, 'Int64'), arrow_cast(88, 'UInt64'),
+              arrow_cast(9.203125, 'Float16'), arrow_cast(10.2, 'Float32'), arrow_cast(11.2, 'Float64'), 19969::date,
+              arrow_cast(1725148800000, 'Date64'), true, 'y')
)
-TO 'test_files/scratch/copy/partitioned_table5/' STORED AS parquet PARTITIONED BY (column1, column2, column3, column4, column5)
+TO 'test_files/scratch/copy/partitioned_table5/' STORED AS parquet PARTITIONED BY (column1, column2, column3, column4,
+    column5, column6, column7, column8, column9, column10, column11, column12, column13, column14)
OPTIONS ('format.compression' 'zstd(10)');
----
2
# validate partitioning
statement ok
-CREATE EXTERNAL TABLE validate_partitioned_parquet5 (column1 int, column2 bigint, column3 date, column4 date, column5 boolean, column6 varchar) STORED AS PARQUET
-LOCATION 'test_files/scratch/copy/partitioned_table5/' PARTITIONED BY (column1, column2, column3, column4, column5);
-
-query IIDDBT
-select column1, column2, column3, column4, column5, column6 from validate_partitioned_parquet5 order by column1,column2,column3,column4,column5;
-----
-1 2 2024-09-02 2024-09-02 false x
-11 22 2024-09-03 2024-09-01 true y
+CREATE EXTERNAL TABLE validate_partitioned_parquet5 (column1 int, column2 int, column3 int, column4 int, column5 int,
+    column6 int, column7 bigint, column8 bigint, column9 float, column10 float, column11 float, column12 date,
+    column13 date, column14 boolean, column15 varchar) STORED AS PARQUET
+LOCATION 'test_files/scratch/copy/partitioned_table5/' PARTITIONED BY (column1, column2, column3, column4, column5,
+    column6, column7, column8, column9, column10, column11, column12, column13, column14);
+
+query IIIIIIIIRRRDDBT
+select column1, column2, column3, column4, column5, column6, column7, column8, column9, column10, column11, column12,
+    column13, column14, column15 from validate_partitioned_parquet5 order by column1, column2, column3, column4,
+    column5, column6, column7, column8, column9, column10, column11, column12, column13;
+----
+1 2 3 4 5 6 7 8 9.1015625 10.1 11.1 2024-09-02 2024-09-02 false x
+11 22 33 44 55 66 77 88 9.203125 10.2 11.2 2024-09-03 2024-09-01 true y
statement ok
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]