tustvold commented on code in PR #9077:
URL: https://github.com/apache/arrow-rs/pull/9077#discussion_r2662945038
##########
parquet/src/arrow/array_reader/primitive_array.rs:
##########
@@ -504,6 +200,220 @@ where
}
}
+/// Coerce the parquet physical type array to the target type
+///
+/// This should match the logic in schema::primitive::apply_hint
+fn coerce_array(array: ArrayRef, target_type: &ArrowType) -> Result<ArrayRef> {
+ if let ArrowType::Dictionary(key_type, value_type) = target_type {
+ let dictionary = pack_dictionary(key_type, array.as_ref())?;
+ let any_dictionary = dictionary.as_any_dictionary();
+
+ let coerced_values =
+ coerce_array(Arc::clone(any_dictionary.values()), value_type.as_ref())?;
+
+ return Ok(any_dictionary.with_values(coerced_values));
+ }
+
+ match array.data_type() {
+ ArrowType::Int32 => coerce_i32(array.as_primitive(), target_type),
+ ArrowType::Int64 => coerce_i64(array.as_primitive(), target_type),
+ ArrowType::Boolean | ArrowType::Float32 | ArrowType::Float64 => Ok(array),
+ _ => unreachable!(),
+ }
+}
+
+fn coerce_i32(array: &Int32Array, target_type: &ArrowType) -> Result<ArrayRef> {
+ Ok(match target_type {
+ ArrowType::UInt8 => {
+ let array = array.unary(|i| i as u8) as UInt8Array;
+ Arc::new(array) as ArrayRef
+ }
+ ArrowType::Int8 => {
+ let array = array.unary(|i| i as i8) as Int8Array;
+ Arc::new(array) as ArrayRef
+ }
+ ArrowType::UInt16 => {
+ let array = array.unary(|i| i as u16) as UInt16Array;
+ Arc::new(array) as ArrayRef
+ }
+ ArrowType::Int16 => {
+ let array = array.unary(|i| i as i16) as Int16Array;
+ Arc::new(array) as ArrayRef
+ }
+ ArrowType::Int32 => Arc::new(array.clone()),
+ // follow C++ implementation and use overflow/reinterpret cast from i32 to u32 which will map
+ // `i32::MIN..0` to `(i32::MAX as u32)..u32::MAX`
+ ArrowType::UInt32 => Arc::new(UInt32Array::new(
+ array.values().inner().clone().into(),
+ array.nulls().cloned(),
+ )) as ArrayRef,
+ ArrowType::Date32 => Arc::new(array.reinterpret_cast::<Date32Type>()) as _,
+ ArrowType::Date64 => {
+ let array: Date64Array = array.unary(|x| x as i64 * 86_400_000);
+ Arc::new(array) as ArrayRef
+ }
+ ArrowType::Time32(TimeUnit::Second) => {
+ Arc::new(array.reinterpret_cast::<Time32SecondType>()) as ArrayRef
+ }
+ ArrowType::Time32(TimeUnit::Millisecond) => {
+ Arc::new(array.reinterpret_cast::<Time32MillisecondType>()) as ArrayRef
+ }
+ ArrowType::Timestamp(time_unit, timezone) => match time_unit {
+ TimeUnit::Second => {
+ let array: TimestampSecondArray = array
+ .unary(|x| x as i64)
+ .with_timezone_opt(timezone.clone());
+ Arc::new(array) as _
+ }
+ TimeUnit::Millisecond => {
+ let array: TimestampMillisecondArray = array
+ .unary(|x| x as i64)
+ .with_timezone_opt(timezone.clone());
+ Arc::new(array) as _
+ }
+ TimeUnit::Microsecond => {
+ let array: TimestampMicrosecondArray = array
+ .unary(|x| x as i64)
+ .with_timezone_opt(timezone.clone());
+ Arc::new(array) as _
+ }
+ TimeUnit::Nanosecond => {
+ let array: TimestampNanosecondArray = array
+ .unary(|x| x as i64)
+ .with_timezone_opt(timezone.clone());
+ Arc::new(array) as _
+ }
+ },
+ ArrowType::Decimal32(p, s) => {
+ let array = array
+ .reinterpret_cast::<Decimal32Type>()
+ .with_precision_and_scale(*p, *s)?;
+ Arc::new(array) as ArrayRef
+ }
+ ArrowType::Decimal64(p, s) => {
+ let array: Decimal64Array =
+ array.unary(|i| i as i64).with_precision_and_scale(*p, *s)?;
+ Arc::new(array) as ArrayRef
+ }
+ ArrowType::Decimal128(p, s) => {
+ let array: Decimal128Array = array
+ .unary(|i| i as i128)
+ .with_precision_and_scale(*p, *s)?;
+ Arc::new(array) as ArrayRef
+ }
+ ArrowType::Decimal256(p, s) => {
+ let array: Decimal256Array = array
+ .unary(|i| i256::from_i128(i as i128))
+ .with_precision_and_scale(*p, *s)?;
+ Arc::new(array) as ArrayRef
+ }
+ _ => unreachable!("Cannot coerce i32 to {target_type}"),
+ })
+}
+
+fn coerce_i64(array: &Int64Array, target_type: &ArrowType) -> Result<ArrayRef> {
+ Ok(match target_type {
+ ArrowType::Int64 => Arc::new(array.clone()) as _,
+ // follow C++ implementation and use overflow/reinterpret cast from i64 to u64 which will map
+ // `i64::MIN..0` to `(i64::MAX as u64)..u64::MAX`
+ ArrowType::UInt64 => Arc::new(UInt64Array::new(
+ array.values().inner().clone().into(),
Review Comment:
IIRC into_data still allocates, as the buffers are stored in a Vec
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]