alamb commented on code in PR #7611:
URL: https://github.com/apache/arrow-rs/pull/7611#discussion_r2143611512
##########
arrow-array/src/builder/generic_bytes_dictionary_builder.rs:
##########
@@ -152,6 +155,65 @@ where
values_builder,
})
}
+
+ /// Creates a new `GenericByteDictionaryBuilder` from the existing builder
with the same
+ /// keys and values, but with a new data type for the keys.
+ ///
+ /// # example
+ /// ```
+ /// # use arrow_array::builder::StringDictionaryBuilder;
+ /// # use arrow_array::types::{UInt8Type, UInt16Type};
+ /// # use arrow_array::UInt16Array;;
+ ///
+ /// let mut u8_keyed_builder = StringDictionaryBuilder::<UInt8Type>::new();
+ /// u8_keyed_builder.append("def").unwrap();
+ /// u8_keyed_builder.append_null();
+ /// u8_keyed_builder.append("abc").unwrap();
+ ///
+ /// // for some reason, we decide we need to upgrade the key type
+ /// let mut u16_keyed_builder =
StringDictionaryBuilder::<UInt16Type>::try_new_from_builder(u8_keyed_builder).unwrap();
+ /// let dictionary_array = u16_keyed_builder.finish();
+ /// let keys = dictionary_array.keys();
+ ///
+ /// assert_eq!(keys, &UInt16Array::from(vec![Some(0), None, Some(1)]));
+ /// ```
+ pub fn try_new_from_builder<K2>(
+ mut source: GenericByteDictionaryBuilder<K2, T>,
+ ) -> Result<Self, ArrowError>
+ where
+ K::Native: NumCast,
+ K2: ArrowDictionaryKeyType,
+ K2::Native: NumCast,
+ {
+ let state = source.state;
+ let dedup = source.dedup;
+ let values_builder = source.values_builder;
+
+ let source_keys = source.keys_builder.finish();
+ let new_keys: PrimitiveArray<K> = source_keys.try_unary(|value| {
+ num::cast::cast::<K2::Native, K::Native>(value).ok_or_else(|| {
Review Comment:
Eventually we could probably make this even faster by special casing the
types that we know are infallible (e.g. `Int8` keys to `Int16` keys will never
error. I think the general purpose one now looks good though
##########
arrow-array/src/builder/generic_bytes_dictionary_builder.rs:
##########
@@ -614,6 +676,95 @@ mod tests {
]);
}
+ fn _test_try_new_from_builder_generic_for_key_types<K1, K2, T>(values:
Vec<&T::Native>)
+ where
+ K1: ArrowDictionaryKeyType,
+ K1::Native: NumCast,
+ K2: ArrowDictionaryKeyType,
+ K2::Native: NumCast + From<u8>,
+ T: ByteArrayType,
+ <T as ByteArrayType>::Native: PartialEq + AsRef<<T as
ByteArrayType>::Native>,
+ {
+ let mut source = GenericByteDictionaryBuilder::<K1, T>::new();
+ source.append(values[0]).unwrap();
+ source.append(values[1]).unwrap();
+ source.append_null();
+ source.append(values[2]).unwrap();
+
+ let mut result =
+ GenericByteDictionaryBuilder::<K2,
T>::try_new_from_builder(source).unwrap();
+ let array = result.finish();
+
+ let mut expected_keys_builder = PrimitiveBuilder::<K2>::new();
+ expected_keys_builder
+ .append_value(<<K2 as ArrowPrimitiveType>::Native as
From<u8>>::from(0u8));
+ expected_keys_builder
+ .append_value(<<K2 as ArrowPrimitiveType>::Native as
From<u8>>::from(1u8));
+ expected_keys_builder.append_null();
+ expected_keys_builder
+ .append_value(<<K2 as ArrowPrimitiveType>::Native as
From<u8>>::from(2u8));
+
+ let av = array.values();
+ let ava: &GenericByteArray<T> =
av.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
+ assert_eq!(ava.value(0), values[0]);
+ assert_eq!(ava.value(1), values[1]);
+ assert_eq!(ava.value(2), values[2]);
+ }
+
+ fn test_try_new_from_builder<T>(values: Vec<&T::Native>)
+ where
+ T: ByteArrayType,
+ <T as ByteArrayType>::Native: PartialEq + AsRef<<T as
ByteArrayType>::Native>,
+ {
+ // test cast to bigger size unsigned
+ _test_try_new_from_builder_generic_for_key_types::<UInt8Type,
UInt16Type, T>(
+ values.clone(),
+ );
+ // test cast going to smaller size unsigned
+ _test_try_new_from_builder_generic_for_key_types::<UInt16Type,
UInt8Type, T>(
+ values.clone(),
+ );
+ // test cast going to bigger size signed
+ _test_try_new_from_builder_generic_for_key_types::<Int8Type,
Int16Type, T>(values.clone());
+ // test cast going to smaller size signed
+ _test_try_new_from_builder_generic_for_key_types::<Int32Type,
Int16Type, T>(values.clone());
+ // test going from signed to signed for different size changes
+ _test_try_new_from_builder_generic_for_key_types::<UInt8Type,
Int16Type, T>(values.clone());
+ _test_try_new_from_builder_generic_for_key_types::<Int8Type,
UInt8Type, T>(values.clone());
+ _test_try_new_from_builder_generic_for_key_types::<Int8Type,
UInt16Type, T>(values.clone());
+ _test_try_new_from_builder_generic_for_key_types::<Int32Type,
Int16Type, T>(values.clone());
+ }
+
+ #[test]
+ fn test_string_dictionary_builder_try_new_from_builder() {
+ test_try_new_from_builder::<GenericStringType<i32>>(vec!["abc", "def",
"ghi"]);
+ }
+
+ #[test]
+ fn test_binary_dictionary_builder_try_new_from_builder() {
+ test_try_new_from_builder::<GenericBinaryType<i32>>(vec![b"abc",
b"def", b"ghi"]);
+ }
+
+ #[test]
+ fn test_try_new_from_builder_cast_fails() {
Review Comment:
love it
##########
arrow-array/src/builder/generic_bytes_dictionary_builder.rs:
##########
@@ -152,6 +155,65 @@ where
values_builder,
})
}
+
+ /// Creates a new `GenericByteDictionaryBuilder` from the existing builder
with the same
+ /// keys and values, but with a new data type for the keys.
+ ///
+ /// # example
Review Comment:
```suggestion
/// # Example
```
##########
arrow-array/src/builder/generic_bytes_dictionary_builder.rs:
##########
@@ -152,6 +155,65 @@ where
values_builder,
})
}
+
+ /// Creates a new `GenericByteDictionaryBuilder` from the existing builder
with the same
+ /// keys and values, but with a new data type for the keys.
+ ///
+ /// # example
+ /// ```
+ /// # use arrow_array::builder::StringDictionaryBuilder;
+ /// # use arrow_array::types::{UInt8Type, UInt16Type};
+ /// # use arrow_array::UInt16Array;;
+ ///
+ /// let mut u8_keyed_builder = StringDictionaryBuilder::<UInt8Type>::new();
+ /// u8_keyed_builder.append("def").unwrap();
+ /// u8_keyed_builder.append_null();
+ /// u8_keyed_builder.append("abc").unwrap();
+ ///
+ /// // for some reason, we decide we need to upgrade the key type
Review Comment:
It might help here to explain / show the main usecase (e.g the builder has
too many values)
Also when I was poking around, it wasn't clear how a user would get the
current number of values in the `values_builder` (to know that they needed to
upgrade the index type) -- should we also add an API for accessing that?
##########
arrow-array/src/builder/generic_bytes_dictionary_builder.rs:
##########
@@ -152,6 +155,65 @@ where
values_builder,
})
}
+
+ /// Creates a new `GenericByteDictionaryBuilder` from the existing builder
with the same
+ /// keys and values, but with a new data type for the keys.
+ ///
+ /// # example
+ /// ```
+ /// # use arrow_array::builder::StringDictionaryBuilder;
+ /// # use arrow_array::types::{UInt8Type, UInt16Type};
+ /// # use arrow_array::UInt16Array;;
+ ///
+ /// let mut u8_keyed_builder = StringDictionaryBuilder::<UInt8Type>::new();
+ /// u8_keyed_builder.append("def").unwrap();
+ /// u8_keyed_builder.append_null();
+ /// u8_keyed_builder.append("abc").unwrap();
+ ///
+ /// // for some reason, we decide we need to upgrade the key type
+ /// let mut u16_keyed_builder =
StringDictionaryBuilder::<UInt16Type>::try_new_from_builder(u8_keyed_builder).unwrap();
+ /// let dictionary_array = u16_keyed_builder.finish();
+ /// let keys = dictionary_array.keys();
+ ///
+ /// assert_eq!(keys, &UInt16Array::from(vec![Some(0), None, Some(1)]));
+ /// ```
+ pub fn try_new_from_builder<K2>(
+ mut source: GenericByteDictionaryBuilder<K2, T>,
+ ) -> Result<Self, ArrowError>
+ where
+ K::Native: NumCast,
+ K2: ArrowDictionaryKeyType,
+ K2::Native: NumCast,
+ {
+ let state = source.state;
+ let dedup = source.dedup;
+ let values_builder = source.values_builder;
+
+ let source_keys = source.keys_builder.finish();
+ let new_keys: PrimitiveArray<K> = source_keys.try_unary(|value| {
+ num::cast::cast::<K2::Native, K::Native>(value).ok_or_else(|| {
+ ArrowError::CastError(format!(
+ "Can't cast dictionary keys from source type {:?} to type
{:?}",
+ K2::DATA_TYPE,
+ K::DATA_TYPE
+ ))
+ })
+ })?;
+
+ // drop source key here because currently source_keys and new_keys are
holding reference to
Review Comment:
interesting. makes sense to me
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]