alamb commented on code in PR #7611:
URL: https://github.com/apache/arrow-rs/pull/7611#discussion_r2143611512


##########
arrow-array/src/builder/generic_bytes_dictionary_builder.rs:
##########
@@ -152,6 +155,65 @@ where
             values_builder,
         })
     }
+
+    /// Creates a new `GenericByteDictionaryBuilder` from the existing builder 
with the same
+    /// keys and values, but with a new data type for the keys.
+    ///
+    /// # example
+    /// ```
+    /// # use arrow_array::builder::StringDictionaryBuilder;
+    /// # use arrow_array::types::{UInt8Type, UInt16Type};
+    /// # use arrow_array::UInt16Array;;
+    ///
+    /// let mut u8_keyed_builder = StringDictionaryBuilder::<UInt8Type>::new();
+    /// u8_keyed_builder.append("def").unwrap();
+    /// u8_keyed_builder.append_null();
+    /// u8_keyed_builder.append("abc").unwrap();
+    ///
+    /// // for some reason, we decide we need to upgrade the key type
+    /// let mut u16_keyed_builder = 
StringDictionaryBuilder::<UInt16Type>::try_new_from_builder(u8_keyed_builder).unwrap();
+    /// let dictionary_array = u16_keyed_builder.finish();
+    /// let keys = dictionary_array.keys();
+    ///
+    /// assert_eq!(keys, &UInt16Array::from(vec![Some(0), None, Some(1)]));
+    /// ```
+    pub fn try_new_from_builder<K2>(
+        mut source: GenericByteDictionaryBuilder<K2, T>,
+    ) -> Result<Self, ArrowError>
+    where
+        K::Native: NumCast,
+        K2: ArrowDictionaryKeyType,
+        K2::Native: NumCast,
+    {
+        let state = source.state;
+        let dedup = source.dedup;
+        let values_builder = source.values_builder;
+
+        let source_keys = source.keys_builder.finish();
+        let new_keys: PrimitiveArray<K> = source_keys.try_unary(|value| {
+            num::cast::cast::<K2::Native, K::Native>(value).ok_or_else(|| {

Review Comment:
   Eventually we could probably make this even faster by special casing the 
types that we know are infallible (e.g. `Int8` keys to `Int16` keys will never 
error. I think the general purpose one now looks good though



##########
arrow-array/src/builder/generic_bytes_dictionary_builder.rs:
##########
@@ -614,6 +676,95 @@ mod tests {
         ]);
     }
 
+    fn _test_try_new_from_builder_generic_for_key_types<K1, K2, T>(values: 
Vec<&T::Native>)
+    where
+        K1: ArrowDictionaryKeyType,
+        K1::Native: NumCast,
+        K2: ArrowDictionaryKeyType,
+        K2::Native: NumCast + From<u8>,
+        T: ByteArrayType,
+        <T as ByteArrayType>::Native: PartialEq + AsRef<<T as 
ByteArrayType>::Native>,
+    {
+        let mut source = GenericByteDictionaryBuilder::<K1, T>::new();
+        source.append(values[0]).unwrap();
+        source.append(values[1]).unwrap();
+        source.append_null();
+        source.append(values[2]).unwrap();
+
+        let mut result =
+            GenericByteDictionaryBuilder::<K2, 
T>::try_new_from_builder(source).unwrap();
+        let array = result.finish();
+
+        let mut expected_keys_builder = PrimitiveBuilder::<K2>::new();
+        expected_keys_builder
+            .append_value(<<K2 as ArrowPrimitiveType>::Native as 
From<u8>>::from(0u8));
+        expected_keys_builder
+            .append_value(<<K2 as ArrowPrimitiveType>::Native as 
From<u8>>::from(1u8));
+        expected_keys_builder.append_null();
+        expected_keys_builder
+            .append_value(<<K2 as ArrowPrimitiveType>::Native as 
From<u8>>::from(2u8));
+
+        let av = array.values();
+        let ava: &GenericByteArray<T> = 
av.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
+        assert_eq!(ava.value(0), values[0]);
+        assert_eq!(ava.value(1), values[1]);
+        assert_eq!(ava.value(2), values[2]);
+    }
+
+    fn test_try_new_from_builder<T>(values: Vec<&T::Native>)
+    where
+        T: ByteArrayType,
+        <T as ByteArrayType>::Native: PartialEq + AsRef<<T as 
ByteArrayType>::Native>,
+    {
+        // test cast to bigger size unsigned
+        _test_try_new_from_builder_generic_for_key_types::<UInt8Type, 
UInt16Type, T>(
+            values.clone(),
+        );
+        // test cast going to smaller size unsigned
+        _test_try_new_from_builder_generic_for_key_types::<UInt16Type, 
UInt8Type, T>(
+            values.clone(),
+        );
+        // test cast going to bigger size signed
+        _test_try_new_from_builder_generic_for_key_types::<Int8Type, 
Int16Type, T>(values.clone());
+        // test cast going to smaller size signed
+        _test_try_new_from_builder_generic_for_key_types::<Int32Type, 
Int16Type, T>(values.clone());
+        // test going from signed to signed for different size changes
+        _test_try_new_from_builder_generic_for_key_types::<UInt8Type, 
Int16Type, T>(values.clone());
+        _test_try_new_from_builder_generic_for_key_types::<Int8Type, 
UInt8Type, T>(values.clone());
+        _test_try_new_from_builder_generic_for_key_types::<Int8Type, 
UInt16Type, T>(values.clone());
+        _test_try_new_from_builder_generic_for_key_types::<Int32Type, 
Int16Type, T>(values.clone());
+    }
+
+    #[test]
+    fn test_string_dictionary_builder_try_new_from_builder() {
+        test_try_new_from_builder::<GenericStringType<i32>>(vec!["abc", "def", 
"ghi"]);
+    }
+
+    #[test]
+    fn test_binary_dictionary_builder_try_new_from_builder() {
+        test_try_new_from_builder::<GenericBinaryType<i32>>(vec![b"abc", 
b"def", b"ghi"]);
+    }
+
+    #[test]
+    fn test_try_new_from_builder_cast_fails() {

Review Comment:
   love it



##########
arrow-array/src/builder/generic_bytes_dictionary_builder.rs:
##########
@@ -152,6 +155,65 @@ where
             values_builder,
         })
     }
+
+    /// Creates a new `GenericByteDictionaryBuilder` from the existing builder 
with the same
+    /// keys and values, but with a new data type for the keys.
+    ///
+    /// # example

Review Comment:
   ```suggestion
       /// # Example
   ```



##########
arrow-array/src/builder/generic_bytes_dictionary_builder.rs:
##########
@@ -152,6 +155,65 @@ where
             values_builder,
         })
     }
+
+    /// Creates a new `GenericByteDictionaryBuilder` from the existing builder 
with the same
+    /// keys and values, but with a new data type for the keys.
+    ///
+    /// # example
+    /// ```
+    /// # use arrow_array::builder::StringDictionaryBuilder;
+    /// # use arrow_array::types::{UInt8Type, UInt16Type};
+    /// # use arrow_array::UInt16Array;;
+    ///
+    /// let mut u8_keyed_builder = StringDictionaryBuilder::<UInt8Type>::new();
+    /// u8_keyed_builder.append("def").unwrap();
+    /// u8_keyed_builder.append_null();
+    /// u8_keyed_builder.append("abc").unwrap();
+    ///
+    /// // for some reason, we decide we need to upgrade the key type

Review Comment:
   It might help here to explain / show the main usecase (e.g the builder has 
too many values)
   
   Also when I was poking around, it wasn't clear how a user would get the 
current number of values in the `values_builder` (to know that they needed to 
upgrade the index type) -- should we also add an API for accessing that?



##########
arrow-array/src/builder/generic_bytes_dictionary_builder.rs:
##########
@@ -152,6 +155,65 @@ where
             values_builder,
         })
     }
+
+    /// Creates a new `GenericByteDictionaryBuilder` from the existing builder 
with the same
+    /// keys and values, but with a new data type for the keys.
+    ///
+    /// # example
+    /// ```
+    /// # use arrow_array::builder::StringDictionaryBuilder;
+    /// # use arrow_array::types::{UInt8Type, UInt16Type};
+    /// # use arrow_array::UInt16Array;;
+    ///
+    /// let mut u8_keyed_builder = StringDictionaryBuilder::<UInt8Type>::new();
+    /// u8_keyed_builder.append("def").unwrap();
+    /// u8_keyed_builder.append_null();
+    /// u8_keyed_builder.append("abc").unwrap();
+    ///
+    /// // for some reason, we decide we need to upgrade the key type
+    /// let mut u16_keyed_builder = 
StringDictionaryBuilder::<UInt16Type>::try_new_from_builder(u8_keyed_builder).unwrap();
+    /// let dictionary_array = u16_keyed_builder.finish();
+    /// let keys = dictionary_array.keys();
+    ///
+    /// assert_eq!(keys, &UInt16Array::from(vec![Some(0), None, Some(1)]));
+    /// ```
+    pub fn try_new_from_builder<K2>(
+        mut source: GenericByteDictionaryBuilder<K2, T>,
+    ) -> Result<Self, ArrowError>
+    where
+        K::Native: NumCast,
+        K2: ArrowDictionaryKeyType,
+        K2::Native: NumCast,
+    {
+        let state = source.state;
+        let dedup = source.dedup;
+        let values_builder = source.values_builder;
+
+        let source_keys = source.keys_builder.finish();
+        let new_keys: PrimitiveArray<K> = source_keys.try_unary(|value| {
+            num::cast::cast::<K2::Native, K::Native>(value).ok_or_else(|| {
+                ArrowError::CastError(format!(
+                    "Can't cast dictionary keys from source type {:?} to type 
{:?}",
+                    K2::DATA_TYPE,
+                    K::DATA_TYPE
+                ))
+            })
+        })?;
+
+        // drop source key here because currently source_keys and new_keys are 
holding reference to

Review Comment:
   interesting. makes sense to me



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to