alamb commented on code in PR #9058:
URL: https://github.com/apache/arrow-rs/pull/9058#discussion_r2651798669


##########
arrow-array/src/array/byte_view_array.rs:
##########
@@ -941,15 +941,15 @@ impl<'a, T: ByteViewType + ?Sized> IntoIterator for &'a 
GenericByteViewArray<T>
 }
 
 impl<T: ByteViewType + ?Sized> From<ArrayData> for GenericByteViewArray<T> {
-    fn from(value: ArrayData) -> Self {
-        let views = value.buffers()[0].clone();
-        let views = ScalarBuffer::new(views, value.offset(), value.len());
-        let buffers = value.buffers()[1..].to_vec();

Review Comment:
   this call to `to_vec()` creates a new vec (aka is a new allocation) while 
using the new API reuses the existing allocation
   
   I am pretty sure I was seeing this allocation show up in my profiling of the 
parquet reader for clickbench



##########
arrow-array/src/array/boolean_array.rs:
##########
@@ -387,24 +387,21 @@ impl From<Vec<Option<bool>>> for BooleanArray {
 
 impl From<ArrayData> for BooleanArray {
     fn from(data: ArrayData) -> Self {
+        let (data_type, len, nulls, offset, mut buffers, _child_data) = 
data.into_parts();
         assert_eq!(
-            data.data_type(),
-            &DataType::Boolean,
-            "BooleanArray expected ArrayData with type {} got {}",
+            data_type,
             DataType::Boolean,
-            data.data_type()
+            "BooleanArray expected ArrayData with type Boolean got 
{data_type:?}",
         );
         assert_eq!(
-            data.buffers().len(),
+            buffers.len(),
             1,
             "BooleanArray data should contain a single buffer only (values 
buffer)"
         );
-        let values = BooleanBuffer::new(data.buffers()[0].clone(), 
data.offset(), data.len());
+        let buffer = buffers.pop().expect("checked above");

Review Comment:
   this basically shows the new pattern -- rather than cloning parts of 
`ArrayData`, instead simply get the relevant structures that are needed 
directly as it already gets an owned `ArrayData`
   
   For BooleanArray this probably saves some `Arc::clone`s which isn't a big 
deal. For more complex types like StructArray and GenericByteViewArray it also 
saves some Vec allocations which I do think is a bigger deal



##########
arrow-array/src/array/dictionary_array.rs:
##########
@@ -603,17 +606,17 @@ impl<T: ArrowDictionaryKeyType> From<ArrayData> for 
DictionaryArray<T> {
                 key_data_type
             );
 
-            let values = make_array(data.child_data()[0].clone());

Review Comment:
   here is an example where the entire child_data `ArrayData` was cloned (this 
potentially includes two `Vec`s )



##########
arrow-array/src/array/list_array.rs:
##########
@@ -479,23 +479,26 @@ impl<OffsetSize: OffsetSizeTrait> 
From<FixedSizeListArray> for GenericListArray<
 
 impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> {
     fn try_new_from_array_data(data: ArrayData) -> Result<Self, ArrowError> {
-        if data.buffers().len() != 1 {
+        let (data_type, len, nulls, offset, mut buffers, mut child_data) = 
data.into_parts();
+
+        if buffers.len() != 1 {
             return Err(ArrowError::InvalidArgumentError(format!(
                 "ListArray data should contain a single buffer only (value 
offsets), had {}",
-                data.buffers().len()
+                buffers.len()
             )));
         }
+        let buffer = buffers.pop().expect("checked above");
 
-        if data.child_data().len() != 1 {
+        if child_data.len() != 1 {
             return Err(ArrowError::InvalidArgumentError(format!(
                 "ListArray should contain a single child array (values array), 
had {}",
-                data.child_data().len()
+                child_data.len()
             )));
         }
 
-        let values = data.child_data()[0].clone();

Review Comment:
   here is another example of not having to clone (and allocate `Vec`s) an 
entire `ArrayData`



##########
arrow-data/src/data.rs:
##########
@@ -351,6 +354,33 @@ impl ArrayData {
         Ok(new_self)
     }
 
+    /// Return the constituent parts of this ArrayData

Review Comment:
   Here is the key new API in ArrayData to return all its fields



##########
arrow-array/src/array/struct_array.rs:
##########
@@ -347,25 +347,26 @@ impl StructArray {
 
 impl From<ArrayData> for StructArray {
     fn from(data: ArrayData) -> Self {
-        let parent_offset = data.offset();
-        let parent_len = data.len();
+        let (data_type, len, nulls, offset, _buffers, child_data) = 
data.into_parts();
 
-        let fields = data
-            .child_data()
-            .iter()
+        let parent_offset = offset;
+        let parent_len = len;
+
+        let fields = child_data
+            .into_iter()
             .map(|cd| {
                 if parent_offset != 0 || parent_len != cd.len() {
                     make_array(cd.slice(parent_offset, parent_len))
                 } else {
-                    make_array(cd.clone())

Review Comment:
   this clone in particular is called for every batch created by the Parquet 
reader -- and each field (aka parquet column) likely has at least one Vec (the 
buffers)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to