alamb commented on code in PR #3546:
URL: https://github.com/apache/arrow-rs/pull/3546#discussion_r1072929874


##########
arrow-select/src/concat.rs:
##########
@@ -30,24 +30,27 @@
 //! assert_eq!(arr.len(), 3);
 //! ```
 
+use arrow_array::types::*;
 use arrow_array::*;
+use arrow_buffer::ArrowNativeType;
 use arrow_data::transform::{Capacities, MutableArrayData};
-use arrow_data::ArrayData;
 use arrow_schema::{ArrowError, DataType, SchemaRef};
 
-fn compute_str_values_length<Offset: OffsetSizeTrait>(arrays: &[&ArrayData]) 
-> usize {
-    arrays
-        .iter()
-        .map(|&data| {
-            // get the length of the value buffer
-            let buf_len = data.buffers()[1].len();
-            // find the offset of the buffer
-            // this returns a slice of offsets, starting from the offset of 
the array
-            // so we can take the first value
-            let offset = data.buffer::<Offset>(0)[0];
-            buf_len - offset.to_usize().unwrap()
-        })
-        .sum()
+fn binary_capacity<T: ByteArrayType>(arrays: &[&dyn Array]) -> Capacities {
+    let mut item_capacity = 0;
+    let mut bytes_capacity = 0;
+    for array in arrays {
+        let a = array
+            .as_any()
+            .downcast_ref::<GenericByteArray<T>>()
+            .unwrap();
+
+        let offsets = a.value_offsets();
+        bytes_capacity += offsets[offsets.len() - 1].as_usize() - 
offsets[0].as_usize();

Review Comment:
   that is pretty clever. 
   
   Do you need to check that `offsets.len()` > 0 (aka that this is a non zero 
length array)?



##########
arrow-select/src/concat.rs:
##########
@@ -665,4 +651,54 @@ mod tests {
             "Invalid argument error: batches[1] schema is different with 
argument schema.",
         );
     }
+
+    #[test]
+    fn concat_capacity() {
+        let a = Int32Array::from_iter_values(0..100);
+        let b = Int32Array::from_iter_values(10..20);
+        let a = concat(&[&a, &b]).unwrap();
+        let data = a.data();
+        assert_eq!(data.buffers()[0].len(), 440);
+        assert_eq!(data.buffers()[0].capacity(), 448); // Nearest multiple of 
64
+
+        let a = concat(&[&a.slice(10, 20), &b]).unwrap();
+        let data = a.data();
+        assert_eq!(data.buffers()[0].len(), 120);
+        assert_eq!(data.buffers()[0].capacity(), 128); // Nearest multiple of 
64
+
+        let a = 
StringArray::from_iter_values(std::iter::repeat("foo").take(100));
+        let b = StringArray::from(vec!["bingo", "bongo", "lorem", ""]);
+
+        let a = concat(&[&a, &b]).unwrap();
+        let data = a.data();
+        assert_eq!(data.buffers()[0].len(), 420);

Review Comment:
   Is this 420 because it is 100 offsets + 4 --> 104, and then each takes 4 
bytes --> 416 --> round up to multiple if 64?



##########
arrow-select/src/concat.rs:
##########
@@ -665,4 +651,54 @@ mod tests {
             "Invalid argument error: batches[1] schema is different with 
argument schema.",
         );
     }
+
+    #[test]
+    fn concat_capacity() {
+        let a = Int32Array::from_iter_values(0..100);
+        let b = Int32Array::from_iter_values(10..20);
+        let a = concat(&[&a, &b]).unwrap();
+        let data = a.data();
+        assert_eq!(data.buffers()[0].len(), 440);
+        assert_eq!(data.buffers()[0].capacity(), 448); // Nearest multiple of 
64
+
+        let a = concat(&[&a.slice(10, 20), &b]).unwrap();
+        let data = a.data();
+        assert_eq!(data.buffers()[0].len(), 120);
+        assert_eq!(data.buffers()[0].capacity(), 128); // Nearest multiple of 
64
+
+        let a = 
StringArray::from_iter_values(std::iter::repeat("foo").take(100));
+        let b = StringArray::from(vec!["bingo", "bongo", "lorem", ""]);
+
+        let a = concat(&[&a, &b]).unwrap();
+        let data = a.data();
+        assert_eq!(data.buffers()[0].len(), 420);
+        assert_eq!(data.buffers()[0].capacity(), 448); // Nearest multiple of 
64
+        assert_eq!(data.buffers()[1].len(), 315);

Review Comment:
   315 = len("foo") + len("bingo") + len("bongo") + len("lorem") + len("") ?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to