alamb commented on code in PR #3546: URL: https://github.com/apache/arrow-rs/pull/3546#discussion_r1072929874
########## arrow-select/src/concat.rs: ########## @@ -30,24 +30,27 @@ //! assert_eq!(arr.len(), 3); //! ``` +use arrow_array::types::*; use arrow_array::*; +use arrow_buffer::ArrowNativeType; use arrow_data::transform::{Capacities, MutableArrayData}; -use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType, SchemaRef}; -fn compute_str_values_length<Offset: OffsetSizeTrait>(arrays: &[&ArrayData]) -> usize { - arrays - .iter() - .map(|&data| { - // get the length of the value buffer - let buf_len = data.buffers()[1].len(); - // find the offset of the buffer - // this returns a slice of offsets, starting from the offset of the array - // so we can take the first value - let offset = data.buffer::<Offset>(0)[0]; - buf_len - offset.to_usize().unwrap() - }) - .sum() +fn binary_capacity<T: ByteArrayType>(arrays: &[&dyn Array]) -> Capacities { + let mut item_capacity = 0; + let mut bytes_capacity = 0; + for array in arrays { + let a = array + .as_any() + .downcast_ref::<GenericByteArray<T>>() + .unwrap(); + + let offsets = a.value_offsets(); + bytes_capacity += offsets[offsets.len() - 1].as_usize() - offsets[0].as_usize(); Review Comment: that is pretty clever. Do you need to check that `offsets.len()` > 0 (aka that this is a non zero length array)? ########## arrow-select/src/concat.rs: ########## @@ -665,4 +651,54 @@ mod tests { "Invalid argument error: batches[1] schema is different with argument schema.", ); } + + #[test] + fn concat_capacity() { + let a = Int32Array::from_iter_values(0..100); + let b = Int32Array::from_iter_values(10..20); + let a = concat(&[&a, &b]).unwrap(); + let data = a.data(); + assert_eq!(data.buffers()[0].len(), 440); + assert_eq!(data.buffers()[0].capacity(), 448); // Nearest multiple of 64 + + let a = concat(&[&a.slice(10, 20), &b]).unwrap(); + let data = a.data(); + assert_eq!(data.buffers()[0].len(), 120); + assert_eq!(data.buffers()[0].capacity(), 128); // Nearest multiple of 64 + + let a = StringArray::from_iter_values(std::iter::repeat("foo").take(100)); + let b = StringArray::from(vec!["bingo", "bongo", "lorem", ""]); + + let a = concat(&[&a, &b]).unwrap(); + let data = a.data(); + assert_eq!(data.buffers()[0].len(), 420); Review Comment: Is this 420 because it is 100 offsets + 4 --> 104, and then each takes 4 bytes --> 416 --> round up to multiple if 64? ########## arrow-select/src/concat.rs: ########## @@ -665,4 +651,54 @@ mod tests { "Invalid argument error: batches[1] schema is different with argument schema.", ); } + + #[test] + fn concat_capacity() { + let a = Int32Array::from_iter_values(0..100); + let b = Int32Array::from_iter_values(10..20); + let a = concat(&[&a, &b]).unwrap(); + let data = a.data(); + assert_eq!(data.buffers()[0].len(), 440); + assert_eq!(data.buffers()[0].capacity(), 448); // Nearest multiple of 64 + + let a = concat(&[&a.slice(10, 20), &b]).unwrap(); + let data = a.data(); + assert_eq!(data.buffers()[0].len(), 120); + assert_eq!(data.buffers()[0].capacity(), 128); // Nearest multiple of 64 + + let a = StringArray::from_iter_values(std::iter::repeat("foo").take(100)); + let b = StringArray::from(vec!["bingo", "bongo", "lorem", ""]); + + let a = concat(&[&a, &b]).unwrap(); + let data = a.data(); + assert_eq!(data.buffers()[0].len(), 420); + assert_eq!(data.buffers()[0].capacity(), 448); // Nearest multiple of 64 + assert_eq!(data.buffers()[1].len(), 315); Review Comment: 315 = len("foo") + len("bingo") + len("bongo") + len("lorem") + len("") ? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org