alamb commented on code in PR #3244:
URL: https://github.com/apache/arrow-rs/pull/3244#discussion_r1038991721
##########
arrow-array/src/array/struct_array.rs:
##########
@@ -568,7 +562,9 @@ mod tests {
}
#[test]
- #[should_panic(expected = "non-nullable field cannot have null values")]
+ #[should_panic(
+ expected = "non-nullable child of type Int32 contains nulls not
present in parent Struct"
Review Comment:
👍
##########
arrow-array/src/array/mod.rs:
##########
@@ -916,7 +916,7 @@ mod tests {
#[test]
fn test_null_struct() {
let struct_type =
- DataType::Struct(vec![Field::new("data", DataType::Int64, true)]);
+ DataType::Struct(vec![Field::new("data", DataType::Int64, false)]);
Review Comment:
```suggestion
// It is possible to create a null struct containing a
non-nullable child 🤯
// see https://github.com/apache/arrow-rs/pull/3244 for details
DataType::Struct(vec![Field::new("data", DataType::Int64,
false)]);
```
It might be a good idea to leave some rationale for this very non obvious
corner case
##########
arrow-array/src/array/string_array.rs:
##########
@@ -609,7 +609,7 @@ mod tests {
let offsets = [0, 5, 10].map(|n| O::from_usize(n).unwrap());
let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Box::new(
- Field::new("item", DataType::UInt8, false),
+ Field::new("item", DataType::UInt8, true),
Review Comment:
```suggestion
// It is possible to create a null struct containing a
non-nullable child 🤯
// see https://github.com/apache/arrow-rs/pull/3244 for details
Field::new("item", DataType::UInt8, true),
```
##########
arrow-data/src/data.rs:
##########
@@ -1012,9 +1020,100 @@ impl ArrayData {
self.null_count, actual_null_count
)));
}
+
+ // In general non-nullable children should not contain nulls, however,
for certain
+ // types, such as StructArray and FixedSizeList, nulls in the parent
take up
+ // space in the child. As such we permit nulls in the children in the
corresponding
+ // positions for such types
+ match &self.data_type {
+ DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _)
=> {
+ if !f.is_nullable() {
+ self.validate_non_nullable(None, 0, &self.child_data[0])?
+ }
+ }
+ DataType::FixedSizeList(field, len) => {
+ let child = &self.child_data[0];
+ if !field.is_nullable() {
+ match nulls {
+ Some(nulls) => {
+ let element_len = *len as usize;
+ let mut buffer =
+ MutableBuffer::new_null(element_len *
self.len);
+
+ for i in 0..self.len {
+ if !bit_util::get_bit(nulls.as_ref(),
self.offset + i) {
+ continue;
+ }
+ for j in 0..element_len {
+ bit_util::set_bit(
+ buffer.as_mut(),
+ i * element_len + j,
+ )
+ }
+ }
+ let mask = buffer.into();
+ self.validate_non_nullable(Some(&mask), 0, child)?;
Review Comment:
I think some comments about this mask (what it is and why it is built) might
be helpful
##########
arrow-data/src/data.rs:
##########
@@ -1012,9 +1020,100 @@ impl ArrayData {
self.null_count, actual_null_count
)));
}
+
+ // In general non-nullable children should not contain nulls, however,
for certain
+ // types, such as StructArray and FixedSizeList, nulls in the parent
take up
+ // space in the child. As such we permit nulls in the children in the
corresponding
+ // positions for such types
+ match &self.data_type {
+ DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _)
=> {
+ if !f.is_nullable() {
+ self.validate_non_nullable(None, 0, &self.child_data[0])?
+ }
+ }
+ DataType::FixedSizeList(field, len) => {
+ let child = &self.child_data[0];
+ if !field.is_nullable() {
+ match nulls {
+ Some(nulls) => {
+ let element_len = *len as usize;
+ let mut buffer =
+ MutableBuffer::new_null(element_len *
self.len);
+
+ for i in 0..self.len {
+ if !bit_util::get_bit(nulls.as_ref(),
self.offset + i) {
+ continue;
+ }
+ for j in 0..element_len {
+ bit_util::set_bit(
+ buffer.as_mut(),
+ i * element_len + j,
+ )
+ }
+ }
+ let mask = buffer.into();
+ self.validate_non_nullable(Some(&mask), 0, child)?;
+ }
+ None => self.validate_non_nullable(None, 0, child)?,
+ }
+ }
+ }
+ DataType::Struct(fields) => {
+ for (field, child) in fields.iter().zip(&self.child_data) {
+ if !field.is_nullable() {
+ self.validate_non_nullable(nulls, self.offset, child)?
+ }
+ }
+ }
+ _ => {}
+ }
+
Ok(())
}
+ /// Verifies that `child` contains no nulls not present in `mask`
+ fn validate_non_nullable(
+ &self,
+ mask: Option<&Buffer>,
+ offset: usize,
+ data: &ArrayData,
+ ) -> Result<(), ArrowError> {
+ let mask = match mask {
+ Some(mask) => mask.as_ref(),
+ None => return match data.null_count {
+ 0 => Ok(()),
+ _ => Err(ArrowError::InvalidArgumentError(format!(
+ "non-nullable child of type {} contains nulls not present
in parent {}",
Review Comment:
I suggest making these two messages different so it is easier to find what
error is being hit by `grep` if someone hits the error
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]