alamb commented on code in PR #14504:
URL: https://github.com/apache/datafusion/pull/14504#discussion_r1945294378


##########
datafusion/physical-plan/src/aggregates/group_values/null_builder.rs:
##########
@@ -15,120 +15,78 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow::array::BooleanBufferBuilder;
+use arrow::array::NullBufferBuilder;
 use arrow::buffer::NullBuffer;
 
 /// Builder for an (optional) null mask
 ///
 /// Optimized for avoid creating the bitmask when all values are non-null
 #[derive(Debug)]
-pub(crate) enum MaybeNullBufferBuilder {
-    ///  seen `row_count` rows but no nulls yet
-    NoNulls { row_count: usize },
-    /// have at least one null value
-    ///
+pub(crate) struct MaybeNullBufferBuilder {
     /// Note this is an Arrow *VALIDITY* buffer (so it is false for nulls, true
     /// for non-nulls)
-    Nulls(BooleanBufferBuilder),
+    nulls: NullBufferBuilder,
 }
 
 impl MaybeNullBufferBuilder {
     /// Create a new builder
     pub fn new() -> Self {
-        Self::NoNulls { row_count: 0 }
+        Self {
+            nulls: NullBufferBuilder::new(0),
+        }
     }
 
     /// Return true if the row at index `row` is null
     pub fn is_null(&self, row: usize) -> bool {
-        match self {
-            Self::NoNulls { .. } => false,
+        match self.nulls.as_slice() {
             // validity mask means a unset bit is NULL
-            Self::Nulls(builder) => !builder.get_bit(row),
+            Some(_) => !self.nulls.is_valid(row),
+            None => false,
         }
     }
 
     /// Set the nullness of the next row to `is_null`
     ///
-    /// num_values is the current length of the rows being tracked
-    ///
     /// If `value` is true, the row is null.
     /// If `value` is false, the row is non null
     pub fn append(&mut self, is_null: bool) {
-        match self {
-            Self::NoNulls { row_count } if is_null => {
-                // have seen no nulls so far, this is the  first null,
-                // need to create the nulls buffer for all currently valid 
values
-                // alloc 2x the need given we push a new but immediately
-                let mut nulls = BooleanBufferBuilder::new(*row_count * 2);
-                nulls.append_n(*row_count, true);
-                nulls.append(false);
-                *self = Self::Nulls(nulls);
-            }
-            Self::NoNulls { row_count } => {
-                *row_count += 1;
-            }
-            Self::Nulls(builder) => builder.append(!is_null),
-        }
+        self.nulls.append(!is_null)
     }
 
     pub fn append_n(&mut self, n: usize, is_null: bool) {
-        match self {
-            Self::NoNulls { row_count } if is_null => {
-                // have seen no nulls so far, this is the  first null,
-                // need to create the nulls buffer for all currently valid 
values
-                // alloc 2x the need given we push a new but immediately
-                let mut nulls = BooleanBufferBuilder::new(*row_count * 2);
-                nulls.append_n(*row_count, true);
-                nulls.append_n(n, false);
-                *self = Self::Nulls(nulls);
-            }
-            Self::NoNulls { row_count } => {
-                *row_count += n;
-            }
-            Self::Nulls(builder) => builder.append_n(n, !is_null),
+        if is_null {
+            self.nulls.append_n_nulls(n);
+        } else {
+            self.nulls.append_n_non_nulls(n);
         }
     }
 
     /// return the number of heap allocated bytes used by this structure to 
store boolean values
     pub fn allocated_size(&self) -> usize {
-        match self {
-            Self::NoNulls { .. } => 0,
-            // BooleanBufferBuilder builder::capacity returns capacity in bits 
(not bytes)
-            Self::Nulls(builder) => builder.capacity() / 8,
-        }
+        // NullBufferBuilder builder::allocated_size returns capacity in bits

Review Comment:
   Indeed -- I verified this is the case and added some tests here:
   - https://github.com/apache/arrow-rs/pull/7089



##########
datafusion/physical-plan/src/aggregates/group_values/null_builder.rs:
##########
@@ -15,120 +15,78 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow::array::BooleanBufferBuilder;
+use arrow::array::NullBufferBuilder;
 use arrow::buffer::NullBuffer;
 
 /// Builder for an (optional) null mask
 ///
 /// Optimized for avoid creating the bitmask when all values are non-null
 #[derive(Debug)]
-pub(crate) enum MaybeNullBufferBuilder {
-    ///  seen `row_count` rows but no nulls yet
-    NoNulls { row_count: usize },
-    /// have at least one null value
-    ///
+pub(crate) struct MaybeNullBufferBuilder {
     /// Note this is an Arrow *VALIDITY* buffer (so it is false for nulls, true
     /// for non-nulls)
-    Nulls(BooleanBufferBuilder),
+    nulls: NullBufferBuilder,
 }
 
 impl MaybeNullBufferBuilder {
     /// Create a new builder
     pub fn new() -> Self {
-        Self::NoNulls { row_count: 0 }
+        Self {
+            nulls: NullBufferBuilder::new(0),
+        }
     }
 
     /// Return true if the row at index `row` is null
     pub fn is_null(&self, row: usize) -> bool {
-        match self {
-            Self::NoNulls { .. } => false,
+        match self.nulls.as_slice() {
             // validity mask means a unset bit is NULL
-            Self::Nulls(builder) => !builder.get_bit(row),
+            Some(_) => !self.nulls.is_valid(row),
+            None => false,
         }
     }
 
     /// Set the nullness of the next row to `is_null`
     ///
-    /// num_values is the current length of the rows being tracked
-    ///
     /// If `value` is true, the row is null.
     /// If `value` is false, the row is non null
     pub fn append(&mut self, is_null: bool) {
-        match self {
-            Self::NoNulls { row_count } if is_null => {
-                // have seen no nulls so far, this is the  first null,
-                // need to create the nulls buffer for all currently valid 
values
-                // alloc 2x the need given we push a new but immediately
-                let mut nulls = BooleanBufferBuilder::new(*row_count * 2);
-                nulls.append_n(*row_count, true);
-                nulls.append(false);
-                *self = Self::Nulls(nulls);
-            }
-            Self::NoNulls { row_count } => {
-                *row_count += 1;
-            }
-            Self::Nulls(builder) => builder.append(!is_null),
-        }
+        self.nulls.append(!is_null)
     }
 
     pub fn append_n(&mut self, n: usize, is_null: bool) {
-        match self {
-            Self::NoNulls { row_count } if is_null => {
-                // have seen no nulls so far, this is the  first null,
-                // need to create the nulls buffer for all currently valid 
values
-                // alloc 2x the need given we push a new but immediately
-                let mut nulls = BooleanBufferBuilder::new(*row_count * 2);
-                nulls.append_n(*row_count, true);
-                nulls.append_n(n, false);
-                *self = Self::Nulls(nulls);
-            }
-            Self::NoNulls { row_count } => {
-                *row_count += n;
-            }
-            Self::Nulls(builder) => builder.append_n(n, !is_null),
+        if is_null {
+            self.nulls.append_n_nulls(n);
+        } else {
+            self.nulls.append_n_non_nulls(n);
         }
     }
 
     /// return the number of heap allocated bytes used by this structure to 
store boolean values
     pub fn allocated_size(&self) -> usize {
-        match self {
-            Self::NoNulls { .. } => 0,
-            // BooleanBufferBuilder builder::capacity returns capacity in bits 
(not bytes)
-            Self::Nulls(builder) => builder.capacity() / 8,
-        }
+        // NullBufferBuilder builder::allocated_size returns capacity in bits

Review Comment:
   I actually looked into it more -- I actually think `allocated_size` returns 
8* the actual capacity



##########
datafusion/physical-plan/src/aggregates/group_values/null_builder.rs:
##########
@@ -15,120 +15,78 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow::array::BooleanBufferBuilder;
+use arrow::array::NullBufferBuilder;
 use arrow::buffer::NullBuffer;
 
 /// Builder for an (optional) null mask
 ///
 /// Optimized for avoid creating the bitmask when all values are non-null
 #[derive(Debug)]
-pub(crate) enum MaybeNullBufferBuilder {
-    ///  seen `row_count` rows but no nulls yet
-    NoNulls { row_count: usize },
-    /// have at least one null value
-    ///
+pub(crate) struct MaybeNullBufferBuilder {
     /// Note this is an Arrow *VALIDITY* buffer (so it is false for nulls, true
     /// for non-nulls)
-    Nulls(BooleanBufferBuilder),
+    nulls: NullBufferBuilder,
 }
 
 impl MaybeNullBufferBuilder {
     /// Create a new builder
     pub fn new() -> Self {
-        Self::NoNulls { row_count: 0 }
+        Self {
+            nulls: NullBufferBuilder::new(0),
+        }
     }
 
     /// Return true if the row at index `row` is null
     pub fn is_null(&self, row: usize) -> bool {
-        match self {
-            Self::NoNulls { .. } => false,
+        match self.nulls.as_slice() {
             // validity mask means a unset bit is NULL
-            Self::Nulls(builder) => !builder.get_bit(row),
+            Some(_) => !self.nulls.is_valid(row),
+            None => false,
         }
     }
 
     /// Set the nullness of the next row to `is_null`
     ///
-    /// num_values is the current length of the rows being tracked
-    ///
     /// If `value` is true, the row is null.
     /// If `value` is false, the row is non null
     pub fn append(&mut self, is_null: bool) {
-        match self {
-            Self::NoNulls { row_count } if is_null => {
-                // have seen no nulls so far, this is the  first null,
-                // need to create the nulls buffer for all currently valid 
values
-                // alloc 2x the need given we push a new but immediately
-                let mut nulls = BooleanBufferBuilder::new(*row_count * 2);
-                nulls.append_n(*row_count, true);
-                nulls.append(false);
-                *self = Self::Nulls(nulls);
-            }
-            Self::NoNulls { row_count } => {
-                *row_count += 1;
-            }
-            Self::Nulls(builder) => builder.append(!is_null),
-        }
+        self.nulls.append(!is_null)
     }
 
     pub fn append_n(&mut self, n: usize, is_null: bool) {
-        match self {
-            Self::NoNulls { row_count } if is_null => {
-                // have seen no nulls so far, this is the  first null,
-                // need to create the nulls buffer for all currently valid 
values
-                // alloc 2x the need given we push a new but immediately
-                let mut nulls = BooleanBufferBuilder::new(*row_count * 2);
-                nulls.append_n(*row_count, true);
-                nulls.append_n(n, false);
-                *self = Self::Nulls(nulls);
-            }
-            Self::NoNulls { row_count } => {
-                *row_count += n;
-            }
-            Self::Nulls(builder) => builder.append_n(n, !is_null),
+        if is_null {
+            self.nulls.append_n_nulls(n);
+        } else {
+            self.nulls.append_n_non_nulls(n);
         }
     }
 
     /// return the number of heap allocated bytes used by this structure to 
store boolean values
     pub fn allocated_size(&self) -> usize {
-        match self {
-            Self::NoNulls { .. } => 0,
-            // BooleanBufferBuilder builder::capacity returns capacity in bits 
(not bytes)
-            Self::Nulls(builder) => builder.capacity() / 8,
-        }
+        // NullBufferBuilder builder::allocated_size returns capacity in bits

Review Comment:
   Nice find



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to