alamb commented on code in PR #4736:
URL: https://github.com/apache/arrow-rs/pull/4736#discussion_r1305888339


##########
arrow-row/src/lib.rs:
##########
@@ -1445,6 +1446,70 @@ unsafe fn decode_column(
     Ok(array)
 }
 
+macro_rules! downcast_dict {
+    ($array:ident, $key:ident) => {{
+        $array
+            .as_any()
+            .downcast_ref::<DictionaryArray<$key>>()
+            .unwrap()
+    }};
+}
+
+const LOW_CARDINALITY_THRESHOLD: usize = 10;
+
+#[derive(Debug)]
+pub struct CardinalityAwareRowConverter {
+    inner: RowConverter,
+    done: bool,
+}
+
+impl CardinalityAwareRowConverter {
+    pub fn new(fields: Vec<SortField>) -> Result<Self, ArrowError> {
+        Ok(Self {
+            inner: RowConverter::new(fields)?,
+            done: false,
+        })
+    }
+
+    pub fn size(&self) -> usize {
+        self.inner.size()
+    }
+
+    pub fn convert_rows(&self, rows: &Rows) -> Result<Vec<ArrayRef>, ArrowError> {
+        self.inner.convert_rows(rows)
+    }
+
+    pub fn convert_columns(
+        &mut self,
+        columns: &[ArrayRef]) -> Result<Rows, ArrowError> {
+        if !self.done {
+            for (i, col) in columns.iter().enumerate() {
+                if let DataType::Dictionary(k, _) = col.data_type() {
+                    // let cardinality = col.as_any().downcast_ref::<DictionaryArray<Int32Type>>().unwrap().values().len();
+                    let cardinality = match k.as_ref() {

Review Comment:
   I originally thought that we should base the decision of "is this a high cardinality column" on the "in use" cardinality of the dictionary (aka how many distinct key values there are -- as suggested on https://github.com/apache/arrow-datafusion/issues/7200#issuecomment-1690147397)
   
   However, I now realize that maybe the number of potential key values (aka the length of the values array) is actually a more robust predictor of being "high cardinality" (as the other values in the dictionary could be used in subsequent batches, perhaps) -- see the sketch contrasting the two metrics below.
   
   Do you have any opinion @tustvold?
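   
   For illustration, a minimal sketch of the two metrics being discussed (assuming an `Int32`-keyed dictionary; the helpers `potential_cardinality` and `in_use_cardinality` are hypothetical names, not part of this PR):
   
   ```rust
   use std::collections::HashSet;
   
   use arrow_array::{cast::AsArray, types::Int32Type, ArrayRef};
   
   /// "Potential" cardinality: the length of the dictionary's values array,
   /// regardless of how many entries the keys in this batch actually reference.
   fn potential_cardinality(col: &ArrayRef) -> usize {
       col.as_dictionary::<Int32Type>().values().len()
   }
   
   /// "In use" cardinality: the number of distinct (non-null) key values
   /// actually present in this batch.
   fn in_use_cardinality(col: &ArrayRef) -> usize {
       col.as_dictionary::<Int32Type>()
           .keys()
           .iter()
           .flatten() // skip nulls
           .collect::<HashSet<_>>()
           .len()
   }
   ```
   
   A dictionary can carry values that no key in the current batch references, so `in_use_cardinality(col) <= potential_cardinality(col)` always holds; the question is which one better predicts behavior across later batches.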



##########
arrow-row/src/lib.rs:
##########
@@ -1445,6 +1446,70 @@ unsafe fn decode_column(
     Ok(array)
 }
 
+macro_rules! downcast_dict {
+    ($array:ident, $key:ident) => {{
+        $array
+            .as_any()
+            .downcast_ref::<DictionaryArray<$key>>()
+            .unwrap()
+    }};
+}
+
+const LOW_CARDINALITY_THRESHOLD: usize = 10;
+
+#[derive(Debug)]
+pub struct CardinalityAwareRowConverter {
+    inner: RowConverter,
+    done: bool,
+}
+
+impl CardinalityAwareRowConverter {
+    pub fn new(fields: Vec<SortField>) -> Result<Self, ArrowError> {
+        Ok(Self {
+            inner: RowConverter::new(fields)?,
+            done: false,
+        })
+    }
+
+    pub fn size(&self) -> usize {
+        self.inner.size()
+    }
+
+    pub fn convert_rows(&self, rows: &Rows) -> Result<Vec<ArrayRef>, ArrowError> {
+        self.inner.convert_rows(rows)
+    }
+
+    pub fn convert_columns(
+        &mut self,
+        columns: &[ArrayRef]) -> Result<Rows, ArrowError> {
+        if !self.done {
+            for (i, col) in columns.iter().enumerate() {
+                if let DataType::Dictionary(k, _) = col.data_type() {
+                    // let cardinality = col.as_any().downcast_ref::<DictionaryArray<Int32Type>>().unwrap().values().len();
+                    let cardinality = match k.as_ref() {
+                        DataType::Int8 => downcast_dict!(col, Int8Type).values().len(),
+                        DataType::Int16 => downcast_dict!(col, Int16Type).values().len(),
+                        DataType::Int32 => downcast_dict!(col, Int32Type).values().len(),
+                        DataType::Int64 => downcast_dict!(col, Int64Type).values().len(),
+                        DataType::UInt8 => downcast_dict!(col, UInt8Type).values().len(),
+                        DataType::UInt16 => downcast_dict!(col, UInt16Type).values().len(),
+                        DataType::UInt32 => downcast_dict!(col, UInt32Type).values().len(),
+                        DataType::UInt64 => downcast_dict!(col, UInt64Type).values().len(),
+                        _ => unreachable!(),
+                    };
+
+                    if cardinality >= LOW_CARDINALITY_THRESHOLD {

Review Comment:
   this will effectively switch the encoding mid-stream, I think -- which will mean that the output can't be compared with previously created rows, which is not correct.
   
   I think the decision has to be made based on the first batch, and then that decision used for encoding all rows -- a sketch of one way to do that follows below.
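   
   One hypothetical way to structure that (a sketch, not this PR's code: it assumes `inner` becomes an `Option<RowConverter>` and a helper `choose_fields` that picks the sort fields by inspecting the first batch):
   
   ```rust
   pub fn convert_columns(&mut self, columns: &[ArrayRef]) -> Result<Rows, ArrowError> {
       if self.inner.is_none() {
           // Inspect cardinalities only on the *first* batch and build the
           // RowConverter once; the encoding decision is then frozen.
           let fields = self.choose_fields(columns)?;
           self.inner = Some(RowConverter::new(fields)?);
       }
       // Every subsequent batch reuses the converter chosen above, so all
       // Rows it produces remain mutually comparable.
       self.inner.as_mut().unwrap().convert_columns(columns)
   }
   ```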


