alamb commented on code in PR #6254:
URL: https://github.com/apache/arrow-datafusion/pull/6254#discussion_r1186500815
##########
datafusion/optimizer/src/push_down_projection.rs:
##########
@@ -466,7 +466,9 @@ fn get_expr(columns: &HashSet<Column>, schema: &DFSchemaRef) -> Result<Vec<Expr>
}
})
.collect::<Vec<Expr>>();
-    if columns.len() != expr.len() {
+    // `columns` may contain ScalarVariables from a VarProvider, so `expr` may have fewer entries than `columns`
Review Comment:
I wonder if there is some way to strip out these columns earlier (for example, perhaps we could skip pulling out `ScalarVariable` here):
https://github.com/apache/arrow-datafusion/blob/cf233c855f46fad8355342c581ab55f610758b6c/datafusion/expr/src/utils.rs#L273-L275
I tried this locally:
```diff
diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs
index 7babd659e..00e1d0769 100644
--- a/datafusion/expr/src/utils.rs
+++ b/datafusion/expr/src/utils.rs
@@ -270,13 +270,11 @@ pub fn expr_to_columns(expr: &Expr, accum: &mut HashSet<Column>) -> Result<()> {
             Expr::Column(qc) => {
                 accum.insert(qc.clone());
             }
-            Expr::ScalarVariable(_, var_names) => {
-                accum.insert(Column::from_name(var_names.join(".")));
-            }
             // Use explicit pattern match instead of a default
             // implementation, so that in the future if someone adds
             // new Expr types, they will check here as well
-            Expr::Alias(_, _)
+            Expr::ScalarVariable(_, _)
+            | Expr::Alias(_, _)
             | Expr::Literal(_)
             | Expr::BinaryExpr { .. }
             | Expr::Like { .. }
```
and it seems promising.
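If that pans out, I think the relaxed length check in `get_expr` could go back to being strict, since `columns` would then only ever contain real schema columns. Roughly what I have in mind (just a sketch; the helper name and error text are illustrative, not the actual code):
```rust
use std::collections::HashSet;

use datafusion_common::{Column, DataFusionError, Result};
use datafusion_expr::Expr;

/// Illustrative sketch of the tail of `get_expr`: once `ScalarVariable` is no
/// longer collected into `columns`, every required column should resolve to a
/// schema field, so the strict equality check can stay.
fn check_all_columns_resolved(columns: &HashSet<Column>, expr: &[Expr]) -> Result<()> {
    if columns.len() != expr.len() {
        return Err(DataFusionError::Plan(format!(
            "projection push down produced {} expressions for {} required columns",
            expr.len(),
            columns.len()
        )));
    }
    Ok(())
}
```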
##########
datafusion/core/tests/dataframe.rs:
##########
@@ -1230,3 +1232,36 @@ pub async fn register_alltypes_tiny_pages_parquet(ctx: &SessionContext) -> Resul
.await?;
Ok(())
}
+
+#[derive(Debug)]
+struct HardcodedIntProvider {}
+
+impl VarProvider for HardcodedIntProvider {
+    fn get_value(&self, _var_names: Vec<String>) -> Result<ScalarValue, DataFusionError> {
+        Ok(ScalarValue::Int64(Some(1234)))
+    }
+
+    fn get_type(&self, _: &[String]) -> Option<DataType> {
+        Some(DataType::Int64)
+    }
+}
+
+#[tokio::test]
+async fn use_var_provider() -> Result<()> {
Review Comment:
Here is the version I got working:
```rust
// Imports this snippet relies on (assumed; adjust to whatever is already in
// scope at the top of datafusion/core/tests/dataframe.rs):
use std::sync::Arc;

use arrow::datatypes::{DataType, Field, Schema};
use datafusion::datasource::MemTable;
use datafusion::error::{DataFusionError, Result};
use datafusion::prelude::{SessionConfig, SessionContext};
use datafusion::scalar::ScalarValue;
use datafusion::variable::{VarProvider, VarType};

/// A variable provider that resolves every variable to the same Int64 value.
#[derive(Debug)]
struct HardcodedIntProvider {}

impl VarProvider for HardcodedIntProvider {
    fn get_value(&self, _var_names: Vec<String>) -> Result<ScalarValue, DataFusionError> {
        Ok(ScalarValue::Int64(Some(1234)))
    }

    fn get_type(&self, _: &[String]) -> Option<DataType> {
        Some(DataType::Int64)
    }
}

#[tokio::test]
async fn use_var_provider() -> Result<()> {
    let schema = Arc::new(Schema::new(vec![
        Field::new("foo", DataType::Int64, false),
        Field::new("bar", DataType::Int64, false),
    ]));
    let mem_table = Arc::new(MemTable::try_new(schema, vec![])?);

    // Return optimizer rule failures as errors instead of skipping the rule,
    // so a push_down_projection failure fails the test.
    let config = SessionConfig::new()
        .with_target_partitions(4)
        .set_bool("datafusion.optimizer.skip_failed_rules", false);
    let ctx = SessionContext::with_config(config);

    ctx.register_table("csv_table", mem_table)?;
    ctx.register_variable(VarType::UserDefined, Arc::new(HardcodedIntProvider {}));

    let dataframe = ctx
        .sql("SELECT foo FROM csv_table WHERE bar > @var")
        .await?;
    dataframe.collect().await?;
    Ok(())
}
```
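For anyone copying this: the `skip_failed_rules = false` setting is what gives the test teeth, since (if I read the config docs right) it turns optimizer rule failures into hard errors rather than warnings, so the `push_down_projection` failure this PR fixes surfaces as a query error instead of being silently skipped.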