This is an automated email from the ASF dual-hosted git repository.
jonah pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new ac161bba33 fix: CTEs defined in a subquery can escape their scope
(#10954)
ac161bba33 is described below
commit ac161bba336d098eab46f666af4664de7e8cd29f
Author: Jonah Gao <[email protected]>
AuthorDate: Tue Jun 18 13:04:26 2024 +0800
fix: CTEs defined in a subquery can escape their scope (#10954)
* fix: CTEs defined in a subquery can escape their scope
* Add test
---
datafusion/sql/src/cte.rs | 11 +++--------
datafusion/sql/src/expr/subquery.rs | 6 +++---
datafusion/sql/src/planner.rs | 23 ++++++++++++++--------
datafusion/sql/src/query.rs | 18 ++++++-----------
datafusion/sql/src/relation/join.rs | 31 ++++--------------------------
datafusion/sqllogictest/test_files/cte.slt | 26 ++++++++++++++++++++++++-
6 files changed, 56 insertions(+), 59 deletions(-)
diff --git a/datafusion/sql/src/cte.rs b/datafusion/sql/src/cte.rs
index 4f7b9bb6d1..0035dcda6e 100644
--- a/datafusion/sql/src/cte.rs
+++ b/datafusion/sql/src/cte.rs
@@ -66,10 +66,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
cte_query: Query,
planner_context: &mut PlannerContext,
) -> Result<LogicalPlan> {
- // CTE expr don't need extend outer_query_schema,
- // so we clone a new planner_context here.
- let mut cte_planner_context = planner_context.clone();
- self.query_to_plan(cte_query, &mut cte_planner_context)
+ self.query_to_plan(cte_query, planner_context)
}
fn recursive_cte(
@@ -113,8 +110,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
// allow us to infer the schema to be used in the recursive term.
// ---------- Step 1: Compile the static term ------------------
- let static_plan =
- self.set_expr_to_plan(*left_expr, &mut planner_context.clone())?;
+ let static_plan = self.set_expr_to_plan(*left_expr, planner_context)?;
// Since the recursive CTEs include a component that references a
// table with its name, like the example below:
@@ -166,8 +162,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
// this uses the named_relation we inserted above to resolve the
// relation. This ensures that the recursive term uses the named
relation logical plan
// and thus the 'continuance' physical plan as its input and source
- let recursive_plan =
- self.set_expr_to_plan(*right_expr, &mut planner_context.clone())?;
+ let recursive_plan = self.set_expr_to_plan(*right_expr,
planner_context)?;
// Check if the recursive term references the CTE itself,
// if not, it is a non-recursive CTE
diff --git a/datafusion/sql/src/expr/subquery.rs
b/datafusion/sql/src/expr/subquery.rs
index d34065d92f..ff161c6ed6 100644
--- a/datafusion/sql/src/expr/subquery.rs
+++ b/datafusion/sql/src/expr/subquery.rs
@@ -33,7 +33,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
planner_context: &mut PlannerContext,
) -> Result<Expr> {
let old_outer_query_schema =
- planner_context.set_outer_query_schema(Some(input_schema.clone()));
+
planner_context.set_outer_query_schema(Some(input_schema.clone().into()));
let sub_plan = self.query_to_plan(subquery, planner_context)?;
let outer_ref_columns = sub_plan.all_out_ref_exprs();
planner_context.set_outer_query_schema(old_outer_query_schema);
@@ -55,7 +55,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
planner_context: &mut PlannerContext,
) -> Result<Expr> {
let old_outer_query_schema =
- planner_context.set_outer_query_schema(Some(input_schema.clone()));
+
planner_context.set_outer_query_schema(Some(input_schema.clone().into()));
let sub_plan = self.query_to_plan(subquery, planner_context)?;
let outer_ref_columns = sub_plan.all_out_ref_exprs();
planner_context.set_outer_query_schema(old_outer_query_schema);
@@ -77,7 +77,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
planner_context: &mut PlannerContext,
) -> Result<Expr> {
let old_outer_query_schema =
- planner_context.set_outer_query_schema(Some(input_schema.clone()));
+
planner_context.set_outer_query_schema(Some(input_schema.clone().into()));
let sub_plan = self.query_to_plan(subquery, planner_context)?;
let outer_ref_columns = sub_plan.all_out_ref_exprs();
planner_context.set_outer_query_schema(old_outer_query_schema);
diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs
index a92e64597e..30f95170a3 100644
--- a/datafusion/sql/src/planner.rs
+++ b/datafusion/sql/src/planner.rs
@@ -22,7 +22,7 @@ use std::vec;
use arrow_schema::*;
use datafusion_common::{
- field_not_found, internal_err, plan_datafusion_err, SchemaError,
+ field_not_found, internal_err, plan_datafusion_err, DFSchemaRef,
SchemaError,
};
use datafusion_expr::WindowUDF;
use sqlparser::ast::TimezoneInfo;
@@ -139,16 +139,23 @@ impl IdentNormalizer {
/// Common Table Expression (CTE) provided with WITH clause and
/// Parameter Data Types provided with PREPARE statement and the query schema
of the
/// outer query plan
+///
+/// # Cloning
+///
+/// Only the `ctes` are truly cloned when the `PlannerContext` is cloned. This
helps resolve
+/// scoping issues of CTEs. By using cloning, a subquery can inherit CTEs from
the outer query
+/// and can also define its own private CTEs without affecting the outer query.
+///
#[derive(Debug, Clone)]
pub struct PlannerContext {
/// Data types for numbered parameters ($1, $2, etc), if supplied
/// in `PREPARE` statement
- prepare_param_data_types: Vec<DataType>,
+ prepare_param_data_types: Arc<Vec<DataType>>,
/// Map of CTE name to logical plan of the WITH clause.
/// Use `Arc<LogicalPlan>` to allow cheap cloning
ctes: HashMap<String, Arc<LogicalPlan>>,
/// The query schema of the outer query plan, used to resolve the columns
in subquery
- outer_query_schema: Option<DFSchema>,
+ outer_query_schema: Option<DFSchemaRef>,
}
impl Default for PlannerContext {
@@ -161,7 +168,7 @@ impl PlannerContext {
/// Create an empty PlannerContext
pub fn new() -> Self {
Self {
- prepare_param_data_types: vec![],
+ prepare_param_data_types: Arc::new(vec![]),
ctes: HashMap::new(),
outer_query_schema: None,
}
@@ -172,21 +179,21 @@ impl PlannerContext {
mut self,
prepare_param_data_types: Vec<DataType>,
) -> Self {
- self.prepare_param_data_types = prepare_param_data_types;
+ self.prepare_param_data_types = prepare_param_data_types.into();
self
}
// return a reference to the outer queries schema
pub fn outer_query_schema(&self) -> Option<&DFSchema> {
- self.outer_query_schema.as_ref()
+ self.outer_query_schema.as_ref().map(|s| s.as_ref())
}
/// sets the outer query schema, returning the existing one, if
/// any
pub fn set_outer_query_schema(
&mut self,
- mut schema: Option<DFSchema>,
- ) -> Option<DFSchema> {
+ mut schema: Option<DFSchemaRef>,
+ ) -> Option<DFSchemaRef> {
std::mem::swap(&mut self.outer_query_schema, &mut schema);
schema
}
diff --git a/datafusion/sql/src/query.rs b/datafusion/sql/src/query.rs
index fdc739646a..cbbff19321 100644
--- a/datafusion/sql/src/query.rs
+++ b/datafusion/sql/src/query.rs
@@ -29,23 +29,17 @@ use sqlparser::ast::{
};
impl<'a, S: ContextProvider> SqlToRel<'a, S> {
- /// Generate a logical plan from an SQL query
+ /// Generate a logical plan from an SQL query/subquery
pub(crate) fn query_to_plan(
&self,
query: Query,
- planner_context: &mut PlannerContext,
+ outer_planner_context: &mut PlannerContext,
) -> Result<LogicalPlan> {
- self.query_to_plan_with_schema(query, planner_context)
- }
+ // Each query has its own planner context, including CTEs that are
visible within that query.
+ // It also inherits the CTEs from the outer query by cloning the outer
planner context.
+ let mut query_plan_context = outer_planner_context.clone();
+ let planner_context = &mut query_plan_context;
- /// Generate a logic plan from an SQL query.
- /// It's implementation of `subquery_to_plan` and `query_to_plan`.
- /// It shouldn't be invoked directly.
- fn query_to_plan_with_schema(
- &self,
- query: Query,
- planner_context: &mut PlannerContext,
- ) -> Result<LogicalPlan> {
if let Some(with) = query.with {
self.plan_with_clause(with, planner_context)?;
}
diff --git a/datafusion/sql/src/relation/join.rs
b/datafusion/sql/src/relation/join.rs
index 262bae397c..ee2e35b550 100644
--- a/datafusion/sql/src/relation/join.rs
+++ b/datafusion/sql/src/relation/join.rs
@@ -27,34 +27,11 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
t: TableWithJoins,
planner_context: &mut PlannerContext,
) -> Result<LogicalPlan> {
- // From clause may exist CTEs, we should separate them from global
CTEs.
- // CTEs in from clause are allowed to be duplicated.
- // Such as `select * from (WITH source AS (select 1 as e) SELECT *
FROM source) t1, (WITH source AS (select 1 as e) SELECT * FROM source) t2;`
which is valid.
- // So always use original global CTEs to plan CTEs in from clause.
- // Btw, don't need to add CTEs in from to global CTEs.
- let origin_planner_context = planner_context.clone();
- let left = self.create_relation(t.relation, planner_context)?;
- match t.joins.len() {
- 0 => {
- *planner_context = origin_planner_context;
- Ok(left)
- }
- _ => {
- let mut joins = t.joins.into_iter();
- *planner_context = origin_planner_context.clone();
- let mut left = self.parse_relation_join(
- left,
- joins.next().unwrap(), // length of joins > 0
- planner_context,
- )?;
- for join in joins {
- *planner_context = origin_planner_context.clone();
- left = self.parse_relation_join(left, join,
planner_context)?;
- }
- *planner_context = origin_planner_context;
- Ok(left)
- }
+ let mut left = self.create_relation(t.relation, planner_context)?;
+ for join in t.joins.into_iter() {
+ left = self.parse_relation_join(left, join, planner_context)?;
}
+ Ok(left)
}
fn parse_relation_join(
diff --git a/datafusion/sqllogictest/test_files/cte.slt
b/datafusion/sqllogictest/test_files/cte.slt
index d8eaa51fc8..e9fcf07e77 100644
--- a/datafusion/sqllogictest/test_files/cte.slt
+++ b/datafusion/sqllogictest/test_files/cte.slt
@@ -834,4 +834,28 @@ SELECT * FROM non_recursive_cte, recursive_cte;
query I
WITH t AS (SELECT * FROM t where t.a < 2) SELECT * FROM t
----
-1
\ No newline at end of file
+1
+
+# Issue: https://github.com/apache/datafusion/issues/10914
+# The CTE defined within the subquery is only visible inside that subquery.
+query I rowsort
+(WITH t AS (SELECT 400) SELECT * FROM t) UNION (SELECT * FROM t);
+----
+1
+2
+3
+400
+
+query error DataFusion error: Error during planning: table
'datafusion\.public\.cte' not found
+(WITH cte AS (SELECT 400) SELECT * FROM cte) UNION (SELECT * FROM cte);
+
+# Test duplicate CTE names in different subqueries in the FROM clause.
+query III rowsort
+SELECT * FROM
+ (WITH t AS (select 400 as e) SELECT * FROM t) t1,
+ (WITH t AS (select 500 as e) SELECT * FROM t) t2,
+ t
+----
+400 500 1
+400 500 2
+400 500 3
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]