alamb commented on code in PR #7581: URL: https://github.com/apache/arrow-datafusion/pull/7581#discussion_r1446669313
########## datafusion/expr/src/logical_plan/builder.rs: ########## @@ -121,6 +123,39 @@ impl LogicalPlanBuilder { })) } + /// A named temporary relation with a schema. + /// + /// This is used to represent a relation that does not exist at the Review Comment: perhaps it is worth mentioning that this is to implement CTEs? ########## datafusion/physical-plan/src/recursive_query.rs: ########## @@ -0,0 +1,362 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Defines the recursive query plan + +use std::any::Any; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use arrow::datatypes::SchemaRef; +use arrow::record_batch::RecordBatch; +use datafusion_common::{DataFusionError, Result}; +use datafusion_execution::TaskContext; +use datafusion_physical_expr::Partitioning; +use futures::{Stream, StreamExt}; +use tokio::sync::mpsc; + +use super::expressions::PhysicalSortExpr; +use super::metrics::BaselineMetrics; +use super::RecordBatchStream; +use super::{ + metrics::{ExecutionPlanMetricsSet, MetricsSet}, + SendableRecordBatchStream, Statistics, +}; +use arrow::error::ArrowError; +use tokio::sync::mpsc::{Receiver, Sender}; + +use crate::{DisplayAs, DisplayFormatType, ExecutionPlan}; + +/// Recursive query execution plan. +/// +/// This plan has two components: a base part (the static term) and +/// a dynamic part (the recursive term). The execution will start from +/// the base, and as long as the previous iteration produced at least +/// a single new row (taking care of the distinction) the recursive +/// part will be continuously executed. +/// +/// Before each execution of the dynamic part, the rows from the previous +/// iteration will be available in a "working table" (not a real table, +/// can be only accessed using a continuance operation). +/// +/// Note that there won't be any limit or checks applied to detect +/// an infinite recursion, so it is up to the planner to ensure that +/// it won't happen. 
+#[derive(Debug)] +pub struct RecursiveQueryExec { + /// Name of the query handler + name: String, + /// The base part (static term) + static_term: Arc<dyn ExecutionPlan>, + /// The dynamic part (recursive term) + recursive_term: Arc<dyn ExecutionPlan>, + /// Distinction + is_distinct: bool, + /// Execution metrics + metrics: ExecutionPlanMetricsSet, +} + +impl RecursiveQueryExec { + /// Create a new RecursiveQueryExec + pub fn new( + name: String, + static_term: Arc<dyn ExecutionPlan>, + recursive_term: Arc<dyn ExecutionPlan>, + is_distinct: bool, + ) -> Self { + RecursiveQueryExec { + name, + static_term, + recursive_term, + is_distinct, + metrics: ExecutionPlanMetricsSet::new(), + } + } +} + +impl ExecutionPlan for RecursiveQueryExec { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.static_term.schema() + } + + fn children(&self) -> Vec<Arc<dyn ExecutionPlan>> { + vec![self.static_term.clone(), self.recursive_term.clone()] + } + + // Distribution on a recursive query is really tricky to handle. + // For now, we are going to use a single partition but in the + // future we might find a better way to handle this. + fn output_partitioning(&self) -> Partitioning { + Partitioning::UnknownPartitioning(1) + } + + // TODO: control these hints and see whether we can + // infer some from the child plans (static/recurisve terms). 
+ fn maintains_input_order(&self) -> Vec<bool> { + vec![false, false] + } + + fn benefits_from_input_partitioning(&self) -> Vec<bool> { + vec![false, false] + } + + fn required_input_distribution(&self) -> Vec<datafusion_physical_expr::Distribution> { + vec![ + datafusion_physical_expr::Distribution::SinglePartition, + datafusion_physical_expr::Distribution::SinglePartition, + ] + } + + fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { + None + } + + fn with_new_children( + self: Arc<Self>, + children: Vec<Arc<dyn ExecutionPlan>>, + ) -> Result<Arc<dyn ExecutionPlan>> { + Ok(Arc::new(RecursiveQueryExec::new( + self.name.clone(), + children[0].clone(), + children[1].clone(), + self.is_distinct, + ))) + } + + fn execute( + &self, + partition: usize, + context: Arc<TaskContext>, + ) -> Result<SendableRecordBatchStream> { + // TODO: we might be able to handle multiple partitions in the future. + if partition != 0 { + return Err(DataFusionError::Internal(format!( + "RecursiveQueryExec got an invalid partition {} (expected 0)", + partition + ))); + } + + let static_stream = self.static_term.execute(partition, context.clone())?; + let baseline_metrics = BaselineMetrics::new(&self.metrics, partition); + Ok(Box::pin(RecursiveQueryStream::new( + context, + self.name.clone(), + self.recursive_term.clone(), + static_stream, + baseline_metrics, + ))) + } + + fn metrics(&self) -> Option<MetricsSet> { + Some(self.metrics.clone_inner()) + } + + fn statistics(&self) -> Result<Statistics> { + Ok(Statistics::new_unknown(&self.schema())) + } +} + +impl DisplayAs for RecursiveQueryExec { + fn fmt_as( + &self, + t: DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + match t { + DisplayFormatType::Default | DisplayFormatType::Verbose => { + write!(f, "RecursiveQueryExec: is_distinct={}", self.is_distinct) + } + } + } +} + +/// The actual logic of the recursive queries happens during the streaming +/// process. 
A simplified version of the algorithm is the following: +/// +/// buffer = [] +/// +/// while batch := static_stream.next(): +/// buffer.push(batch) +/// yield buffer +/// +/// while buffer.len() > 0: +/// sender, receiver = Channel() +/// register_continuation(handle_name, receiver) +/// sender.send(buffer.drain()) +/// recursive_stream = recursive_term.execute() +/// while batch := recursive_stream.next(): +/// buffer.append(batch) +/// yield buffer +/// +struct RecursiveQueryStream { + /// The context to be used for managing handlers & executing new tasks + task_context: Arc<TaskContext>, + /// Name of the relation handler to be used by the recursive term + name: String, + /// The dynamic part (recursive term) as is (without being executed) + recursive_term: Arc<dyn ExecutionPlan>, + /// The static part (static term) as a stream. If the processing of this + /// part is completed, then it will be None. + static_stream: Option<SendableRecordBatchStream>, + /// The dynamic part (recursive term) as a stream. If the processing of this + /// part has not started yet, or has been completed, then it will be None. + recursive_stream: Option<SendableRecordBatchStream>, + /// The schema of the output. + schema: SchemaRef, + /// In-memory buffer for storing a copy of the current results. Will be + /// cleared after each iteration. + buffer: Vec<RecordBatch>, Review Comment: 👍 this buffers the intermediate relation which makes sense ########## datafusion/expr/src/logical_plan/plan.rs: ########## @@ -112,6 +112,8 @@ pub enum LogicalPlan { /// produces 0 or 1 row. This is used to implement SQL `SELECT` /// that has no values in the `FROM` clause. EmptyRelation(EmptyRelation), + /// A named temporary relation with a schema. + NamedRelation(NamedRelation), Review Comment: One rationale might be to make the implementation simpler -- if we could implement the recursive relation as a table provider, it would likely allow the changes to be more localized / smaller (e.g. 
maybe we could reuse [`MemTable::load`](https://docs.rs/datafusion/latest/datafusion/datasource/memory/struct.MemTable.html) to update the batches on each iteration) ########## datafusion/physical-plan/src/recursive_query.rs: ########## @@ -0,0 +1,362 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Defines the recursive query plan + +use std::any::Any; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use arrow::datatypes::SchemaRef; +use arrow::record_batch::RecordBatch; +use datafusion_common::{DataFusionError, Result}; +use datafusion_execution::TaskContext; +use datafusion_physical_expr::Partitioning; +use futures::{Stream, StreamExt}; +use tokio::sync::mpsc; + +use super::expressions::PhysicalSortExpr; +use super::metrics::BaselineMetrics; +use super::RecordBatchStream; +use super::{ + metrics::{ExecutionPlanMetricsSet, MetricsSet}, + SendableRecordBatchStream, Statistics, +}; +use arrow::error::ArrowError; +use tokio::sync::mpsc::{Receiver, Sender}; + +use crate::{DisplayAs, DisplayFormatType, ExecutionPlan}; + +/// Recursive query execution plan. +/// +/// This plan has two components: a base part (the static term) and +/// a dynamic part (the recursive term). 
The execution will start from +/// the base, and as long as the previous iteration produced at least +/// a single new row (taking care of the distinction) the recursive +/// part will be continuously executed. +/// +/// Before each execution of the dynamic part, the rows from the previous +/// iteration will be available in a "working table" (not a real table, +/// can be only accessed using a continuance operation). +/// +/// Note that there won't be any limit or checks applied to detect +/// an infinite recursion, so it is up to the planner to ensure that +/// it won't happen. +#[derive(Debug)] +pub struct RecursiveQueryExec { + /// Name of the query handler + name: String, + /// The base part (static term) + static_term: Arc<dyn ExecutionPlan>, + /// The dynamic part (recursive term) + recursive_term: Arc<dyn ExecutionPlan>, + /// Distinction + is_distinct: bool, + /// Execution metrics + metrics: ExecutionPlanMetricsSet, +} + +impl RecursiveQueryExec { + /// Create a new RecursiveQueryExec + pub fn new( + name: String, + static_term: Arc<dyn ExecutionPlan>, + recursive_term: Arc<dyn ExecutionPlan>, + is_distinct: bool, + ) -> Self { + RecursiveQueryExec { + name, + static_term, + recursive_term, + is_distinct, + metrics: ExecutionPlanMetricsSet::new(), + } + } +} + +impl ExecutionPlan for RecursiveQueryExec { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.static_term.schema() + } + + fn children(&self) -> Vec<Arc<dyn ExecutionPlan>> { + vec![self.static_term.clone(), self.recursive_term.clone()] + } + + // Distribution on a recursive query is really tricky to handle. + // For now, we are going to use a single partition but in the + // future we might find a better way to handle this. + fn output_partitioning(&self) -> Partitioning { + Partitioning::UnknownPartitioning(1) + } + + // TODO: control these hints and see whether we can + // infer some from the child plans (static/recurisve terms). 
+ fn maintains_input_order(&self) -> Vec<bool> { + vec![false, false] + } + + fn benefits_from_input_partitioning(&self) -> Vec<bool> { + vec![false, false] + } + + fn required_input_distribution(&self) -> Vec<datafusion_physical_expr::Distribution> { + vec![ + datafusion_physical_expr::Distribution::SinglePartition, + datafusion_physical_expr::Distribution::SinglePartition, + ] + } + + fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { + None + } + + fn with_new_children( + self: Arc<Self>, + children: Vec<Arc<dyn ExecutionPlan>>, + ) -> Result<Arc<dyn ExecutionPlan>> { + Ok(Arc::new(RecursiveQueryExec::new( + self.name.clone(), + children[0].clone(), + children[1].clone(), + self.is_distinct, + ))) + } + + fn execute( + &self, + partition: usize, + context: Arc<TaskContext>, + ) -> Result<SendableRecordBatchStream> { + // TODO: we might be able to handle multiple partitions in the future. + if partition != 0 { + return Err(DataFusionError::Internal(format!( + "RecursiveQueryExec got an invalid partition {} (expected 0)", + partition + ))); + } + + let static_stream = self.static_term.execute(partition, context.clone())?; + let baseline_metrics = BaselineMetrics::new(&self.metrics, partition); + Ok(Box::pin(RecursiveQueryStream::new( + context, + self.name.clone(), + self.recursive_term.clone(), + static_stream, + baseline_metrics, + ))) + } + + fn metrics(&self) -> Option<MetricsSet> { + Some(self.metrics.clone_inner()) + } + + fn statistics(&self) -> Result<Statistics> { + Ok(Statistics::new_unknown(&self.schema())) + } +} + +impl DisplayAs for RecursiveQueryExec { + fn fmt_as( + &self, + t: DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + match t { + DisplayFormatType::Default | DisplayFormatType::Verbose => { + write!(f, "RecursiveQueryExec: is_distinct={}", self.is_distinct) + } + } + } +} + +/// The actual logic of the recursive queries happens during the streaming +/// process. 
A simplified version of the algorithm is the following: +/// +/// buffer = [] +/// +/// while batch := static_stream.next(): +/// buffer.push(batch) +/// yield buffer +/// +/// while buffer.len() > 0: +/// sender, receiver = Channel() +/// register_continuation(handle_name, receiver) +/// sender.send(buffer.drain()) +/// recursive_stream = recursive_term.execute() +/// while batch := recursive_stream.next(): +/// buffer.append(batch) +/// yield buffer +/// +struct RecursiveQueryStream { + /// The context to be used for managing handlers & executing new tasks + task_context: Arc<TaskContext>, + /// Name of the relation handler to be used by the recursive term + name: String, + /// The dynamic part (recursive term) as is (without being executed) + recursive_term: Arc<dyn ExecutionPlan>, + /// The static part (static term) as a stream. If the processing of this + /// part is completed, then it will be None. + static_stream: Option<SendableRecordBatchStream>, + /// The dynamic part (recursive term) as a stream. If the processing of this + /// part has not started yet, or has been completed, then it will be None. + recursive_stream: Option<SendableRecordBatchStream>, + /// The schema of the output. + schema: SchemaRef, + /// In-memory buffer for storing a copy of the current results. Will be + /// cleared after each iteration. + buffer: Vec<RecordBatch>, + // /// Metrics. 
+ _baseline_metrics: BaselineMetrics, +} + +impl RecursiveQueryStream { + /// Create a new recursive query stream + fn new( + task_context: Arc<TaskContext>, + name: String, + recursive_term: Arc<dyn ExecutionPlan>, + static_stream: SendableRecordBatchStream, + baseline_metrics: BaselineMetrics, + ) -> Self { + let schema = static_stream.schema(); + Self { + task_context, + name, + recursive_term, + static_stream: Some(static_stream), + recursive_stream: None, + schema, + buffer: vec![], + _baseline_metrics: baseline_metrics, + } + } + + /// Push a clone of the given batch to the in memory buffer, and then return + /// a poll with it. + fn push_batch( + mut self: std::pin::Pin<&mut Self>, + batch: RecordBatch, + ) -> Poll<Option<Result<RecordBatch>>> { + self.buffer.push(batch.clone()); + Poll::Ready(Some(Ok(batch))) + } + + /// Start polling for the next iteration, will be called either after the static term + /// is completed or another term is completed. It will follow the algorithm above on + /// to check whether the recursion has ended. + fn poll_next_iteration( + mut self: std::pin::Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll<Option<Result<RecordBatch>>> { + let total_length = self + .buffer + .iter() + .fold(0, |acc, batch| acc + batch.num_rows()); + + if total_length == 0 { + return Poll::Ready(None); + } + + // The initial capacity of the channels is the same as the number of partitions + // we currently hold in the buffer. + let (sender, receiver): ( + Sender<Result<RecordBatch>>, + Receiver<Result<RecordBatch>>, + ) = mpsc::channel(self.buffer.len() + 1); + + // There shouldn't be any handlers with this name, since the execution of recursive + // term will immediately consume the relation handler. + self.task_context + .push_relation_handler(self.name.clone(), receiver)?; + + // This part heavily assumes that the buffer is not going to change. Maybe we + // should use a mutex? + for batch in self.buffer.drain(..) 
{ + match sender.try_send(Ok(batch.clone())) { + Ok(_) => {} + Err(e) => { + return Poll::Ready(Some(Err(DataFusionError::ArrowError( + ArrowError::from_external_error(Box::new(e)), + None, + )))); + } + } + } + + // We always execute (and re-execute iteratively) the first partition. + // Downstream plans should not expect any partitioning. + let partition = 0; + + self.recursive_stream = Some( Review Comment: Since this calls `execute` again recursively, if we used a `TableProvider` the underlying `TableProvider::execute` would be called again too ########## datafusion/sqllogictest/test_files/cte.slt: ########## @@ -19,3 +19,195 @@ query II select * from (WITH source AS (select 1 as e) SELECT * FROM source) t1, (WITH source AS (select 1 as e) SELECT * FROM source) t2 ---- 1 1 + Review Comment: Could you also add some tests that do: 1. EXPLAIN explain plans of these queries to show how they work 2. Tests that produce more than one batch on each iteration (these tests make batches much smaller than the default 8K rows). Maybe by setting `target_batch_size` to `3` or something. 3. A query with some more stateful execution (e.g. a GROUP BY) ########## datafusion/physical-plan/src/continuance.rs: ########## @@ -0,0 +1,160 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Defines the continuance query plan + +use std::any::Any; +use std::sync::Arc; + +use arrow::datatypes::SchemaRef; +use datafusion_execution::TaskContext; +use datafusion_physical_expr::Partitioning; +use tokio_stream::wrappers::ReceiverStream; + +use crate::stream::RecordBatchStreamAdapter; +use crate::{DisplayAs, DisplayFormatType, ExecutionPlan}; + +use super::expressions::PhysicalSortExpr; + +use super::{ + metrics::{ExecutionPlanMetricsSet, MetricsSet}, + SendableRecordBatchStream, Statistics, +}; +use datafusion_common::{DataFusionError, Result}; + +/// A temporary "working table" operation where the input data will be +/// taken from the named handle during the execution and will be re-published +/// as is (kind of like a mirror). +/// +/// Most notably used in the implementation of recursive queries where the +/// underlying relation does not exist yet but the data will come as the previous +/// term is evaluated. This table will be used such that the recursive plan +/// will register a receiver in the task context and this plan will use that +/// receiver to get the data and stream it back up so that the batches are available +/// in the next iteration. +#[derive(Debug)] +pub struct ContinuanceExec { + /// Name of the relation handler + name: String, + /// The schema of the stream + schema: SchemaRef, + /// Execution metrics + metrics: ExecutionPlanMetricsSet, +} + +impl ContinuanceExec { + /// Create a new execution plan for a continuance stream. The given relation + /// handler must exist in the task context before calling [`ContinuanceExec::execute`] on this + /// plan. 
+ pub fn new(name: String, schema: SchemaRef) -> Self { + Self { + name, + schema, + metrics: ExecutionPlanMetricsSet::new(), + } + } +} + +impl DisplayAs for ContinuanceExec { + fn fmt_as( + &self, + t: DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + match t { + DisplayFormatType::Default | DisplayFormatType::Verbose => { + write!(f, "ContinuanceExec: name={}", self.name) + } + } + } +} + +impl ExecutionPlan for ContinuanceExec { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.schema.clone() + } + + fn children(&self) -> Vec<Arc<dyn ExecutionPlan>> { + vec![] + } + + fn output_partitioning(&self) -> Partitioning { + Partitioning::UnknownPartitioning(1) + } + + fn maintains_input_order(&self) -> Vec<bool> { + vec![false] + } + + fn benefits_from_input_partitioning(&self) -> Vec<bool> { + vec![false] + } + + fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { + None + } + + fn with_new_children( + self: Arc<Self>, + _: Vec<Arc<dyn ExecutionPlan>>, + ) -> Result<Arc<dyn ExecutionPlan>> { + Ok(Arc::new(ContinuanceExec::new( + self.name.clone(), + self.schema.clone(), + ))) + } + + /// This plan does not come with any special streams, but rather we use + /// the existing [`RecordBatchStreamAdapter`] to receive the data from + /// the registered handle. + fn execute( + &self, + partition: usize, + context: Arc<TaskContext>, + ) -> Result<SendableRecordBatchStream> { + // Continuance streams must be the plan base. + if partition != 0 { Review Comment: using a table provider / MemTable could also allow this to provide multiple partitions and thus improve parallelism ########## datafusion/expr/src/logical_plan/plan.rs: ########## @@ -112,6 +112,8 @@ pub enum LogicalPlan { /// produces 0 or 1 row. This is used to implement SQL `SELECT` /// that has no values in the `FROM` clause. EmptyRelation(EmptyRelation), + /// A named temporary relation with a schema. 
+ NamedRelation(NamedRelation), Review Comment: Basically I understand the need to have `LogicalPlan::RecursiveQuery` but I don't (yet) understand the need to have the `NamedRelation` ########## datafusion/physical-plan/src/recursive_query.rs: ########## @@ -0,0 +1,362 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Defines the recursive query plan + +use std::any::Any; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use arrow::datatypes::SchemaRef; +use arrow::record_batch::RecordBatch; +use datafusion_common::{DataFusionError, Result}; +use datafusion_execution::TaskContext; +use datafusion_physical_expr::Partitioning; +use futures::{Stream, StreamExt}; +use tokio::sync::mpsc; + +use super::expressions::PhysicalSortExpr; +use super::metrics::BaselineMetrics; +use super::RecordBatchStream; +use super::{ + metrics::{ExecutionPlanMetricsSet, MetricsSet}, + SendableRecordBatchStream, Statistics, +}; +use arrow::error::ArrowError; +use tokio::sync::mpsc::{Receiver, Sender}; + +use crate::{DisplayAs, DisplayFormatType, ExecutionPlan}; + +/// Recursive query execution plan. +/// +/// This plan has two components: a base part (the static term) and +/// a dynamic part (the recursive term). 
The execution will start from +/// the base, and as long as the previous iteration produced at least +/// a single new row (taking care of the distinction) the recursive +/// part will be continuously executed. +/// +/// Before each execution of the dynamic part, the rows from the previous +/// iteration will be available in a "working table" (not a real table, +/// can be only accessed using a continuance operation). +/// +/// Note that there won't be any limit or checks applied to detect +/// an infinite recursion, so it is up to the planner to ensure that +/// it won't happen. +#[derive(Debug)] +pub struct RecursiveQueryExec { + /// Name of the query handler + name: String, + /// The base part (static term) + static_term: Arc<dyn ExecutionPlan>, + /// The dynamic part (recursive term) + recursive_term: Arc<dyn ExecutionPlan>, + /// Distinction + is_distinct: bool, + /// Execution metrics + metrics: ExecutionPlanMetricsSet, +} + +impl RecursiveQueryExec { + /// Create a new RecursiveQueryExec + pub fn new( + name: String, + static_term: Arc<dyn ExecutionPlan>, + recursive_term: Arc<dyn ExecutionPlan>, + is_distinct: bool, + ) -> Self { + RecursiveQueryExec { + name, + static_term, + recursive_term, + is_distinct, + metrics: ExecutionPlanMetricsSet::new(), + } + } +} + +impl ExecutionPlan for RecursiveQueryExec { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.static_term.schema() + } + + fn children(&self) -> Vec<Arc<dyn ExecutionPlan>> { + vec![self.static_term.clone(), self.recursive_term.clone()] + } + + // Distribution on a recursive query is really tricky to handle. + // For now, we are going to use a single partition but in the + // future we might find a better way to handle this. + fn output_partitioning(&self) -> Partitioning { + Partitioning::UnknownPartitioning(1) + } + + // TODO: control these hints and see whether we can + // infer some from the child plans (static/recurisve terms). 
+ fn maintains_input_order(&self) -> Vec<bool> { + vec![false, false] + } + + fn benefits_from_input_partitioning(&self) -> Vec<bool> { + vec![false, false] + } + + fn required_input_distribution(&self) -> Vec<datafusion_physical_expr::Distribution> { + vec![ + datafusion_physical_expr::Distribution::SinglePartition, + datafusion_physical_expr::Distribution::SinglePartition, + ] + } + + fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { + None + } + + fn with_new_children( + self: Arc<Self>, + children: Vec<Arc<dyn ExecutionPlan>>, + ) -> Result<Arc<dyn ExecutionPlan>> { + Ok(Arc::new(RecursiveQueryExec::new( + self.name.clone(), + children[0].clone(), + children[1].clone(), + self.is_distinct, + ))) + } + + fn execute( + &self, + partition: usize, + context: Arc<TaskContext>, + ) -> Result<SendableRecordBatchStream> { + // TODO: we might be able to handle multiple partitions in the future. + if partition != 0 { + return Err(DataFusionError::Internal(format!( + "RecursiveQueryExec got an invalid partition {} (expected 0)", + partition + ))); + } + + let static_stream = self.static_term.execute(partition, context.clone())?; + let baseline_metrics = BaselineMetrics::new(&self.metrics, partition); + Ok(Box::pin(RecursiveQueryStream::new( + context, + self.name.clone(), + self.recursive_term.clone(), + static_stream, + baseline_metrics, + ))) + } + + fn metrics(&self) -> Option<MetricsSet> { + Some(self.metrics.clone_inner()) + } + + fn statistics(&self) -> Result<Statistics> { + Ok(Statistics::new_unknown(&self.schema())) + } +} + +impl DisplayAs for RecursiveQueryExec { + fn fmt_as( + &self, + t: DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + match t { + DisplayFormatType::Default | DisplayFormatType::Verbose => { + write!(f, "RecursiveQueryExec: is_distinct={}", self.is_distinct) + } + } + } +} + +/// The actual logic of the recursive queries happens during the streaming +/// process. 
A simplified version of the algorithm is the following: +/// +/// buffer = [] +/// +/// while batch := static_stream.next(): +/// buffer.push(batch) +/// yield buffer +/// +/// while buffer.len() > 0: +/// sender, receiver = Channel() +/// register_continuation(handle_name, receiver) +/// sender.send(buffer.drain()) +/// recursive_stream = recursive_term.execute() +/// while batch := recursive_stream.next(): +/// buffer.append(batch) +/// yield buffer +/// +struct RecursiveQueryStream { + /// The context to be used for managing handlers & executing new tasks + task_context: Arc<TaskContext>, + /// Name of the relation handler to be used by the recursive term + name: String, + /// The dynamic part (recursive term) as is (without being executed) + recursive_term: Arc<dyn ExecutionPlan>, + /// The static part (static term) as a stream. If the processing of this + /// part is completed, then it will be None. + static_stream: Option<SendableRecordBatchStream>, + /// The dynamic part (recursive term) as a stream. If the processing of this + /// part has not started yet, or has been completed, then it will be None. + recursive_stream: Option<SendableRecordBatchStream>, + /// The schema of the output. + schema: SchemaRef, + /// In-memory buffer for storing a copy of the current results. Will be + /// cleared after each iteration. + buffer: Vec<RecordBatch>, + // /// Metrics. 
+ _baseline_metrics: BaselineMetrics, +} + +impl RecursiveQueryStream { + /// Create a new recursive query stream + fn new( + task_context: Arc<TaskContext>, + name: String, + recursive_term: Arc<dyn ExecutionPlan>, + static_stream: SendableRecordBatchStream, + baseline_metrics: BaselineMetrics, + ) -> Self { + let schema = static_stream.schema(); + Self { + task_context, + name, + recursive_term, + static_stream: Some(static_stream), + recursive_stream: None, + schema, + buffer: vec![], + _baseline_metrics: baseline_metrics, + } + } + + /// Push a clone of the given batch to the in memory buffer, and then return + /// a poll with it. + fn push_batch( + mut self: std::pin::Pin<&mut Self>, + batch: RecordBatch, + ) -> Poll<Option<Result<RecordBatch>>> { + self.buffer.push(batch.clone()); + Poll::Ready(Some(Ok(batch))) + } + + /// Start polling for the next iteration, will be called either after the static term + /// is completed or another term is completed. It will follow the algorithm above on + /// to check whether the recursion has ended. + fn poll_next_iteration( + mut self: std::pin::Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll<Option<Result<RecordBatch>>> { + let total_length = self + .buffer + .iter() + .fold(0, |acc, batch| acc + batch.num_rows()); + + if total_length == 0 { + return Poll::Ready(None); + } + + // The initial capacity of the channels is the same as the number of partitions + // we currently hold in the buffer. + let (sender, receiver): ( + Sender<Result<RecordBatch>>, + Receiver<Result<RecordBatch>>, + ) = mpsc::channel(self.buffer.len() + 1); + + // There shouldn't be any handlers with this name, since the execution of recursive + // term will immediately consume the relation handler. + self.task_context + .push_relation_handler(self.name.clone(), receiver)?; + + // This part heavily assumes that the buffer is not going to change. Maybe we + // should use a mutex? + for batch in self.buffer.drain(..) 
{ + match sender.try_send(Ok(batch.clone())) { + Ok(_) => {} + Err(e) => { + return Poll::Ready(Some(Err(DataFusionError::ArrowError( + ArrowError::from_external_error(Box::new(e)), + None, + )))); + } + } + } + + // We always execute (and re-execute iteratively) the first partition. + // Downstream plans should not expect any partitioning. + let partition = 0; + + self.recursive_stream = Some( + self.recursive_term + .execute(partition, self.task_context.clone())?, + ); + self.poll_next(cx) + } +} + +impl Stream for RecursiveQueryStream { + type Item = Result<RecordBatch>; + + fn poll_next( + mut self: std::pin::Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll<Option<Self::Item>> { + // TODO: we should use this poll to record some metrics! + if let Some(static_stream) = &mut self.static_stream { + // While the static term's stream is available, we'll be forwarding the batches from it (also + // saving them for the initial iteration of the recursive term). + let poll = static_stream.poll_next_unpin(cx); + match &poll { + Poll::Ready(None) => { + // Once this is done, we can start running the setup for the recursive term. + self.static_stream = None; + self.poll_next_iteration(cx) + } + Poll::Ready(Some(Ok(batch))) => self.push_batch(batch.clone()), + _ => poll, + } + } else if let Some(recursive_stream) = &mut self.recursive_stream { + let poll = recursive_stream.poll_next_unpin(cx); Review Comment: FYI you might be able to use the `ready!` macro to simplify some of this matching (it returns `Poll::Pending` if the poll is pending) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org