geoffreyclaude commented on code in PR #17843: URL: https://github.com/apache/datafusion/pull/17843#discussion_r2593073788
########## datafusion-examples/examples/relation_planner/table_sample.rs: ########## @@ -0,0 +1,969 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! This example demonstrates using custom relation planners to implement +//! SQL TABLESAMPLE clause support. +//! +//! TABLESAMPLE allows sampling a fraction or number of rows from a table: +//! - `SELECT * FROM table TABLESAMPLE BERNOULLI(10)` - 10% sample +//! - `SELECT * FROM table TABLESAMPLE (100 ROWS)` - 100 rows +//! - `SELECT * FROM table TABLESAMPLE (10 PERCENT) REPEATABLE(42)` - Reproducible + +use std::{ + any::Any, + fmt::{self, Debug, Formatter}, + hash::{Hash, Hasher}, + ops::{Add, Div, Mul, Sub}, + pin::Pin, + str::FromStr, + sync::Arc, + task::{Context, Poll}, +}; + +use arrow::{ + array::{ArrayRef, Int32Array, RecordBatch, StringArray, UInt32Array}, + compute, +}; +use arrow_schema::SchemaRef; +use futures::{ + ready, + stream::{Stream, StreamExt}, +}; +use rand::{rngs::StdRng, Rng, SeedableRng}; +use rand_distr::{Distribution, Poisson}; +use tonic::async_trait; + +use datafusion::{ + execution::{ + context::QueryPlanner, RecordBatchStream, SendableRecordBatchStream, + SessionState, SessionStateBuilder, TaskContext, + }, + physical_expr::EquivalenceProperties, + physical_plan::{ + metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet, RecordOutput}, + DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, + }, + physical_planner::{DefaultPhysicalPlanner, ExtensionPlanner, PhysicalPlanner}, + prelude::*, +}; +use datafusion_common::{ + internal_err, not_impl_err, plan_datafusion_err, plan_err, DFSchemaRef, + DataFusionError, Result, Statistics, +}; +use datafusion_expr::{ + logical_plan::{Extension, LogicalPlan, LogicalPlanBuilder}, + planner::{ + PlannedRelation, RelationPlanner, RelationPlannerContext, RelationPlanning, + }, + UserDefinedLogicalNode, UserDefinedLogicalNodeCore, +}; +use datafusion_sql::sqlparser::ast::{ + self, TableFactor, TableSampleMethod, TableSampleUnit, +}; + +/// This example demonstrates using custom relation planners to implement +/// SQL TABLESAMPLE clause support. +pub async fn table_sample() -> Result<()> { + let state = SessionStateBuilder::new() + .with_default_features() + .with_query_planner(Arc::new(TableSampleQueryPlanner {})) + .build(); + + let ctx = SessionContext::new_with_state(state.clone()); + + // Register sample data table + register_sample_data(&ctx)?; + + // Register custom planner + ctx.register_relation_planner(Arc::new(TableSamplePlanner))?; + + println!("Custom Relation Planner: TABLESAMPLE Support"); + println!("============================================\n"); + println!("Note: This shows logical planning for TABLESAMPLE."); + println!("Physical execution requires additional implementation.\n"); + + // Example 1: Full table without any sampling (baseline) + // Shows: Complete dataset with all 10 rows (1-10 with row_1 to row_10) + // Expected: 10 rows showing the full sample_data table + // Actual: + // +---------+---------+ Review Comment: Updated with `insta::assert_snapshot`, because I'm lazy and want auto-complete 😄 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
