aokolnychyi commented on a change in pull request #32921:
URL: https://github.com/apache/spark/pull/32921#discussion_r662433500
##########
File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/BatchScanExec.scala
##########
@@ -17,38 +17,96 @@ package org.apache.spark.sql.execution.datasources.v2

+import com.google.common.base.Objects
+
+import org.apache.spark.SparkException
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.QueryPlan
-import org.apache.spark.sql.connector.read.{InputPartition, PartitionReaderFactory, Scan}
+import org.apache.spark.sql.catalyst.plans.physical.SinglePartition
+import org.apache.spark.sql.catalyst.util.truncatedString
+import org.apache.spark.sql.connector.read.{InputPartition, PartitionReaderFactory, Scan, SupportsRuntimeFiltering}
+import org.apache.spark.sql.execution.datasources.DataSourceStrategy

 /**
  * Physical plan node for scanning a batch of data from a data source v2.
  */
 case class BatchScanExec(
     output: Seq[AttributeReference],
-    @transient scan: Scan) extends DataSourceV2ScanExecBase {
+    @transient scan: Scan,
+    runtimeFilters: Seq[Expression]) extends DataSourceV2ScanExecBase {

   @transient lazy val batch = scan.toBatch

   // TODO: unify the equal/hashCode implementation for all data source v2 query plans.
   override def equals(other: Any): Boolean = other match {
-    case other: BatchScanExec => this.batch == other.batch
-    case _ => false
+    case other: BatchScanExec =>
+      this.batch == other.batch && this.runtimeFilters == other.runtimeFilters
+    case _ =>
+      false
   }

-  override def hashCode(): Int = batch.hashCode()
+  override def hashCode(): Int = Objects.hashCode(batch, runtimeFilters)

   @transient override lazy val partitions: Seq[InputPartition] = batch.planInputPartitions()

+  @transient private lazy val filteredPartitions: Seq[InputPartition] = {
+    val dataSourceFilters = runtimeFilters.flatMap {
+      case DynamicPruningExpression(e) => DataSourceStrategy.translateRuntimeFilter(e)
+      case _ => None
+    }
+
+    if (dataSourceFilters.nonEmpty) {
+      val originalPartitioning = outputPartitioning
+
+      // the cast is safe as runtime filters are only assigned if the scan can be filtered
+      val filterableScan = scan.asInstanceOf[SupportsRuntimeFiltering]
+      filterableScan.filter(dataSourceFilters.toArray)
+
+      // call toBatch again to get filtered partitions
+      val newPartitions = scan.toBatch.planInputPartitions()
+
+      originalPartitioning match {
+        case p: DataSourcePartitioning if p.numPartitions != newPartitions.size =>
+          throw new SparkException(
+            "Data source must have preserved the original partitioning during runtime filtering; " +
+            s"reported num partitions: ${p.numPartitions}, " +
+            s"num partitions after runtime filtering: ${newPartitions.size}")
+        case _ =>
+          // no validation is needed as the data source did not report any specific partitioning
+      }
+
+      newPartitions
+    } else {
+      partitions
+    }
+  }
+
   override lazy val readerFactory: PartitionReaderFactory = batch.createReaderFactory()

   override lazy val inputRDD: RDD[InternalRow] = {
-    new DataSourceRDD(sparkContext, partitions, readerFactory, supportsColumnar, customMetrics)
+    if (filteredPartitions.isEmpty && outputPartitioning == SinglePartition) {

Review comment:
   We check that the partition counts before and after filtering match only if the source reported a specific partitioning through `SupportsReportPartitioning`; only in that case do we have `DataSourcePartitioning`. The situation flagged here, on the other hand, can happen when Spark inferred `SinglePartition` but the source did not report anything.
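   To make that distinction concrete, here is a minimal, self-contained sketch of the invariant (all names below, such as `ReportedPartitioning`, `InputSplit`, and `numRddPartitions`, are illustrative stand-ins rather than Spark classes): a partition count reported through `SupportsReportPartitioning` must survive runtime filtering unchanged, while an inferred `SinglePartition` merely requires the final RDD to expose exactly one, possibly empty, partition.

   ```scala
   object SinglePartitionInvariant {

     sealed trait Partitioning
     // Inferred by Spark; the source itself reported nothing.
     case object SinglePartition extends Partitioning
     // Stand-in for DataSourcePartitioning reported via SupportsReportPartitioning.
     final case class ReportedPartitioning(numPartitions: Int) extends Partitioning

     // Stand-in for an InputPartition returned by Batch.planInputPartitions().
     final case class InputSplit(id: Int)

     /** How many RDD partitions the scan must expose after runtime filtering. */
     def numRddPartitions(partitioning: Partitioning, filtered: Seq[InputSplit]): Int =
       partitioning match {
         case ReportedPartitioning(n) if n != filtered.size =>
           // The source promised a layout and then broke it: fail, as the PR does.
           throw new IllegalStateException(
             s"source reported $n partitions but produced ${filtered.size} after filtering")
         case SinglePartition if filtered.isEmpty =>
           // SinglePartition was inferred, so pruning every split is legal, but the
           // RDD must still expose exactly one (empty) partition.
           1
         case _ =>
           filtered.size
       }

     def main(args: Array[String]): Unit = {
       // Runtime filtering pruned every split, yet SinglePartition still holds.
       println(numRddPartitions(SinglePartition, Seq.empty)) // 1
       // A source that reported 3 partitions must still produce 3 after filtering.
       println(numRddPartitions(ReportedPartitioning(3),
         Seq(InputSplit(0), InputSplit(1), InputSplit(2)))) // 3
     }
   }
   ```

   This also suggests why the truncated `inputRDD` branch above special-cases an empty `filteredPartitions` under `SinglePartition`: the scan presumably has to substitute a single empty RDD partition rather than return zero partitions.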