rdblue commented on a change in pull request #25955: [SPARK-29277][SQL] Add early DSv2 filter and projection pushdown
URL: https://github.com/apache/spark/pull/25955#discussion_r333747933
##########
File path: sql/catalyst/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala
##########

@@ -55,7 +56,52 @@ case class DataSourceV2Relation(
   }
 
   override def computeStats(): Statistics = {
-    val scan = newScanBuilder().build()
+    if (Utils.isTesting) {
+      // when testing, throw an exception if this computeStats method is called because stats should
+      // not be accessed before pushing the projection and filters to create a scan. otherwise, the
+      // stats are not accurate because they are based on a full table scan of all columns.
+      throw new UnsupportedOperationException(
+        s"BUG: computeStats called before pushdown on DSv2 relation: $name")
+    } else {
+      // when not testing, return stats because bad stats are better than failing a query
+      newScanBuilder() match {
+        case r: SupportsReportStatistics =>
+          val statistics = r.estimateStatistics()
+          DataSourceV2Relation.transformV2Stats(statistics, None, conf.defaultSizeInBytes)
+        case _ =>
+          Statistics(sizeInBytes = conf.defaultSizeInBytes)
+      }
+    }
+  }
+
+  override def newInstance(): DataSourceV2Relation = {
+    copy(output = output.map(_.newInstance()))
+  }
+}
+
+/**
+ * A logical plan for a DSv2 table with a scan already created.
+ *
+ * This is used in the optimizer to push filters and projection down before conversion to physical
+ * plan. This ensures that the stats that are used by the optimizer account for the filters and
+ * projection that will be pushed down.
+ *
+ * @param table a DSv2 [[Table]]
+ * @param scan a DSv2 [[Scan]]
+ * @param output the output attributes of this relation
+ */
+case class DataSourceV2ScanRelation(

Review comment:
   This should be a separate node to avoid correctness problems. Otherwise, it is easy to accidentally write rules that match both `DataSourceV2Relation` and `DataSourceV2ScanRelation` but do not handle the case where operators have already been pushed.

   When a filter is pushed down, it is also removed from the filters on top of the scan. If push-down happens a second time because a rule matches the same node, it is easy for that rule to ignore the originally pushed filter and create a second, independent scan. That is a correctness bug that is easy to introduce by accident.

   Using a separate relation type requires rules to choose whether to support a relation after push-down, or just a relation before push-down. The trade-off is that some places need to match both, but those cases are few and worth the trade.
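   To make the failure mode concrete, here is a minimal sketch of an optimizer rule that is explicit about which push-down phase it handles. This is not code from this PR: the rule name and the `pushFiltersAndProject` helper are hypothetical, and only the two relation types come from the change above.

```scala
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, DataSourceV2ScanRelation}

// Hypothetical rule: the separate node types force an explicit choice
// between the pre-pushdown and post-pushdown cases.
object ExampleEarlyPushDown extends Rule[LogicalPlan] {
  override def apply(plan: LogicalPlan): LogicalPlan = plan.transform {
    // Pre-pushdown node: the relation can still hand out a fresh ScanBuilder,
    // so it is safe to push the filter and replace the node with a
    // DataSourceV2ScanRelation carrying the configured Scan.
    case filter @ Filter(_, _: DataSourceV2Relation) =>
      pushFiltersAndProject(filter)

    // Post-pushdown node: the Scan is fixed and pushed filters are already
    // gone from the plan. Matching this type is a deliberate opt-in; this
    // rule leaves it alone, so it cannot push the same filter twice or
    // build a second, independent scan.
    case done @ Filter(_, _: DataSourceV2ScanRelation) =>
      done
  }

  // Hypothetical stand-in for real push-down logic, which would build the
  // scan and keep only the filters that could not be pushed on top of it.
  private def pushFiltersAndProject(filter: Filter): LogicalPlan = filter
}
```

   Because `DataSourceV2ScanRelation` never matches the `DataSourceV2Relation` case, a rule written only against the pre-pushdown node cannot accidentally re-run push-down on a scan that is already configured.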