Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/20387#discussion_r166526321 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala --- @@ -17,17 +17,139 @@ package org.apache.spark.sql.execution.datasources.v2 +import java.util.UUID + +import scala.collection.JavaConverters._ +import scala.collection.mutable + +import org.apache.spark.sql.{AnalysisException, SaveMode} import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation -import org.apache.spark.sql.catalyst.expressions.AttributeReference -import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, Statistics} -import org.apache.spark.sql.sources.v2.reader._ +import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression} +import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics} +import org.apache.spark.sql.execution.datasources.DataSourceStrategy +import org.apache.spark.sql.sources.{DataSourceRegister, Filter} +import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, ReadSupport, ReadSupportWithSchema, WriteSupport} +import org.apache.spark.sql.sources.v2.reader.{DataSourceReader, SupportsPushDownCatalystFilters, SupportsPushDownFilters, SupportsPushDownRequiredColumns, SupportsReportStatistics} +import org.apache.spark.sql.sources.v2.writer.DataSourceWriter +import org.apache.spark.sql.types.StructType case class DataSourceV2Relation( - fullOutput: Seq[AttributeReference], - reader: DataSourceReader) - extends LeafNode with MultiInstanceRelation with DataSourceReaderHolder { + source: DataSourceV2, + options: Map[String, String], + path: Option[String] = None, --- End diff -- Sorry I might not make it clear. What I proposed is to only include changes that are necessary for this PR. To make the plan immutable, what we need is: 1. keep required columns and pushed filters in the plan. So here we add 2 parameters: `projection` and `filters`. 2. keep things that are needed to create a `DataSourceReader`, i.e. `DataSourceV2`, `DataSourceOption` and `userSpecifiedSchema`. For 1 we are good. For 2, I think the most intuitive and surgical way is to create the `DataSourceOption` in `DataFrameReader`(the current behavior) and keep the `DataSourceOption` in `DataSourceV2Relation`. There may be an argument about putting these common information in the `simpleString`, I'm +1 on doing this, but this should be done after https://issues.apache.org/jira/browse/SPARK-23341 when the community have an agreement on an initial list of common information(may be more than just table and path). I really like your proposal about "donât mix partial, unrelated changes into a commit", and I hope all the open source community can follow it and make high quality PRs. Thanks!
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org