This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new f55c760  [SPARK-27034][SQL][FOLLOWUP] Rename ParquetSchemaPruning to SchemaPruning
f55c760 is described below

commit f55c760df651e82c5f72038895b5989ab16e22b2
Author: Liang-Chi Hsieh <vii...@gmail.com>
AuthorDate: Wed Mar 13 20:12:01 2019 +0900

    [SPARK-27034][SQL][FOLLOWUP] Rename ParquetSchemaPruning to SchemaPruning

    ## What changes were proposed in this pull request?

    This is a followup to #23943. This proposes to rename ParquetSchemaPruning to SchemaPruning as ParquetSchemaPruning supports both Parquet and ORC v1 now.

    ## How was this patch tested?

    Existing tests.

    Closes #24077 from viirya/nested-schema-pruning-orc-followup.

    Authored-by: Liang-Chi Hsieh <vii...@gmail.com>
    Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
---
 .../apache/spark/sql/execution/SparkOptimizer.scala  |  4 ++--
 ...arquetSchemaPruning.scala => SchemaPruning.scala} | 20 ++++++++++----------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala
index 6c6d344..31540e8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala
@@ -21,7 +21,7 @@ import org.apache.spark.sql.ExperimentalMethods
 import org.apache.spark.sql.catalyst.catalog.SessionCatalog
 import org.apache.spark.sql.catalyst.optimizer.Optimizer
 import org.apache.spark.sql.execution.datasources.PruneFileSourcePartitions
-import org.apache.spark.sql.execution.datasources.parquet.ParquetSchemaPruning
+import org.apache.spark.sql.execution.datasources.SchemaPruning
 import org.apache.spark.sql.execution.python.{ExtractPythonUDFFromAggregate, ExtractPythonUDFs}
 
 class SparkOptimizer(
@@ -34,7 +34,7 @@ class SparkOptimizer(
     Batch("Extract Python UDFs", Once,
       Seq(ExtractPythonUDFFromAggregate, ExtractPythonUDFs): _*) :+
     Batch("Prune File Source Table Partitions", Once, PruneFileSourcePartitions) :+
-    Batch("Parquet Schema Pruning", Once, ParquetSchemaPruning)) ++
+    Batch("Schema Pruning", Once, SchemaPruning)) ++
     postHocOptimizationBatches :+
     Batch("User Provided Optimizers", fixedPoint, experimentalMethods.extraOptimizations: _*)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruning.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SchemaPruning.scala
similarity index 90%
rename from sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruning.scala
rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SchemaPruning.scala
index 47551a5..3a37ca7 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruning.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SchemaPruning.scala
@@ -15,24 +15,24 @@
  * limitations under the License.
  */
-package org.apache.spark.sql.execution.datasources.parquet
+package org.apache.spark.sql.execution.datasources
 
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.planning.PhysicalOperation
 import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project}
 import org.apache.spark.sql.catalyst.rules.Rule
-import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation}
 import org.apache.spark.sql.execution.datasources.orc.OrcFileFormat
+import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types.{ArrayType, DataType, MapType, StructField, StructType}
 
 /**
- * Prunes unnecessary Parquet columns given a [[PhysicalOperation]] over a
- * [[ParquetRelation]]. By "Parquet column", we mean a column as defined in the
- * Parquet format. In Spark SQL, a root-level Parquet column corresponds to a
- * SQL column, and a nested Parquet column corresponds to a [[StructField]].
+ * Prunes unnecessary physical columns given a [[PhysicalOperation]] over a data source relation.
+ * By "physical column", we mean a column as defined in the data source format like Parquet format
+ * or ORC format. For example, in Spark SQL, a root-level Parquet column corresponds to a SQL
+ * column, and a nested Parquet column corresponds to a [[StructField]].
  */
-object ParquetSchemaPruning extends Rule[LogicalPlan] {
+object SchemaPruning extends Rule[LogicalPlan] {
   import org.apache.spark.sql.catalyst.expressions.SchemaPruning._
 
   override def apply(plan: LogicalPlan): LogicalPlan =
@@ -62,10 +62,10 @@ object ParquetSchemaPruning extends Rule[LogicalPlan] {
       // each schemata, assuming the fields in prunedDataSchema are a subset of the fields
       // in dataSchema.
       if (countLeaves(dataSchema) > countLeaves(prunedDataSchema)) {
-        val prunedParquetRelation =
+        val prunedHadoopRelation =
           hadoopFsRelation.copy(dataSchema = prunedDataSchema)(hadoopFsRelation.sparkSession)
 
-        val prunedRelation = buildPrunedRelation(l, prunedParquetRelation)
+        val prunedRelation = buildPrunedRelation(l, prunedHadoopRelation)
         val projectionOverSchema = ProjectionOverSchema(prunedDataSchema)
 
         buildNewProjection(normalizedProjects, normalizedFilters, prunedRelation,
@@ -79,7 +79,7 @@ object ParquetSchemaPruning extends Rule[LogicalPlan] {
   }
 
   /**
-   * Checks to see if the given relation is Parquet and can be pruned.
+   * Checks to see if the given relation can be pruned. Currently we support Parquet and ORC v1.
    */
   private def canPruneRelation(fsRelation: HadoopFsRelation) =
     fsRelation.fileFormat.isInstanceOf[ParquetFileFormat] ||
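
For context, here is a minimal, hypothetical sketch (not part of this commit) of the renamed rule's user-visible effect: with nested schema pruning enabled, selecting a single nested field lets the file scan read only that leaf column. The SQLConf flag name is real, but the object name, path, and data below are made up for illustration; an ORC v1 source behaves the same way.

    import org.apache.spark.sql.SparkSession

    object SchemaPruningSketch {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder()
          .appName("schema-pruning-sketch")
          .master("local[*]")
          // Nested-column pruning sits behind this flag (off by default at the time of this commit).
          .config("spark.sql.optimizer.nestedSchemaPruning.enabled", "true")
          .getOrCreate()
        import spark.implicits._

        // Hypothetical data: one struct column with two nested fields, plus a top-level column.
        val people = Seq(("Jane", "Doe", 30), ("John", "Smith", 25))
          .toDF("first", "last", "age")
          .selectExpr("named_struct('first', first, 'last', last) AS name", "age")
        people.write.mode("overwrite").parquet("/tmp/people")

        // Selecting only name.first lets SchemaPruning shrink the relation's dataSchema,
        // so the file scan reads just that leaf column instead of the whole struct.
        val pruned = spark.read.parquet("/tmp/people").select("name.first")
        pruned.explain()

        spark.stop()
      }
    }

With the flag enabled, the ReadSchema shown by explain() should shrink to struct<name:struct<first:string>>; with it disabled, the scan keeps the full struct with both first and last, which is a quick way to check whether the rule fired.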