[GitHub] spark pull request #21889: [SPARK-4502][SQL] Parquet nested column pruning -...

ajacques Mon, 13 Aug 2018 22:15:02 -0700

Github user ajacques commented on a diff in the pull request:

    https://github.com/apache/spark/pull/21889#discussion_r209830673
  
    --- Diff: 
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruning.scala
 ---
    @@ -0,0 +1,200 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements.  See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License.  You may obtain a copy of the License at
    + *
    + *    http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.spark.sql.execution.datasources.parquet
    +
    +import org.apache.spark.sql.catalyst.expressions.{And, Attribute, 
Expression, NamedExpression}
    +import org.apache.spark.sql.catalyst.planning.PhysicalOperation
    +import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, 
Project}
    +import org.apache.spark.sql.catalyst.rules.Rule
    +import org.apache.spark.sql.execution.{ProjectionOverSchema, SelectedField}
    +import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, 
LogicalRelation}
    +import org.apache.spark.sql.internal.SQLConf
    +import org.apache.spark.sql.types.{ArrayType, DataType, MapType, 
StructField, StructType}
    +
    +/**
    + * Prunes unnecessary Parquet columns given a [[PhysicalOperation]] over a
    + * [[ParquetRelation]]. By "Parquet column", we mean a column as defined 
in the
    + * Parquet format. In Spark SQL, a root-level Parquet column corresponds 
to a
    + * SQL column, and a nested Parquet column corresponds to a 
[[StructField]].
    + */
    +private[sql] object ParquetSchemaPruning extends Rule[LogicalPlan] {
    +  override def apply(plan: LogicalPlan): LogicalPlan =
    +    if (SQLConf.get.nestedSchemaPruningEnabled) {
    +      apply0(plan)
    +    } else {
    +      plan
    +    }
    +
    +  private def apply0(plan: LogicalPlan): LogicalPlan =
    +    plan transformDown {
    +      case op @ PhysicalOperation(projects, filters,
    +          l @ LogicalRelation(hadoopFsRelation @ HadoopFsRelation(_, _,
    +            dataSchema, _, _: ParquetFileFormat, _), _, _, _)) =>
    +        val projectionRootFields = projects.flatMap(getRootFields)
    +        val filterRootFields = filters.flatMap(getRootFields)
    +        val requestedRootFields = (projectionRootFields ++ 
filterRootFields).distinct
    +
    +        // If requestedRootFields includes a nested field, continue. 
Otherwise,
    +        // return op
    +        if (requestedRootFields.exists { case RootField(_, derivedFromAtt) 
=> !derivedFromAtt }) {
    +          val prunedDataSchema = buildPrunedDataSchema(dataSchema, 
requestedRootFields)
    +
    +          // If the data schema is different from the pruned data schema, 
continue. Otherwise,
    +          // return op. We effect this comparison by counting the number 
of "leaf" fields in
    +          // each schemata, assuming the fields in [[prunedDataSchema]] 
are a subset of the fields
    +          // in dataSchema.
    +          if (countLeaves(dataSchema) > countLeaves(prunedDataSchema)) {
    +            val prunedParquetRelation =
    +              hadoopFsRelation.copy(dataSchema = 
prunedDataSchema)(hadoopFsRelation.sparkSession)
    +
    +            val prunedRelation = buildPrunedRelation(l, 
prunedParquetRelation)
    +            val projectionOverSchema = 
ProjectionOverSchema(prunedDataSchema)
    +
    +            // Construct a new target for our projection by rewriting and
    +            // including the original filters where available
    +            val projectionChild =
    +              if (filters.nonEmpty) {
    +                val projectedFilters = filters.map(_.transformDown {
    +                  case projectionOverSchema(expr) => expr
    +                })
    +                val newFilterCondition = projectedFilters.reduce(And)
    +                Filter(newFilterCondition, prunedRelation)
    +              } else {
    +                prunedRelation
    +              }
    +
    +            // Construct the new projections of our Project by
    +            // rewriting the original projections
    +            val newProjects = projects.map(_.transformDown {
    +              case projectionOverSchema(expr) => expr
    +            }).map { case expr: NamedExpression => expr }
    +
    +            if (log.isDebugEnabled) {
    +              logDebug(s"New 
projects:\n${newProjects.map(_.treeString).mkString("\n")}")
    +              logDebug(s"Pruned data 
schema:\n${prunedDataSchema.treeString}")
    +            }
    +
    +            Project(newProjects, projectionChild)
    +          } else {
    +            op
    +          }
    +        } else {
    +          op
    +        }
    +    }
    +
    +  private def buildPrunedDataSchema(fileDataSchema: StructType,
    +                                    requestedRootFields: Seq[RootField]) = 
{
    --- End diff --
    
    Will fix all this. Seems like this is an opportunity to catch some of this 
with static analysis to avoid wasting the reviewer's time. Is Scalastyle not 
configured to catch this stuff?



---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

[GitHub] spark pull request #21889: [SPARK-4502][SQL] Parquet nested column pruning -...

Reply via email to