dongjoon-hyun commented on a change in pull request #27728: [SPARK-25556][SPARK-17636][SPARK-31026][SQL][test-hive1.2] Nested Column Predicate Pushdown for Parquet URL: https://github.com/apache/spark/pull/27728#discussion_r387298806
########## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala ########## @@ -437,61 +437,74 @@ object DataSourceStrategy { } } + /** + * Find the column name of an expression that can be pushed down. + */ + private[sql] def pushDownColName(e: Expression): Option[String] = { + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.MultipartIdentifierHelper + def helper(e: Expression): Option[Seq[String]] = e match { + case a: Attribute => Some(Seq(a.name)) + case s: GetStructField => helper(s.child).map(_ :+ s.childSchema(s.ordinal).name) + case _ => None + } + helper(e).map(_.quoted) + } + private def translateLeafNodeFilter(predicate: Expression): Option[Filter] = predicate match { - case expressions.EqualTo(a: Attribute, Literal(v, t)) => - Some(sources.EqualTo(a.name, convertToScala(v, t))) - case expressions.EqualTo(Literal(v, t), a: Attribute) => - Some(sources.EqualTo(a.name, convertToScala(v, t))) - - case expressions.EqualNullSafe(a: Attribute, Literal(v, t)) => - Some(sources.EqualNullSafe(a.name, convertToScala(v, t))) - case expressions.EqualNullSafe(Literal(v, t), a: Attribute) => - Some(sources.EqualNullSafe(a.name, convertToScala(v, t))) - - case expressions.GreaterThan(a: Attribute, Literal(v, t)) => - Some(sources.GreaterThan(a.name, convertToScala(v, t))) - case expressions.GreaterThan(Literal(v, t), a: Attribute) => - Some(sources.LessThan(a.name, convertToScala(v, t))) - - case expressions.LessThan(a: Attribute, Literal(v, t)) => - Some(sources.LessThan(a.name, convertToScala(v, t))) - case expressions.LessThan(Literal(v, t), a: Attribute) => - Some(sources.GreaterThan(a.name, convertToScala(v, t))) - - case expressions.GreaterThanOrEqual(a: Attribute, Literal(v, t)) => - Some(sources.GreaterThanOrEqual(a.name, convertToScala(v, t))) - case expressions.GreaterThanOrEqual(Literal(v, t), a: Attribute) => - Some(sources.LessThanOrEqual(a.name, convertToScala(v, t))) - - case 
expressions.LessThanOrEqual(a: Attribute, Literal(v, t)) => - Some(sources.LessThanOrEqual(a.name, convertToScala(v, t))) - case expressions.LessThanOrEqual(Literal(v, t), a: Attribute) => - Some(sources.GreaterThanOrEqual(a.name, convertToScala(v, t))) - - case expressions.InSet(a: Attribute, set) => - val toScala = CatalystTypeConverters.createToScalaConverter(a.dataType) - Some(sources.In(a.name, set.toArray.map(toScala))) + case expressions.EqualTo(e: Expression, Literal(v, t)) => + pushDownColName(e).map(sources.EqualTo(_, convertToScala(v, t))) + case expressions.EqualTo(Literal(v, t), e: Expression) => + pushDownColName(e).map(sources.EqualTo(_, convertToScala(v, t))) + + case expressions.EqualNullSafe(e: Expression, Literal(v, t)) => + pushDownColName(e).map(sources.EqualNullSafe(_, convertToScala(v, t))) + case expressions.EqualNullSafe(Literal(v, t), e: Expression) => + pushDownColName(e).map(sources.EqualNullSafe(_, convertToScala(v, t))) + + case expressions.GreaterThan(e: Expression, Literal(v, t)) => + pushDownColName(e).map(sources.GreaterThan(_, convertToScala(v, t))) + case expressions.GreaterThan(Literal(v, t), e: Expression) => + pushDownColName(e).map(sources.LessThan(_, convertToScala(v, t))) + + case expressions.LessThan(e: Expression, Literal(v, t)) => + pushDownColName(e).map(sources.LessThan(_, convertToScala(v, t))) + case expressions.LessThan(Literal(v, t), e: Expression) => + pushDownColName(e).map(sources.GreaterThan(_, convertToScala(v, t))) + + case expressions.GreaterThanOrEqual(e: Expression, Literal(v, t)) => + pushDownColName(e).map(sources.GreaterThanOrEqual(_, convertToScala(v, t))) + case expressions.GreaterThanOrEqual(Literal(v, t), e: Expression) => + pushDownColName(e).map(sources.LessThanOrEqual(_, convertToScala(v, t))) + + case expressions.LessThanOrEqual(e: Expression, Literal(v, t)) => + pushDownColName(e).map(sources.LessThanOrEqual(_, convertToScala(v, t))) + case expressions.LessThanOrEqual(Literal(v, t), e: 
Expression) => + pushDownColName(e).map(sources.GreaterThanOrEqual(_, convertToScala(v, t))) + + case expressions.InSet(e: Expression, set) => + val toScala = CatalystTypeConverters.createToScalaConverter(e.dataType) + pushDownColName(e).map(sources.In(_, set.toArray.map(toScala))) Review comment: If you don't mind, can we rewrite this like the following to prevent a potential minor regression? The new code above executes `CatalystTypeConverters.createToScalaConverter` for all expressions, whereas the previous code did so only for `Attribute`. ```scala pushDownColName(e).map { val toScala = CatalystTypeConverters.createToScalaConverter(e.dataType) sources.In(_, set.toArray.map(toScala)) } ``` ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org